├── megalodon_extras
├── __init__.py
├── phase_variants_extract_haplotype_reads.py
├── modified_bases_index_database.py
├── variants_index_database.py
├── modified_bases_describe_alphabet.py
├── calibrate_merge_modified_bases_stats.py
├── modified_bases_estimate_threshold.py
├── aggregate_run.py
├── validate_mod_bases_from_calibration.py
├── modified_bases_create_motif_bed.py
├── per_read_text_variants.py
├── per_read_text_modified_bases.py
├── merge_variants.py
├── calibrate_merge_modified_bases.py
├── phase_variants_whatshap_filter.py
├── modified_bases_create_ground_truth.py
├── variants_atomize.py
├── modified_bases_update_database.py
├── merge_aggregated_modified_bases.py
├── calibrate_generate_modified_base_stats.py
├── calibrate_modified_bases.py
├── variants_heterozygous_factor.py
├── modified_bases_split_by_motif.py
└── __main__.py
├── megalodon
├── __init__.py
├── model_data
│   ├── dna_r9.4.1_450bps_modbases_5mc_hac.cfg
│   ├── dna_r10.3_450bps_modbases_5mc_hac_prom.cfg
│   ├── dna_r9.4.1_450bps_modbases_5mc_hac_prom.cfg
│   ├── dna_r10.3_450bps_fast.cfg
│   │   ├── megalodon_variant_calibration.npz
│   │   └── megalodon_variant_calibration.pdf
│   ├── dna_r10.3_450bps_hac.cfg
│   │   ├── megalodon_variant_calibration.npz
│   │   └── megalodon_variant_calibration.pdf
│   ├── dna_r9.4.1_450bps_hac.cfg
│   │   ├── megalodon_variant_calibration.npz
│   │   └── megalodon_variant_calibration.pdf
│   ├── dna_r9.4.1_450bps_fast.cfg
│   │   ├── megalodon_variant_calibration.npz
│   │   └── megalodon_variant_calibration.pdf
│   ├── dna_r10.3_450bps_fast_prom.cfg
│   │   ├── megalodon_variant_calibration.npz
│   │   └── megalodon_variant_calibration.pdf
│   ├── dna_r10.3_450bps_hac_prom.cfg
│   │   ├── megalodon_variant_calibration.npz
│   │   └── megalodon_variant_calibration.pdf
│   ├── dna_r9.4.1_450bps_hac_prom.cfg
│   │   ├── megalodon_variant_calibration.npz
│   │   └── megalodon_variant_calibration.pdf
│   ├── dna_r9.4.1_450bps_fast_prom.cfg
│   │   ├── megalodon_variant_calibration.npz
│   │   └── megalodon_variant_calibration.pdf
│   ├── res_dna_r103_prom_modbases_5mC_v001.cfg
│   │   ├── megalodon_mod_calibration.npz
│   │   ├── megalodon_mod_calibration.pdf
│   │   ├── megalodon_variant_calibration.npz
│   │   └── megalodon_variant_calibration.pdf
│   ├── res_dna_r941_min_modbases_5mC_v001.cfg
│   │   ├── megalodon_mod_calibration.npz
│   │   ├── megalodon_mod_calibration.pdf
│   │   ├── megalodon_variant_calibration.npz
│   │   └── megalodon_variant_calibration.pdf
│   ├── res_dna_r941_prom_modbases_5mC_v001.cfg
│   │   ├── megalodon_mod_calibration.npz
│   │   ├── megalodon_mod_calibration.pdf
│   │   ├── megalodon_variant_calibration.npz
│   │   └── megalodon_variant_calibration.pdf
│   ├── res_dna_r941_min_modbases_5mC_5hmC_v001.cfg
│   │   ├── megalodon_mod_calibration.npz
│   │   └── megalodon_mod_calibration.pdf
│   ├── res_dna_r941_min_modbases_5mC_CpG_v001.cfg
│   │   ├── megalodon_mod_calibration.npz
│   │   ├── megalodon_mod_calibration.pdf
│   │   ├── megalodon_variant_calibration.npz
│   │   └── megalodon_variant_calibration.pdf
│   ├── res_dna_r941_prom_modbases_5mC_CpG_v001.cfg
│   │   ├── megalodon_mod_calibration.npz
│   │   ├── megalodon_mod_calibration.pdf
│   │   ├── megalodon_variant_calibration.npz
│   │   └── megalodon_variant_calibration.pdf
│   ├── dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg
│   │   ├── megalodon_mod_calibration.npz
│   │   ├── megalodon_mod_calibration.pdf
│   │   ├── megalodon_variant_calibration.npz
│   │   └── megalodon_variant_calibration.pdf
│   ├── res_dna_r941_min_modbases-all-context_v001.cfg
│   │   ├── megalodon_mod_calibration.npz
│   │   ├── megalodon_mod_calibration.pdf
│   │   ├── megalodon_variant_calibration.npz
│   │   └── megalodon_variant_calibration.pdf
│   ├── res_dna_r941_min_modbases_5mC_5hmC_CpG_v001.cfg
│   │   ├── megalodon_mod_calibration.npz
│   │   ├── megalodon_mod_calibration.pdf
│   │   ├── megalodon_variant_calibration.npz
│   │   └── megalodon_variant_calibration.pdf
│   └── dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac_prom.cfg
│       ├── megalodon_mod_calibration.npz
│       ├── megalodon_mod_calibration.pdf
│       ├── megalodon_variant_calibration.npz
│       └── megalodon_variant_calibration.pdf
├── constrained_decoding.py
├── logging.py
├── megalodon_multiprocessing.py
├── banding.py
├── signal_mapping.py
└── validation.py
├── MANIFEST.in
├── ONT_logo_deprecate.png
├── docs
├── _images
│   ├── mod_mapping_viz.png
│   ├── mod_agg_comp_cov.png
│   ├── mod_agg_comp_log.png
│   ├── mod_agg_comp_linear.png
│   ├── mod_agg_dist_results.png
│   ├── mod_pr_validate_results.png
│   ├── mapping_validate_results.png
│   ├── mod_dist_validate_results.png
│   ├── mod_roc_validate_results.png
│   ├── modified_base_calibration.png
│   ├── sequence_variant_calibration.png
│   └── whatshap_haplotagged_variant_viz.png
├── extras_per_read_text.rst
├── extras_aggregate.rst
├── extras_merge.rst
├── extras_phase_variants.rst
├── computing_considerations.rst
├── variant_phasing.rst
├── model_training.rst
├── extras_variants.rst
├── common_arguments.rst
├── index.rst
├── extras_modified_bases.rst
├── extras_validate.rst
├── algorithm_details.rst
├── extras_calibrate.rst
├── file_formats.rst
└── modbase_training.rst
├── pyproject.toml
├── .gitlab-ci.yml
├── setup.py
├── .travis.yml
├── setup.cfg
└── test
    └── test_api.py

/megalodon_extras/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/megalodon/__init__.py:
--------------------------------------------------------------------------------
__version__ = "2.5.0"
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_modbases_5mc_hac.cfg:
--------------------------------------------------------------------------------
res_dna_r941_min_modbases_5mC_v001.cfg/
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include README.rst
include LICENCE.txt
include megalodon/model_data/*/*.npz
--------------------------------------------------------------------------------
/ONT_logo_deprecate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/ONT_logo_deprecate.png
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r10.3_450bps_modbases_5mc_hac_prom.cfg:
--------------------------------------------------------------------------------
res_dna_r103_prom_modbases_5mC_v001.cfg
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_modbases_5mc_hac_prom.cfg:
--------------------------------------------------------------------------------
res_dna_r941_prom_modbases_5mC_v001.cfg
--------------------------------------------------------------------------------
/docs/_images/mod_mapping_viz.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/mod_mapping_viz.png
--------------------------------------------------------------------------------
/docs/_images/mod_agg_comp_cov.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/mod_agg_comp_cov.png
--------------------------------------------------------------------------------
/docs/_images/mod_agg_comp_log.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/mod_agg_comp_log.png
--------------------------------------------------------------------------------
/docs/_images/mod_agg_comp_linear.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/mod_agg_comp_linear.png
--------------------------------------------------------------------------------
/docs/_images/mod_agg_dist_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/mod_agg_dist_results.png
--------------------------------------------------------------------------------
/docs/_images/mod_pr_validate_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/mod_pr_validate_results.png
--------------------------------------------------------------------------------
/docs/_images/mapping_validate_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/mapping_validate_results.png
--------------------------------------------------------------------------------
/docs/_images/mod_dist_validate_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/mod_dist_validate_results.png
--------------------------------------------------------------------------------
/docs/_images/mod_roc_validate_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/mod_roc_validate_results.png
--------------------------------------------------------------------------------
/docs/_images/modified_base_calibration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/modified_base_calibration.png
--------------------------------------------------------------------------------
/docs/_images/sequence_variant_calibration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/sequence_variant_calibration.png
--------------------------------------------------------------------------------
/docs/_images/whatshap_haplotagged_variant_viz.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/whatshap_haplotagged_variant_viz.png
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r10.3_450bps_fast.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r10.3_450bps_fast.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r10.3_450bps_fast.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r10.3_450bps_fast.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r10.3_450bps_hac.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r10.3_450bps_hac.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r10.3_450bps_hac.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r10.3_450bps_hac.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_hac.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_hac.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_hac.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_hac.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_fast.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_fast.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_fast.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_fast.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r10.3_450bps_fast_prom.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r10.3_450bps_fast_prom.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r10.3_450bps_fast_prom.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r10.3_450bps_fast_prom.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r10.3_450bps_hac_prom.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r10.3_450bps_hac_prom.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r10.3_450bps_hac_prom.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r10.3_450bps_hac_prom.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_hac_prom.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_hac_prom.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_hac_prom.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_hac_prom.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_fast_prom.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_fast_prom.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_fast_prom.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_fast_prom.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r103_prom_modbases_5mC_v001.cfg/megalodon_mod_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r103_prom_modbases_5mC_v001.cfg/megalodon_mod_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r103_prom_modbases_5mC_v001.cfg/megalodon_mod_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r103_prom_modbases_5mC_v001.cfg/megalodon_mod_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_v001.cfg/megalodon_mod_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_v001.cfg/megalodon_mod_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_v001.cfg/megalodon_mod_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_v001.cfg/megalodon_mod_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_prom_modbases_5mC_v001.cfg/megalodon_mod_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_prom_modbases_5mC_v001.cfg/megalodon_mod_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_prom_modbases_5mC_v001.cfg/megalodon_mod_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_prom_modbases_5mC_v001.cfg/megalodon_mod_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r103_prom_modbases_5mC_v001.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r103_prom_modbases_5mC_v001.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r103_prom_modbases_5mC_v001.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r103_prom_modbases_5mC_v001.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_v001.cfg/megalodon_mod_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_v001.cfg/megalodon_mod_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_v001.cfg/megalodon_mod_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_v001.cfg/megalodon_mod_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_CpG_v001.cfg/megalodon_mod_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_CpG_v001.cfg/megalodon_mod_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_CpG_v001.cfg/megalodon_mod_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_CpG_v001.cfg/megalodon_mod_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_v001.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_v001.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_v001.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_v001.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_prom_modbases_5mC_CpG_v001.cfg/megalodon_mod_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_prom_modbases_5mC_CpG_v001.cfg/megalodon_mod_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_prom_modbases_5mC_CpG_v001.cfg/megalodon_mod_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_prom_modbases_5mC_CpG_v001.cfg/megalodon_mod_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_prom_modbases_5mC_v001.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_prom_modbases_5mC_v001.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_prom_modbases_5mC_v001.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_prom_modbases_5mC_v001.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg/megalodon_mod_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg/megalodon_mod_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg/megalodon_mod_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg/megalodon_mod_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases-all-context_v001.cfg/megalodon_mod_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases-all-context_v001.cfg/megalodon_mod_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases-all-context_v001.cfg/megalodon_mod_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases-all-context_v001.cfg/megalodon_mod_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_CpG_v001.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_CpG_v001.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_CpG_v001.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_CpG_v001.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_CpG_v001.cfg/megalodon_mod_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_CpG_v001.cfg/megalodon_mod_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_CpG_v001.cfg/megalodon_mod_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_CpG_v001.cfg/megalodon_mod_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_prom_modbases_5mC_CpG_v001.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_prom_modbases_5mC_CpG_v001.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_prom_modbases_5mC_CpG_v001.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_prom_modbases_5mC_CpG_v001.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac_prom.cfg/megalodon_mod_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac_prom.cfg/megalodon_mod_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac_prom.cfg/megalodon_mod_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac_prom.cfg/megalodon_mod_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases-all-context_v001.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases-all-context_v001.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases-all-context_v001.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases-all-context_v001.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_CpG_v001.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_CpG_v001.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_CpG_v001.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_CpG_v001.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac_prom.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac_prom.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac_prom.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac_prom.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[tool.black]
line-length = 80
target-version = ['py36']
include = '\.pyi?$'
exclude = '''

(
  /(
      \.eggs         # exclude a few common directories in the
    | \.git          # root of the project
    | \.hg
    | \.mypy_cache
    | \.tox
    | \.venv
    | _build
    | buck-out
    | build
    | dist
    | venv
  )/
)
'''
--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
image: python:3.6

stages:
  - format

before_script:
  - python -V  # Print out python version for debugging
  - pip install virtualenv
  - virtualenv venv
  - source venv/bin/activate
  - pip install --upgrade pip
  - pip install .[testing]

black:
  stage: format
  script:
    - source venv/bin/activate
    - black --check .

docs:
  stage: format
  script:
    - source venv/bin/activate
    - sphinx-build -b html docs builddir && tar -zcf docs_build.tgz builddir
  artifacts:
    paths:
      - docs_build.tgz
    expire_in: 1 week
  only:
    - master
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import sys
from setuptools import setup, Extension


if __name__ == "__main__":
    # Note that using setup_requires cython allows users to install megalodon
    # without first installing cython (as required when using cythonize)
    extra_compile_args = ["-std=c99"]
    if sys.platform == "darwin":
        extra_compile_args.append("-mmacosx-version-min=10.9")
        print("Using macOS clang args")
    ext_modules = [
        Extension(
            "megalodon.decode",
            sources=["megalodon/decode.pyx"],
            extra_compile_args=extra_compile_args,
            language="c",
        ),
    ]
    setup(
        use_pyscaffold=True,
        setup_requires=["setuptools>=38.3", "cython"],
        ext_modules=ext_modules,
    )
--------------------------------------------------------------------------------
/megalodon_extras/phase_variants_extract_haplotype_reads.py:
--------------------------------------------------------------------------------
import pysam

from ._extras_parsers import get_parser_phase_variants_extract_haplotype_reads


def _main(args):
    out_fps = {}
    for rec in pysam.AlignmentFile(args.alignment_filename):
        try:
            hp = dict(rec.tags)["HP"]
        except KeyError:
            # skip un-tagged reads
            continue
        if hp not in out_fps:
            out_fps[hp] = open(
                "{}.haplotype_{}_read_ids.txt".format(args.out_basename, hp),
                "w",
            )
        out_fps[hp].write(rec.qname + "\n")

    for fp in out_fps.values():
        fp.close()

    return


if __name__ == "__main__":
    _main(get_parser_phase_variants_extract_haplotype_reads().parse_args())
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
python:
  - "3.6"
dist: trusty
sudo: required

addons:
  apt:
    sources:
      - ubuntu-toolchain-r-test
    packages:
      - libhdf5-dev


install:
  - echo $TRAVIS_PYTHON_VERSION
  - pip install --upgrade pip setuptools wheel
  - pip install --only-binary=numpy,scipy,cython numpy scipy cython
  - pip install .
  - pip install sphinx sphinx_rtd_theme sphinx-argparse


script:
  - echo "No testing implemented"


before_deploy:
  - cd docs
  - sphinx-build -b html -d _build/doctrees . _build/html
  - cd ../
  - touch docs/_build/html/.nojekyll

deploy:
  provider: pages
  skip_cleanup: true
  github_token: $GHPAGES_TOKEN
  local_dir: docs/_build/html
  target_branch: gh-pages
  on:
    branch: master
    python: "3.6"
--------------------------------------------------------------------------------
/docs/extras_per_read_text.rst:
--------------------------------------------------------------------------------
**********************************
``megalodon_extras per_read_text``
**********************************

The ``megalodon_extras per_read_text`` command group contains commands to convert per-read modified base database statistics to text files.
These files will be TSV files with headers describing the fields contained within the file.

Note that these scripts are single-threaded and can be quite slow for reasonably sized runs.

-------------------------------------------------
``megalodon_extras per_read_text modified_bases``
-------------------------------------------------

Extract text format per-read modified base scores from a Megalodon per-read modified base database.

-------------------------------------------
``megalodon_extras per_read_text variants``
-------------------------------------------

Extract text format per-read sequence variant scores from a Megalodon per-read sequence variant database.
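
For orientation, a minimal sketch of an invocation follows; the positional results directory and ``--out-filename`` flag are assumptions inferred from the ``args.megalodon_results_dir`` and ``args.out_filename`` attributes used in the ``per_read_text_*`` scripts later in this listing, not a verbatim copy of the argument parser::

    megalodon_extras per_read_text modified_bases \
        megalodon_results/ --out-filename per_read_modified_bases.txt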
--------------------------------------------------------------------------------
/megalodon_extras/modified_bases_index_database.py:
--------------------------------------------------------------------------------
import sys
from megalodon import logging, mods, megalodon_helper as mh
from ._extras_parsers import get_parser_modified_bases_index_database


LOGGER = logging.get_logger()


def _main(args):
    logging.init_logger(args.megalodon_directory, out_suffix=args.output_suffix)
    LOGGER.debug('Command: """' + " ".join(sys.argv) + '"""')

    mods_db_fn = mh.get_megalodon_fn(args.megalodon_directory, mh.PR_MOD_NAME)
    mods_db = mods.ModsDb(mods_db_fn, read_only=False)
    try:
        mods_db.check_data_covering_index_exists()
        LOGGER.info("Modified bases database index already exists")
    except mh.MegaError:
        LOGGER.info("Creating modified bases database index")
        mods_db.create_data_covering_index()
    LOGGER.debug("Closing database")
    mods_db.close()


if __name__ == "__main__":
    _main(get_parser_modified_bases_index_database().parse_args())
--------------------------------------------------------------------------------
/megalodon_extras/variants_index_database.py:
--------------------------------------------------------------------------------
import sys
from megalodon import logging, variants, megalodon_helper as mh
from ._extras_parsers import get_parser_variants_index_database


LOGGER = logging.get_logger()


def _main(args):
    raise NotImplementedError(
        "Variant index creation not currently implemented."
    )

    # NOTE: the code below is currently unreachable; it is retained as the
    # intended implementation for when variant index creation is re-enabled.
    logging.init_logger(args.megalodon_directory, out_suffix=args.output_suffix)
    LOGGER.debug('Command: """' + " ".join(sys.argv) + '"""')

    vars_db_fn = mh.get_megalodon_fn(args.megalodon_directory, mh.PR_VAR_NAME)
    vars_db = variants.VarsDb(vars_db_fn, read_only=False)
    try:
        vars_db.check_data_covering_index_exists()
        LOGGER.info("Variants database index already exists")
    except mh.MegaError:
        LOGGER.info("Creating variants database index")
        vars_db.create_data_covering_index()
    LOGGER.debug("Closing database")
    vars_db.close()


if __name__ == "__main__":
    _main(get_parser_variants_index_database().parse_args())
--------------------------------------------------------------------------------
/docs/extras_aggregate.rst:
--------------------------------------------------------------------------------
******************************
``megalodon_extras aggregate``
******************************

The ``megalodon_extras aggregate`` command group contains a single command, ``run``, to perform aggregation of per-read sequence variant or modified base results.

----------------------------------
``megalodon_extras aggregate run``
----------------------------------

Aggregate per-read sequence variant and/or modified base results from the main ``megalodon`` command.

This command can be useful for running Megalodon pipelines efficiently.
It allows the ``megalodon`` command to be run on one set of computing resources and ``megalodon_extras aggregate run`` to be completed afterwards on a separate set of computing resources.
The ``megalodon`` command, running the basecalling backend, generally requires GPU resources, while the aggregation step generally requires fast disk (SSDs) and a lot of CPU cores.
This command allows one to perform these steps separately and on appropriate compute resources.

Additionally, this command allows adjustment of aggregation parameters without repeating the computationally expensive basecalling step.
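
A minimal sketch of the deferred aggregation stage follows; the flag spellings are assumptions inferred from the argument names used in ``megalodon_extras/aggregate_run.py`` (shown later in this listing), not a verbatim copy of the argument parser::

    megalodon_extras aggregate run \
        --megalodon-directory megalodon_results/ \
        --outputs variants mods --processes 32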
--------------------------------------------------------------------------------
/megalodon_extras/modified_bases_describe_alphabet.py:
--------------------------------------------------------------------------------
from megalodon import backends, logging, megalodon_helper as mh
from ._extras_parsers import get_parser_modified_bases_describe_alphabet


LOGGER = logging.get_logger()


def _main(args):
    try:
        mh.mkdir(args.guppy_logs_output_directory, False)
    except mh.MegaError:
        LOGGER.warning(
            "Guppy logs output directory exists. Potentially overwriting "
            + "guppy logs."
        )
    logging.init_logger(args.log_directory)
    # set args that are not relevant to alphabet
    args.devices = None

    # set guppy args
    args.guppy_server_port = None
    args.guppy_timeout = mh.DEFAULT_GUPPY_TIMEOUT
    args.guppy_concurrent_reads = mh.DEFAULT_GUPPY_CONCURRENT_READS
    args.output_directory = args.guppy_logs_output_directory

    # set taiyaki args
    args.chunk_size = 1000
    args.chunk_overlap = 100
    args.max_concurrent_chunks = 200
    backend_params = backends.parse_backend_params(args)
    with backends.ModelInfo(backend_params, 1) as model_info:
        LOGGER.info(model_info.get_alphabet_str())


if __name__ == "__main__":
    _main(get_parser_modified_bases_describe_alphabet().parse_args())
--------------------------------------------------------------------------------
/megalodon_extras/calibrate_merge_modified_bases_stats.py:
--------------------------------------------------------------------------------
from collections import defaultdict

import numpy as np

from megalodon import logging, mods
from ._extras_parsers import get_parser_calibrate_merge_modified_bases_stats


LOGGER = logging.get_logger()


def _main(args):
    logging.init_logger()

    fn_mod_base_llrs = defaultdict(lambda: ([], []))
    for llr_fn in args.modified_base_calibration_stats_files:
        llrs_data = np.load(llr_fn)
        for mod_base in llrs_data[mods.GT_ALL_MOD_BASE_STR]:
            fn_mod_base_llrs[mod_base][0].append(
                llrs_data[mods.GT_MOD_LLR_STR.format(mod_base)]
            )
            fn_mod_base_llrs[mod_base][1].append(
                llrs_data[mods.GT_CAN_LLR_STR.format(mod_base)]
            )

    mod_base_stats = {mods.GT_ALL_MOD_BASE_STR: list(fn_mod_base_llrs)}
    for mod_base, (mod_llrs, can_llrs) in fn_mod_base_llrs.items():
        mod_base_stats[mods.GT_MOD_LLR_STR.format(mod_base)] = np.concatenate(
            mod_llrs
        )
        mod_base_stats[mods.GT_CAN_LLR_STR.format(mod_base)] = np.concatenate(
            can_llrs
        )
    np.savez(args.out_filename, **mod_base_stats)


if __name__ == "__main__":
    _main(get_parser_calibrate_merge_modified_bases_stats().parse_args())
--------------------------------------------------------------------------------
/docs/extras_merge.rst:
--------------------------------------------------------------------------------
**************************
``megalodon_extras merge``
**************************

The ``megalodon_extras merge`` command group contains commands to merge multiple per-read modified base or sequence variant databases.

These commands can assist in deploying Megalodon across an array of compute resources.

-----------------------------------------
``megalodon_extras merge modified_bases``
-----------------------------------------

Merge multiple per-read modified base databases together.

This command contains multi-process capabilities, but may encounter disk I/O bottlenecks.
Note that the full set of modified base positions must be stored in memory to allow this command to process at high performance.
Thus the number of processes should be set based on the amount of RAM available rather than the number of CPU cores available.
It is recommended that the output location be on a fast disk (i.e. local SSD and not NFS or mounted drives).
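
A minimal sketch of a merge invocation follows; the flag spellings are assumptions modeled on the argument names visible in ``megalodon_extras/merge_variants.py`` at the end of this listing (e.g. ``args.output_megalodon_results_dir`` and ``args.overwrite``), not a verbatim copy of the argument parser::

    megalodon_extras merge modified_bases \
        run1_results/ run2_results/ \
        --output-megalodon-results-dir merged_results/ --overwrite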

----------------------------------------------------
``megalodon_extras merge aggregated_modified_bases``
----------------------------------------------------

Merge multiple aggregated modified base bedmethyl files together.

This command can be useful in processing pipelines and when preparing modified base training ground truth files.
The ``--sorted-inputs`` option is provided to allow processing of very large files that cannot be stored in memory.
Note that bedmethyl files output by Megalodon are not sorted by default.
Sorting can be completed by running the unix command ``sort -k1V -k2n megalodon_results/modified_bases.5mC.bed > megalodon_results/modified_bases.5mC.sorted.bed``.

-----------------------------------
``megalodon_extras merge variants``
-----------------------------------

Merge multiple per-read sequence variant databases together.
--------------------------------------------------------------------------------
/megalodon_extras/modified_bases_estimate_threshold.py:
--------------------------------------------------------------------------------
import numpy as np
from tqdm import tqdm

from megalodon import logging, mods, megalodon_helper as mh
from ._extras_parsers import get_parser_modified_bases_estimate_threshold


LOGGER = logging.get_logger()


def _main(args):
    logging.init_logger()

    LOGGER.info("Loading database position statistics")
    mods_db = mods.ModsDb(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME)
    )
    db_mods = set(mod_base for mod_base, _ in mods_db.get_mod_long_names())
    if args.mod_base not in db_mods:
        raise mh.MegaError("Target modified base not found in mods database.")

    scores = []
    bar = tqdm(total=args.num_statistics, smoothing=0)
    for (chrm, strand, pos), mod_llrs in mods_db.iter_pos_scores(
        convert_pos=True, compute_llrs=True
    ):
        for mod_base, reads_llrs in mod_llrs.items():
            if mod_base != args.mod_base:
                continue
            bar.update(len(reads_llrs))
            scores.extend(reads_llrs)
        if args.num_statistics is not None and bar.n >= args.num_statistics:
            break

    LOGGER.info("Estimating fraction of modified bases")
    scores = np.array(scores)
    frac_mod = args.fraction_modified
    if frac_mod is None:
        thresh_vals = np.percentile(
            scores, (args.mod_percentile, 100 - args.mod_percentile)
        )
        thresh_val = np.abs(thresh_vals).min()
        n_can = np.greater_equal(scores, thresh_val).sum()
        n_mod = np.less_equal(scores, -thresh_val).sum()
        frac_mod = n_mod / (n_mod + n_can)
        print("Fraction mod: {}".format(frac_mod))
    llr_thresh = np.percentile(scores, frac_mod * 100)
    print("Threshold: {}".format(llr_thresh))


if __name__ == "__main__":
    _main(get_parser_modified_bases_estimate_threshold().parse_args())
--------------------------------------------------------------------------------
/megalodon_extras/aggregate_run.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import os
import sys

from megalodon import aggregate, logging, mods, variants, megalodon_helper as mh
from ._extras_parsers import get_parser_aggregate_run


# set blas library environment variables (without these the cblas calls
# can completely halt processing)
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"

LOGGER = logging.get_logger()


def _main(args):
    log_suffix = (
        "aggregation"
        if args.output_suffix is None
        else "aggregation." + args.output_suffix
    )
    logging.init_logger(args.megalodon_directory, out_suffix=log_suffix)
    LOGGER.debug('Command: """' + " ".join(sys.argv) + '"""')

    if args.mod_aggregate_method == mh.MOD_EM_NAME:
        mod_agg_info = mods.AGG_INFO(mh.MOD_EM_NAME, None)
    elif args.mod_aggregate_method == mh.MOD_BIN_THRESH_NAME:
        mod_agg_info = mods.AGG_INFO(
            mh.MOD_BIN_THRESH_NAME, args.mod_binary_threshold
        )
    valid_read_ids = mh.parse_read_ids(args.read_ids_filename)
    aggregate.aggregate_stats(
        args.outputs,
        args.megalodon_directory,
        args.processes,
        args.write_vcf_log_probs,
        args.heterozygous_factors,
        variants.HAPLIOD_MODE if args.haploid else variants.DIPLOID_MODE,
        mod_agg_info,
        args.write_mod_log_probs,
        args.mod_output_formats,
        args.suppress_progress,
        valid_read_ids,
        args.output_suffix,
        args.aggregate_batch_size,
    )

    if mh.VAR_NAME in args.outputs:
        LOGGER.info("Sorting output variant file")
        variant_fn = mh.add_fn_suffix(
            mh.get_megalodon_fn(args.megalodon_directory, mh.VAR_NAME),
            args.output_suffix,
        )
        sort_variant_fn = mh.add_fn_suffix(variant_fn, "sorted")
        variants.sort_variants(variant_fn, sort_variant_fn)
        LOGGER.info("Indexing output variant file")
        variants.index_variants(sort_variant_fn)


if __name__ == "__main__":
    _main(get_parser_aggregate_run().parse_args())
--------------------------------------------------------------------------------
/docs/extras_phase_variants.rst:
--------------------------------------------------------------------------------
***********************************
``megalodon_extras phase_variants``
***********************************

The ``megalodon_extras phase_variants`` command group contains commands to assist in the Megalodon pipeline to produce the highest quality phased variant calls.

---------------------------------------------------
``megalodon_extras phase_variants whatshap_filter``
---------------------------------------------------

`WhatsHap <https://whatshap.readthedocs.io>`_ (as of version ``0.18``) cannot process some complex variants.
Providing such variants causes WhatsHap to exit with an error.
This command is provided to remove these complex variants and allow processing to proceed without error.
Note that these variants are still considered outside of the WhatsHap phasing step of the Megalodon phasing pipeline.

-----------------------------------------------------------
``megalodon_extras phase_variants extract_haplotype_reads``
-----------------------------------------------------------

From alignment files produced by ``whatshap haplotag``, extract read ids for reads assigned to one of the two haplotypes.
One file will be produced for each haplotype value in the alignment file (two for standard diploid processing).
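
A short sketch of this step follows; the output file naming comes directly from ``megalodon_extras/phase_variants_extract_haplotype_reads.py`` above, while the positional argument order is an assumption based on that script's ``args.alignment_filename`` and ``args.out_basename`` attributes::

    # writes sample.haplotype_1_read_ids.txt and sample.haplotype_2_read_ids.txt
    # for a standard diploid whatshap haplotag BAM
    megalodon_extras phase_variants extract_haplotype_reads \
        haplotagged.bam sample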

----------------------------------------------------------
``megalodon_extras phase_variants merge_haploid_variants``
----------------------------------------------------------

Merge haploid calls from original Megalodon variants and separate haplotype sets of calls.

This command should only be used as recommended in the Megalodon variant phasing pipeline.
Use of this command outside of this context is not recommended as several processing steps depend upon the preceding variant processing steps.

This command iterates over the three sets of sorted variants.
If a variant is not found in the haplotype variant files, the call from the original Megalodon run is taken.
If a variant is called as homozygous in the original Megalodon calling, a heterozygous call cannot be output.
If a variant was originally called as heterozygous and the variant is called in both haplotype calls, then the output call is determined from the two haplotype calls.
--------------------------------------------------------------------------------
/megalodon_extras/validate_mod_bases_from_calibration.py:
--------------------------------------------------------------------------------
import sys
from collections import defaultdict

import numpy as np
import matplotlib

if True:
    # Agg appears to be the most robust backend when only saving plots.
    matplotlib.use("Agg")
    from matplotlib.backends.backend_pdf import PdfPages

from megalodon import logging, megalodon_helper as mh, mods, validation
from ._extras_parsers import get_parser_calibrate_modified_bases


LOGGER = logging.get_logger()


def extract_llrs(llr_fn):
    llrs_data = np.load(llr_fn)
    mod_bases = llrs_data[mods.GT_ALL_MOD_BASE_STR]
    mod_base_llrs = {}
    for mod_base in mod_bases:
        mod_base_llrs[mod_base] = (
            llrs_data[mods.GT_MOD_LLR_STR.format(mod_base)],
            llrs_data[mods.GT_CAN_LLR_STR.format(mod_base)],
        )

    return mod_base_llrs


def _main(args):
    logging.init_logger()

    LOGGER.info("Parsing log-likelihood ratios")
    mod_base_llrs = extract_llrs(args.ground_truth_llrs)

    out_fp = (
        sys.stdout
        if args.out_filename is None
        else open(args.out_filename, "w")
    )
    out_fp.write(validation.MOD_VAL_METRICS_HEADER)
    pdf_fp = None if args.out_pdf is None else PdfPages(args.out_pdf)
    all_pr_data, all_roc_data = defaultdict(list), defaultdict(list)
    all_kde_data = []
    for mod_base, (mod_llrs, can_llrs) in mod_base_llrs.items():
        LOGGER.info(f'Computing "{mod_base}" modified base validation.')
        try:
            pr_data, roc_data, kde_data = validation.compute_mod_sites_stats(
                mod_llrs,
                can_llrs,
                not args.allow_unbalance_classes,
                mod_base,
                "Megalodon Calibration Data",
                "Sample",
                out_fp,
            )
            all_pr_data[mod_base].append(pr_data)
            all_roc_data[mod_base].append(roc_data)
            all_kde_data.append(kde_data)
        except mh.MegaError as e:
            LOGGER.warning(str(e))
    validation.plot_pr(pdf_fp, all_pr_data)
    validation.plot_roc(pdf_fp, all_roc_data)
    validation.plot_kde(pdf_fp, all_kde_data)
    if pdf_fp is not None:
        pdf_fp.close()


if __name__ == "__main__":
    _main(get_parser_calibrate_modified_bases().parse_args())
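
# Hypothetical usage sketch: the subcommand spelling below is an assumption
# inferred from this module's file name following the naming pattern of the
# other megalodon_extras modules, and the flags mirror the
# args.ground_truth_llrs / args.out_pdf attributes used above.
#
#   megalodon_extras validate mod_bases_from_calibration \
#       mod_calibration_stats.npz --out-pdf validation_results.pdf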
--------------------------------------------------------------------------------
/megalodon_extras/modified_bases_create_motif_bed.py:
--------------------------------------------------------------------------------
from collections import namedtuple

import pysam
from tqdm import tqdm

from megalodon import logging, megalodon_helper as mh
from ._extras_parsers import get_parser_modified_bases_create_motif_bed


LOGGER = logging.get_logger()

MOTIF_INFO = namedtuple(
    "MOTIF_INFO",
    ("bases_before", "bases_after", "raw_motif", "motif", "rc_motif"),
)
BED_TMPLT = "{chrom}\t{pos}\t{end}\t.\t.\t{strand}\n"


def parse_motifs(raw_motifs):
    motifs = []
    for raw_motif, bases_before in raw_motifs:
        bases_before = int(bases_before)
        bases_after = len(raw_motif) - bases_before - 1
        motif = mh.compile_motif_pat(raw_motif)
        rc_motif = mh.compile_rev_comp_motif_pat(raw_motif)
        motifs.append(
            MOTIF_INFO(
                bases_before=bases_before,
                bases_after=bases_after,
                raw_motif=raw_motif,
                motif=motif,
                rc_motif=rc_motif,
            )
        )

    return motifs


def _main(args):
    logging.init_logger()

    # parse motifs
    motifs = parse_motifs(args.motif)
    # open indexed FASTA reference
    ref = pysam.FastaFile(args.reference)

    with open(args.out_filename, "w") as fp:
        # sort using RefName
        for chrm in tqdm(
            sorted([mh.RefName(chrm) for chrm in ref.references]),
            desc="Contigs",
            smoothing=0,
            dynamic_ncols=True,
        ):
            chrm_seq = ref.fetch(chrm)
            chrm_sites = []
            for motif in motifs:
                for m in motif.motif.finditer(chrm_seq):
                    pos = m.start() + motif.bases_before
                    chrm_sites.append((pos, "+"))
                for m in motif.rc_motif.finditer(chrm_seq):
                    pos = m.start() + motif.bases_after
                    chrm_sites.append((pos, "-"))
            fp.write(
                "".join(
                    BED_TMPLT.format(
                        chrom=chrm, pos=pos, end=pos + 1, strand=strand
                    )
                    for pos, strand in sorted(chrm_sites)
                )
            )


if __name__ == "__main__":
    _main(get_parser_modified_bases_create_motif_bed().parse_args())
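
# Hypothetical usage sketch: flag spellings are assumptions mirroring the
# args.motif (motif / bases-before pairs), args.reference and
# args.out_filename attributes used above.
#
#   megalodon_extras modified_bases create_motif_bed \
#       --motif CCWGG 1 --reference reference.fa --out-filename ccwgg_sites.bed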
| chrm, 49 | strand, 50 | pos, 51 | ref_lp, 52 | alt_lp, 53 | ref_seq, 54 | alt_seq, 55 | var_name, 56 | ) 57 | for alt_seq, (alt_lp, chrm) in r_var_stats.items() 58 | ) 59 | ) 60 | + "\n" 61 | ) 62 | vars_txt_fp.write(var_out_text) 63 | 64 | return 65 | 66 | 67 | if __name__ == "__main__": 68 | _main(get_parser_per_read_text_variants().parse_args()) 69 | -------------------------------------------------------------------------------- /megalodon_extras/per_read_text_modified_bases.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tqdm import tqdm 3 | 4 | from megalodon import mods, megalodon_helper as mh 5 | from ._extras_parsers import get_parser_per_read_text_modified_bases 6 | 7 | 8 | def _main(args): 9 | mods_db = mods.ModsDb( 10 | mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME), 11 | in_mem_dbid_to_uuid=True, 12 | ) 13 | mods_txt_fp = open( 14 | mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_TXT_NAME) 15 | if args.out_filename is None 16 | else args.out_filename, 17 | "w", 18 | ) 19 | mods_txt_fp.write("\t".join(mods_db.text_field_names) + "\n") 20 | rec_tmplt = "\t".join("{}" for _ in mods_db.text_field_names) + "\n" 21 | bar = tqdm( 22 | desc="Processing Per-read Data", 23 | unit="per-read calls", 24 | total=mods_db.get_num_uniq_stats(), 25 | smoothing=0, 26 | dynamic_ncols=True, 27 | ) 28 | for (chrm, strand, pos), pos_lps in mods_db.iter_pos_scores( 29 | convert_pos=True 30 | ): 31 | bar.update(len(pos_lps)) 32 | str_strand = mh.int_strand_to_str(strand) 33 | mod_out_text = "" 34 | prev_dbid = None 35 | mod_bs, r_lps = [], [] 36 | for read_dbid, mod_dbid, lp in sorted(pos_lps): 37 | if prev_dbid != read_dbid and prev_dbid is not None: 38 | uuid = mods_db.get_uuid(prev_dbid) 39 | # compute and store log likelihood ratios 40 | with np.errstate(divide="ignore"): 41 | can_lp = np.log1p(-np.exp(r_lps).sum()) 42 | for mod_b, r_lp in zip(mod_bs, r_lps): 43 | mod_out_text += rec_tmplt.format( 44 | uuid, chrm, str_strand, pos, r_lp, can_lp, mod_b 45 | ) 46 | mod_bs, r_lps = [], [] 47 | prev_dbid = read_dbid 48 | mod_bs.append(mods_db.get_mod_base(mod_dbid)) 49 | r_lps.append(lp) 50 | uuid = mods_db.get_uuid(prev_dbid) 51 | # compute and store log likelihood ratios 52 | with np.errstate(divide="ignore"): 53 | can_lp = np.log1p(-np.exp(r_lps).sum()) 54 | for mod_b, r_lp in zip(mod_bs, r_lps): 55 | mod_out_text += rec_tmplt.format( 56 | uuid, chrm, str_strand, pos, r_lp, can_lp, mod_b 57 | ) 58 | mods_txt_fp.write(mod_out_text) 59 | mods_txt_fp.close() 60 | 61 | 62 | if __name__ == "__main__": 63 | _main(get_parser_per_read_text_modified_bases().parse_args()) 64 | -------------------------------------------------------------------------------- /megalodon_extras/merge_variants.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | 3 | from megalodon import logging, megalodon_helper as mh, variants 4 | from ._extras_parsers import get_parser_merge_variants 5 | 6 | 7 | def _main(args): 8 | mh.mkdir(args.output_megalodon_results_dir, args.overwrite) 9 | logging.init_logger(args.output_megalodon_results_dir) 10 | logger = logging.get_logger() 11 | 12 | logger.info("Opening new sequence variant statistics database") 13 | out_vars_db = variants.VarsDb( 14 | mh.get_megalodon_fn(args.output_megalodon_results_dir, mh.PR_VAR_NAME), 15 | read_only=False, 16 | loc_index_in_memory=not args.var_locations_on_disk, 17 | uuid_index_in_memory=True, 18 | ) 19 | 
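    # copy the per-read variant statistics from each input run into the
    # merged database; index creation is deferred until all runs are inserted
    # to keep the bulk insert phase fast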
20 | for mega_dir in args.megalodon_results_dirs: 21 | logger.info( 22 | "Adding sequence variant statistics from {}".format(mega_dir) 23 | ) 24 | # full read only mode with no indices read into memory 25 | vars_db = variants.VarsDb( 26 | mh.get_megalodon_fn(mega_dir, mh.PR_VAR_NAME), 27 | read_only=True, 28 | chrm_index_in_memory=False, 29 | alt_index_in_memory=False, 30 | uuid_index_in_memory=False, 31 | ) 32 | bar = tqdm( 33 | desc=mega_dir, 34 | total=vars_db.get_num_uniq_stats(), 35 | smoothing=0, 36 | dynamic_ncols=True, 37 | ) 38 | for ( 39 | score, 40 | uuid, 41 | strand, 42 | alt_seq, 43 | ref_seq, 44 | pos, 45 | var_name, 46 | test_end, 47 | test_start, 48 | chrm, 49 | chrm_len, 50 | ) in vars_db.iter_data(): 51 | chrm_id = out_vars_db.get_chrm_id_or_insert(chrm, chrm_len) 52 | loc_id = out_vars_db.get_loc_id_or_insert( 53 | chrm_id, test_start, test_end, pos, ref_seq, var_name 54 | ) 55 | alt_id = out_vars_db.get_alt_id_or_insert(alt_seq) 56 | read_id = out_vars_db.get_read_id_or_insert(uuid) 57 | out_vars_db.insert_data(score, loc_id, alt_id, read_id) 58 | bar.update() 59 | bar.close() 60 | 61 | logger.info("Creating indices and closing database") 62 | if out_vars_db.chrm_idx_in_mem: 63 | out_vars_db.create_chrm_index() 64 | if out_vars_db.loc_idx_in_mem: 65 | out_vars_db.create_loc_index() 66 | if out_vars_db.alt_idx_in_mem: 67 | out_vars_db.create_alt_index() 68 | out_vars_db.create_data_covering_index() 69 | out_vars_db.close() 70 | 71 | 72 | if __name__ == "__main__": 73 | _main(get_parser_merge_variants().parse_args()) 74 | -------------------------------------------------------------------------------- /megalodon_extras/calibrate_merge_modified_bases.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from megalodon import calibration, logging, megalodon_helper as mh 4 | from ._extras_parsers import get_parser_calibrate_merge_modified_bases 5 | 6 | 7 | LOGGER = logging.get_logger() 8 | 9 | 10 | def _main(args): 11 | logging.init_logger() 12 | mh.prep_out_fn(args.out_filename, args.overwrite) 13 | 14 | LOGGER.info( 15 | "Processing {}".format(args.modified_base_calibration_files[-1]) 16 | ) 17 | calib_data = np.load(args.modified_base_calibration_files[-1]) 18 | stratify_type = str(calib_data[calibration.MOD_STRAT_TYPE_TXT]) 19 | num_calib_vals = int(calib_data[calibration.SMOOTH_NVALS_TXT]) 20 | mod_calibs = {} 21 | for mod_base in calib_data[calibration.MOD_BASES_TXT]: 22 | LOGGER.info("\tUsing {} calibration".format(mod_base)) 23 | mod_calibs[mod_base] = ( 24 | calib_data[mod_base + calibration.LLR_RANGE_SUFFIX].copy(), 25 | calib_data[mod_base + calibration.CALIB_TABLE_SUFFIX].copy(), 26 | ) 27 | for mod_calib_fn in args.modified_base_calibration_files[-2::-1]: 28 | LOGGER.info("Processing {}".format(mod_calib_fn)) 29 | calib_data = np.load(mod_calib_fn) 30 | assert stratify_type == str(calib_data[calibration.MOD_STRAT_TYPE_TXT]) 31 | assert num_calib_vals == int( 32 | calib_data[calibration.SMOOTH_NVALS_TXT] 33 | ) 34 | for mod_base in calib_data[calibration.MOD_BASES_TXT]: 35 | # overwrite calibration data with files passed earlier 36 | if mod_base in mod_calibs: 37 | LOGGER.info("\tOverwriting {} calibration".format(mod_base)) 38 | else: 39 | LOGGER.info("\tUsing {} calibration".format(mod_base)) 40 | mod_calibs[mod_base] = ( 41 | calib_data[mod_base + calibration.LLR_RANGE_SUFFIX].copy(), 42 | calib_data[mod_base + calibration.CALIB_TABLE_SUFFIX].copy(), 43 | ) 44 | 45 | save_kwargs = {} 46 | for
mod_base, (mod_llr_range, mod_calib) in mod_calibs.items(): 47 | save_kwargs[mod_base + calibration.LLR_RANGE_SUFFIX] = mod_llr_range 48 | save_kwargs[mod_base + calibration.CALIB_TABLE_SUFFIX] = mod_calib 49 | 50 | # save calibration table for reading into mod calibration table 51 | LOGGER.info("Saving calibrations to file.") 52 | mod_bases = list(mod_calibs.keys()) 53 | np.savez( 54 | args.out_filename, 55 | stratify_type=stratify_type, 56 | smooth_nvals=num_calib_vals, 57 | mod_bases=mod_bases, 58 | **save_kwargs, 59 | ) 60 | 61 | 62 | if __name__ == "__main__": 63 | _main(get_parser_calibrate_merge_modified_bases().parse_args()) 64 | -------------------------------------------------------------------------------- /megalodon/constrained_decoding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from megalodon import banding, decode, logging, megalodon_helper as mh 4 | 5 | LOGGER = logging.get_logger() 6 | 7 | 8 | def construct_allowed_bases(seq): 9 | """Construct numpy allowed bases array from string sequence containing 10 | ambiguous bases. 11 | """ 12 | allowed_bases = np.zeros((len(seq), 4), dtype=np.short) 13 | for seq_pos, base in enumerate(seq): 14 | try: 15 | pos_allowed_bases = mh.SINGLE_LETTER_CODE[base] 16 | except KeyError: 17 | raise mh.MegaError( 18 | f"Invalid IUPAC code ({base}) found while performing " 19 | "constrained basecalling." 20 | ) 21 | for pos_allowed_base in pos_allowed_bases: 22 | allowed_bases[seq_pos, mh.ALPHABET.find(pos_allowed_base)] = 1 23 | return allowed_bases 24 | 25 | 26 | def constrained_basecall( 27 | reference, 28 | trans_logprobs, 29 | ref_to_block, 30 | half_bandwidth=mh.DEFAULT_CONSTRAINED_HALF_BW, 31 | ): 32 | """Perform constrained basecalling from initial sequence to ambiguous 33 | reference. 34 | 35 | Args: 36 | reference (str): Reference sequence containing ambiguous bases 37 | trans_logprobs (np.array): 2D float array containing flip-flop transition 38 | log probabilities. Shape should be num_blocks by num_transitions. 39 | num_blocks is signal // stride and num_transitions is the number of 40 | flip-flop transitions (40 for 4 canonical bases). 41 | ref_to_block (np.array): Array containing initial path coordinates from 42 | reference bases to block coordinates in trans_logprobs 43 | half_bandwidth (int): Half bandwidth over which to restrict path 44 | between sequence and blocks. Band will be constructed along 45 | block/signal dimension.
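    Returns:
        str: Sequence decoded under the per-position base constraints;
            contains canonical bases only.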
46 | """ 47 | # if initial mapping starts within trans_logprobs, trim and shift mapping 48 | if ref_to_block[0] != 0: 49 | trans_logprobs = trans_logprobs[ref_to_block[0] :] 50 | ref_to_block = ref_to_block - ref_to_block[0] 51 | # if mapping ends before the end of trans_logprobs, trim 52 | if trans_logprobs.shape[0] > ref_to_block[-1]: 53 | trans_logprobs = trans_logprobs[: ref_to_block[-1]] 54 | allowed_bases = construct_allowed_bases(reference) 55 | sig_band = banding.compute_sig_band( 56 | ref_to_block, np.zeros(len(reference)), half_bandwidth 57 | ) 58 | seq_band = banding.convert_to_seq_band(sig_band) 59 | int_seq = decode.flipflop_constrain_decode( 60 | trans_logprobs, allowed_bases, seq_band 61 | ) 62 | constrained_seq = "".join(mh.ALPHABET[base % 4] for base in int_seq) 63 | return constrained_seq 64 | -------------------------------------------------------------------------------- /megalodon_extras/phase_variants_whatshap_filter.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | 3 | from megalodon import variants 4 | from ._extras_parsers import get_parser_phase_variants_whatshap_filter 5 | 6 | 7 | def is_complex_variant(ref, alts): 8 | # single base swaps aren't complex 9 | if any(len(allele) > 1 for allele in alts + [ref]): 10 | for alt in alts: 11 | simp_ref, simp_alt, _, _ = variants.simplify_var_seq(ref, alt) 12 | # if an allele simplifies to a SNV continue 13 | if len(simp_ref) == 0 and len(simp_alt) == 0: 14 | continue 15 | # if simplified sequence does not leave either allele empty 16 | # then this is a complex variant which cannot be processed by 17 | # whatshap 18 | if len(simp_ref) > 0 and len(simp_alt) > 0: 19 | return True 20 | return False 21 | 22 | 23 | def get_qual(vcf_line): 24 | qual = vcf_line.split()[5] 25 | try: 26 | qual = int(qual) 27 | except ValueError: 28 | qual = 0 29 | return qual 30 | 31 | 32 | def get_pos_ref_alts(vcf_line): 33 | chrm, pos, _, ref, alts = vcf_line.split()[:5] 34 | return chrm, int(pos), ref, alts.split(",") 35 | 36 | 37 | def _main(args): 38 | out_fp = open(args.out_vcf, "w") 39 | filt_fp = ( 40 | None 41 | if args.filtered_records is None 42 | else open(args.filtered_records, "w") 43 | ) 44 | 45 | with open(args.in_vcf) as fp: 46 | prev_line = prev_chrm = prev_end = None 47 | for line in tqdm(fp, desc="Filtering VCF", unit=" lines", smoothing=0): 48 | if line.startswith("#"): 49 | out_fp.write(line) 50 | continue 51 | chrm, start, ref, alts = get_pos_ref_alts(line) 52 | # skip complex variants 53 | if is_complex_variant(ref, alts): 54 | if filt_fp is not None: 55 | filt_fp.write("COMPLEX_VARIANT: " + line) 56 | continue 57 | 58 | if prev_chrm == chrm and prev_end > start: 59 | # records overlap; keep only the higher quality one buffered 60 | if get_qual(line) > get_qual(prev_line): 61 | if filt_fp is not None: 62 | filt_fp.write("OVERLAPPING_VARIANT: " + prev_line) 63 | prev_line = line 64 | else: 65 | if filt_fp is not None: 66 | filt_fp.write("OVERLAPPING_VARIANT: " + line) 67 | continue 68 | 69 | # flush the buffered record now that it cannot overlap 70 | if prev_line is not None: 71 | out_fp.write(prev_line) 72 | prev_line = line 73 | prev_chrm = chrm 74 | prev_end = start + len(ref) 75 | 76 | # flush the final buffered record 77 | if prev_line is not None: 78 | out_fp.write(prev_line) 79 | out_fp.close() 80 | if filt_fp is not None: 81 | filt_fp.close() 82 | 83 | return 84 | 85 | 86 | if __name__ == "__main__": 87 | _main(get_parser_phase_variants_whatshap_filter().parse_args()) 88 | -------------------------------------------------------------------------------- /megalodon/logging.py: -------------------------------------------------------------------------------- 1 |
import os 2 | import sys 3 | import logging 4 | 5 | 6 | LOG_FN = "log.txt" 7 | 8 | 9 | class CustomFormatter(logging.Formatter): 10 | err_fmt = "*" * 100 + "\n\tERROR: %(msg)s\n" + "*" * 100 11 | warn_fmt = "*" * 20 + " WARNING: %(msg)s " + "*" * 20 12 | info_fmt = "[%(asctime)s] %(message)s" 13 | dbg_fmt = ( 14 | "DBG %(asctime)s : %(msg)s --- %(processName)s-" 15 | + "%(threadName)s %(module)s.py:%(lineno)d" 16 | ) 17 | 18 | def __init__(self, fmt="[%(asctime)s] %(levelname)-8s: %(message)s"): 19 | super().__init__(fmt=fmt, datefmt="%H:%M:%S", style="%") 20 | 21 | def format(self, record): 22 | format_orig = self._style._fmt 23 | 24 | # Replace the original format with one customized by logging level 25 | if record.levelno == logging.DEBUG: 26 | self._style._fmt = self.dbg_fmt 27 | elif record.levelno == logging.INFO: 28 | self._style._fmt = self.info_fmt 29 | elif record.levelno == logging.WARNING: 30 | self._style._fmt = self.warn_fmt 31 | elif record.levelno == logging.ERROR: 32 | self._style._fmt = self.err_fmt 33 | result = logging.Formatter.format(self, record) 34 | 35 | self._style._fmt = format_orig 36 | 37 | return result 38 | 39 | 40 | def init_logger( 41 | out_dir=None, out_suffix=None, log_fn=None, quiet=False, silent=False 42 | ): 43 | """Prepare logging output. Output file will be opened if out_dir or log_fn 44 | are specified. out_suffix will be added to the standard log.txt filename in 45 | out_dir (does not apply when log_fn is specified). 46 | 47 | File will include debug and above messages while stderr will include info 48 | and above. If quiet=True, stderr will include warning and above only. 49 | """ 50 | log_fp = None 51 | if out_dir is not None: 52 | log_fn = os.path.join(out_dir, LOG_FN) 53 | if out_suffix is not None: 54 | base_fn, fn_ext = os.path.splitext(log_fn) 55 | log_fn = base_fn + "." + out_suffix + fn_ext 56 | if log_fn is not None: 57 | log_fp = logging.FileHandler(log_fn, "w") 58 | log_fp.setLevel(logging.DEBUG) 59 | log_fp.setFormatter(CustomFormatter()) 60 | 61 | console = logging.StreamHandler() 62 | if silent: 63 | console.setLevel(logging.CRITICAL) 64 | elif quiet: 65 | console.setLevel(logging.WARNING) 66 | else: 67 | console.setLevel(logging.INFO) 68 | console.setFormatter(CustomFormatter()) 69 | 70 | root_logger = logging.getLogger("") 71 | root_logger.setLevel(logging.DEBUG) 72 | if log_fp is not None: 73 | root_logger.addHandler(log_fp) 74 | root_logger.addHandler(console) 75 | 76 | 77 | def get_logger(module_name=""): 78 | return logging.getLogger(module_name) 79 | 80 | 81 | if __name__ == "__main__": 82 | sys.stderr.write("This is a module. See commands with `megalodon -h`\n") 83 | sys.exit(1) 84 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = megalodon 3 | version = attr: megalodon.__init__.__version__ 4 | description = Nanopore base calling augmentation.
5 | license = ont_public_licence 6 | long-description = file: README.rst 7 | long-description-content-type = text/x-rst; charset=UTF-8; variant=GFM 8 | url = https://github.com/nanoporetech/megalodon 9 | author = Marcus Stoiber 10 | maintainer = Marcus Stoiber 11 | maintainer_email = marcus.stoiber@nanoporetech.com 12 | platforms = any 13 | classifiers = 14 | Development Status :: 4 - Beta 15 | Environment :: Console 16 | Environment :: GPU 17 | Intended Audience :: Developers 18 | Intended Audience :: Science/Research 19 | License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0) 20 | Natural Language :: English 21 | Operating System :: Unix 22 | Programming Language :: Python :: 3 :: Only 23 | Topic :: Scientific/Engineering :: Artificial Intelligence 24 | Topic :: Scientific/Engineering :: Bio-Informatics 25 | 26 | [options] 27 | zip_safe = False 28 | packages = 29 | megalodon 30 | megalodon_extras 31 | package_dir = 32 | =. 33 | include_package_data = True 34 | python_requires = 35 | >=3.6 36 | # DON'T CHANGE THE FOLLOWING LINE! IT WILL BE UPDATED BY PYSCAFFOLD! 37 | setup_requires = pyscaffold>=3.2a0,<3.3a0 38 | 39 | # project dependencies (testing deps listed in next section) 40 | install_requires = 41 | h5py >= 2.2.1 42 | numpy >= 1.9.0 43 | scipy >= 1.1.0 44 | Cython >= 0.25.2 45 | mappy >= 2.16 46 | pysam >= 0.15 47 | ont_fast5_api >= 3.2 48 | tqdm >= 2.17 49 | ont-pyguppy-client-lib 50 | scikit-learn 51 | seaborn 52 | ont-remora >= 1.0 53 | 54 | [options.package_data] 55 | * = model_data/*/*.npz 56 | 57 | [options.extras_require] 58 | testing = 59 | pytest 60 | black 61 | sphinx 62 | sphinx-rtd-theme 63 | sphinx-argparse 64 | 65 | [options.packages.find] 66 | where = 67 | megalodon/ 68 | exclude = 69 | docs 70 | tests 71 | .eggs 72 | 73 | [options.entry_points] 74 | console_scripts = 75 | megalodon = megalodon.__main__:_main 76 | megalodon_extras = megalodon_extras.__main__:_main 77 | 78 | [test] 79 | extras = True 80 | 81 | [tool:pytest] 82 | addopts = 83 | --cov megalodon --verbose --ignore *.egg* 84 | norecursedirs = 85 | dist 86 | build 87 | .tox 88 | .eggs 89 | testpaths = tests 90 | 91 | [aliases] 92 | dists = bdist_wheel 93 | 94 | [bdist_wheel] 95 | # Use this option if your package is pure-python 96 | universal = 1 97 | 98 | [build_sphinx] 99 | source_dir = docs 100 | build_dir = build/sphinx 101 | 102 | [devpi:upload] 103 | # Options for the devpi: PyPI server and packaging tool 104 | # VCS export must be deactivated since we are using setuptools-scm 105 | no-vcs = 1 106 | formats = bdist_wheel 107 | 108 | [pyscaffold] 109 | # PyScaffold's parameters when the project was created. 110 | # This will be used when updating. Do not change! 111 | version = 3.2.3 112 | package = megalodon 113 | extensions = 114 | markdown 115 | gitlab 116 | 117 | [flake8] 118 | # ignore: 119 | # E203 whitespace before ':' 120 | # W503 line break before binary operator 121 | ignore = E203,W503 122 | max-line-length = 80 123 | -------------------------------------------------------------------------------- /docs/computing_considerations.rst: -------------------------------------------------------------------------------- 1 | ************************ 2 | Computing Considerations 3 | ************************ 4 | 5 | This page aims to describe the Megalodon processing workflow, highlighting relevant computing considerations. 
6 | 7 | ------------------ 8 | Raw Signal Loading 9 | ------------------ 10 | 11 | Raw signal is loaded from either single- or multi-FAST5 format via the ``ont_fast5_api``. 12 | Raw signal is loaded within a single process and distributed out to the worker processes. 13 | The input queue status bar indicates how many read signals have been loaded and are awaiting processing. 14 | If this status bar is often empty, raw signal extraction from FAST5 files is likely a processing bottleneck. 15 | 16 | The reads queue is filled from a separate worker process. 17 | This process will enumerate FAST5 files and all read_ids stored within these files, but per-read processing will begin as soon as the first read_id/file is found. 18 | Users may notice a period of time where the progress bar does not have a known total number of reads. 19 | Once read enumeration is complete the progress bar will update to include the total number of reads found and ETA for run completion. 20 | 21 | ------------ 22 | Base Calling 23 | ------------ 24 | 25 | Basecalling is performed by the pyguppy backend. 26 | Basecalling consists of running the neural network and then decoding this output. 27 | See `guppy documentation on the community page (login required) `_ for more details. 28 | Parameters can be passed directly to the Guppy server initialization call via the ``--guppy-params`` argument. 29 | 30 | ----------------- 31 | Reference Mapping 32 | ----------------- 33 | 34 | Read mapping is completed using the ``minimap2`` Python interface (``mappy``). 35 | The reference index is loaded into shared memory. 36 | A separate thread is linked to each per-read processing worker in order to access the shared memory index. 37 | Thus users may notice threads opened for this processing. 38 | These threads will generally consume less compute than the worker processes. 39 | 40 | --------------------------------- 41 | Variant and Modified Base Calling 42 | --------------------------------- 43 | 44 | Sequence variant and modified base calling is computed within the per-read processing workers using CPU resources. 45 | Generally, this portion of processing will consume a minority of the compute resources. 46 | Proposing many variants (e.g. all possible 2+ base indels) or modified bases in all contexts may create a bottleneck at this portion of processing. 47 | Internal testing shows that proposing all possible single base substitutions adds minimal processing load at this stage. 48 | 49 | --------------- 50 | Writing to Disk 51 | --------------- 52 | 53 | As of version 2.0, the status of output queues is displayed by default. 54 | As of version 2.2, the status of the input signal extraction queue is also displayed. 55 | If any of the output status bars indicate a full queue, Megalodon will stall waiting on that process to write data to disk. 56 | If the input signal extraction queue is often empty, raw signal extraction from FAST5 files is likely a processing bottleneck. 57 | Moving the input data or the ``--output-directory``, respectively, to a location with faster disk I/O should improve performance.
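As a rough sketch, the considerations above map onto an invocation like the following; all paths and parameter values here are illustrative placeholders (including the Guppy server ``--num_callers`` setting) and should be tuned to the available hardware::

    megalodon fast5s/ \
        --reference reference.fa \
        --outputs mappings \
        --output-directory /fast_scratch/megalodon_results \
        --processes 16 --devices 0 \
        --guppy-params "--num_callers 4"

Here ``--processes`` controls the number of per-read worker processes, ``--devices`` selects the GPU(s) used by the Guppy basecalling backend, and placing ``--output-directory`` on fast storage addresses the disk I/O considerations above.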
58 | -------------------------------------------------------------------------------- /megalodon_extras/modified_bases_create_ground_truth.py: -------------------------------------------------------------------------------- 1 | from megalodon import megalodon_helper as mh 2 | from ._extras_parsers import get_parser_modified_bases_create_ground_truth 3 | 4 | 5 | def _main(args): 6 | samp_cov, samp_mod_cov = mh.parse_bed_methyls( 7 | args.bed_methyl_files, strand_offset=args.strand_offset 8 | ) 9 | with open(args.out_csv, "w") as gt_fp: 10 | 11 | def write_site(chrom, strand, pos, is_mod): 12 | # write one ground truth site; when a strand offset is supplied 13 | # also write the site on the offset strand 14 | gt_fp.write( 15 | ",".join( 16 | map( 17 | str, 18 | (chrom, mh.int_strand_to_str(strand), pos, is_mod), 19 | ) 20 | ) 21 | + "\n" 22 | ) 23 | if args.strand_offset is not None: 24 | gt_fp.write( 25 | ",".join( 26 | map( 27 | str, 28 | ( 29 | chrom, 30 | mh.int_strand_to_str(strand), 31 | pos + args.strand_offset, 32 | is_mod, 33 | ), 34 | ) 35 | ) 36 | + "\n" 37 | ) 38 | 39 | for (chrom, strand), ctg_cov in samp_cov.items(): 40 | for pos, cov in ctg_cov.items(): 41 | if cov < args.coverage_threshold: 42 | continue 43 | pct_mod = 100 * samp_mod_cov[(chrom, strand)][pos] / cov 44 | if pct_mod <= args.pct_mod_thresholds[0]: 45 | write_site(chrom, strand, pos, "False") 46 | elif pct_mod >= args.pct_mod_thresholds[1]: 47 | write_site(chrom, strand, pos, "True") 48 | 49 | 50 | if __name__ == "__main__": 51 | _main(get_parser_modified_bases_create_ground_truth().parse_args()) 52 | -------------------------------------------------------------------------------- /megalodon_extras/variants_atomize.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import mappy 4 | 5 | from megalodon import logging, mapping, megalodon_helper as mh, variants 6 | from ._extras_parsers import get_parser_variants_atomize 7 | 8 | 9 | LOGGER = logging.get_logger() 10 | 11 | HEADER = ["##fileformat=VCFv4.1", "##source=megalodon_atomized"] 12 | CONTIG_HEADER_LINE = "##contig=<ID={},length={}>" 13 | COMMAND_HEADER_LINE = '##command="{}"' 14 | FIELDS_LINE = "#CHROM POS ID REF ALT QUAL FILTER" + " INFO FORMAT SAMPLE" 15 | RECORD_LINE = "{chrm}\t{pos}\t{rid}\t{ref}\t{alts}\t.\t.\t{info}\t.\t.\n" 16 | 17 | 18 | def _main(args): 19 | logging.init_logger() 20 | LOGGER.info("Loading reference") 21 | aligner = mappy.Aligner( 22 | str(args.reference), preset=str("map-ont"), best_n=1 23 | ) 24 | LOGGER.info("Loading variants") 25 | var_data = variants.VarInfo( 26 | args.in_vcf, aligner, args.max_indel_size, keep_var_fp_open=True 27 | ) 28 | contigs = var_data.variants_idx.header.contigs.values() 29 | LOGGER.info("Atomizing variants") 30 | with open(args.out_vcf, "w") as out_vars: 31 | # preprocess contigs to set contig lengths for VCF header 32 | ctg_lens = {} 33 | for ctg in contigs: 34 | chrm_seq = aligner.seq(ctg.name) 35 | if len(chrm_seq) != ctg.length: 36 | LOGGER.warning( 37 | ( 38 | "Mismatched length for contig {} between " 39 | + "reference ({}) and input VCF ({}); using length " 40 | "from reference" 41 | ).format(ctg.name, len(chrm_seq), ctg.length) 42 | ) 43 | ctg_lens[ctg.name] = len(chrm_seq) 44 | 45 | out_vars.write( 46 | "\n".join( 47 | HEADER 48 | + [ 49 | CONTIG_HEADER_LINE.format(ctg, ctg_len) 50 | for ctg,
ctg_len in ctg_lens.items() 51 | ] 52 | + [ 53 | variants.CONTEXT_BASE_MI_LINE, 54 | COMMAND_HEADER_LINE.format(" ".join(sys.argv)), 55 | FIELDS_LINE, 56 | ] 57 | ) 58 | + "\n" 59 | ) 60 | for ctg in contigs: 61 | chrm_seq = aligner.seq(ctg.name) 62 | map_pos = mapping.MAP_POS( 63 | chrm=ctg.name, 64 | strand=None, 65 | start=0, 66 | end=len(chrm_seq), 67 | q_trim_start=None, 68 | q_trim_end=None, 69 | ) 70 | for var in var_data.fetch_read_variants( 71 | map_pos, mh.seq_to_int(chrm_seq) 72 | ): 73 | out_vars.write( 74 | RECORD_LINE.format( 75 | chrm=ctg.name, 76 | pos=var.ref_start + 1, 77 | rid=var.id, 78 | ref=var.ref, 79 | alts=",".join(var.alts), 80 | info=variants.HAS_CONTEXT_BASE_TAG 81 | if var.has_context_base 82 | else ".", 83 | ) 84 | ) 85 | 86 | LOGGER.info("Indexing output variant file") 87 | variants.index_variants(args.out_vcf) 88 | 89 | 90 | if __name__ == "__main__": 91 | _main(get_parser_variants_atomize().parse_args()) 92 | -------------------------------------------------------------------------------- /megalodon_extras/modified_bases_update_database.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | from time import time 3 | 4 | from tqdm import tqdm 5 | 6 | from megalodon import logging, mods 7 | from ._extras_parsers import get_parser_modified_bases_update_database 8 | 9 | 10 | DEBUG = False 11 | N_DEBUG = 50000000 12 | 13 | INSERT_BATCH_SIZE = 10000 14 | 15 | LOGGER = logging.get_logger() 16 | 17 | 18 | def get_read_id(uuid, read_ids, new_db): 19 | try: 20 | read_id = read_ids[uuid] 21 | except KeyError: 22 | new_db.cur.execute("INSERT INTO read (uuid) VALUES (?)", (uuid,)) 23 | read_id = new_db.cur.lastrowid 24 | read_ids[uuid] = read_id 25 | return read_id, read_ids 26 | 27 | 28 | def insert_data(new_db, insert_batch): 29 | new_db.cur.executemany("INSERT INTO data VALUES (?,?,?,?)", insert_batch) 30 | 31 | 32 | def fill_mods(old_cur, new_db): 33 | read_ids = {} 34 | n_recs = old_cur.execute("SELECT MAX(rowid) FROM mods").fetchone()[0] 35 | old_cur.execute("SELECT * FROM mods") 36 | insert_batch = [] 37 | for i, ( 38 | uuid, 39 | chrm, 40 | strand, 41 | pos, 42 | score, 43 | mod_base, 44 | motif, 45 | motif_pos, 46 | raw_motif, 47 | ) in tqdm( 48 | enumerate(old_cur), total=n_recs, smoothing=0, dynamic_ncols=True 49 | ): 50 | if DEBUG and i > N_DEBUG: 51 | break 52 | read_id, read_ids = get_read_id(uuid, read_ids, new_db) 53 | pos_id = new_db.get_pos_id_or_insert(chrm, strand, pos) 54 | mod_base_id = new_db.get_mod_base_id_or_insert( 55 | mod_base, motif, motif_pos, raw_motif 56 | ) 57 | insert_batch.append((score, pos_id, mod_base_id, read_id)) 58 | if len(insert_batch) >= INSERT_BATCH_SIZE: 59 | insert_data(new_db, insert_batch) 60 | insert_batch = [] 61 | 62 | if len(insert_batch) > 0: 63 | insert_data(new_db, insert_batch) 64 | 65 | 66 | def fill_refs(old_cur, new_db): 67 | old_cur.execute("SELECT DISTINCT chrm FROM mods") 68 | for (ref_name,) in old_cur: 69 | new_db.insert_chrm(ref_name) 70 | new_db.create_chrm_index() 71 | 72 | 73 | def _main(args): 74 | raise NotImplementedError( 75 | "The previous version of this script updated version 0 to " 76 | + "version 1. Upgrade to version 2 not yet implemented."
77 | ) 78 | logging.init_logger() 79 | old_db = sqlite3.connect(args.old_db) 80 | old_cur = old_db.cursor() 81 | new_db = mods.ModsDb(args.new_db, read_only=False) 82 | 83 | LOGGER.info("Reading/loading reference record names.") 84 | fill_refs(old_cur, new_db) 85 | 86 | LOGGER.info("Reading/loading modified base scores.") 87 | fill_mods(old_cur, new_db) 88 | 89 | if not DEBUG: 90 | new_db.create_mod_index() 91 | t0 = time() 92 | LOGGER.info("Creating positions index.") 93 | new_db.create_pos_index() 94 | t1 = time() 95 | LOGGER.info("Took {} seconds.".format(t1 - t0)) 96 | LOGGER.info("Creating scores position index.") 97 | new_db.create_data_covering_index() 98 | LOGGER.info("Took {} seconds.".format(time() - t1)) 99 | new_db.close() 100 | 101 | 102 | if __name__ == "__main__": 103 | _main(get_parser_modified_bases_update_database().parse_args()) 104 | -------------------------------------------------------------------------------- /megalodon_extras/merge_aggregated_modified_bases.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tqdm import tqdm 3 | 4 | from megalodon import logging, megalodon_helper as mh, mods 5 | from ._extras_parsers import get_parser_merge_aggregated_modified_bases 6 | 7 | 8 | def write_unsorted_merge(in_fns, out_fp, bar): 9 | cov, mod_cov = mh.parse_bed_methyls(in_fns) 10 | for chrm in sorted( 11 | mh.RefName(chrm) for chrm in set(chrm for chrm, _ in cov) 12 | ): 13 | # convert back to string after sorting 14 | chrm = str(chrm) 15 | s_poss = [] 16 | if (chrm, 1) in cov: 17 | s_poss.extend([(pos, 1) for pos in cov[(chrm, 1)]]) 18 | if (chrm, -1) in cov: 19 | s_poss.extend([(pos, -1) for pos in cov[(chrm, -1)]]) 20 | for pos, strand in sorted(s_poss): 21 | pcov = cov[(chrm, strand)][pos] 22 | out_fp.write( 23 | mods.BEDMETHYL_TMPLT.format( 24 | chrom=chrm, 25 | pos=pos, 26 | end=pos + 1, 27 | strand=mh.int_strand_to_str(strand), 28 | cov=pcov, 29 | score=min(int(pcov), 1000), 30 | perc=np.around( 31 | mod_cov[(chrm, strand)][pos] / pcov * 100, 1 32 | ), 33 | ) 34 | + "\n" 35 | ) 36 | bar.update() 37 | 38 | 39 | def write_batch(out_fp, chrms, poss, strands, mod_covs, covs): 40 | covs = np.array(covs, dtype=int) 41 | mod_covs = np.array(mod_covs, dtype=int) 42 | pct_mods = np.zeros_like(covs, dtype=float) 43 | valid_covs = covs > 0 44 | pct_mods[valid_covs] = np.around( 45 | mod_covs[valid_covs] * 100 / covs[valid_covs], 1 46 | ) 47 | out_fp.write( 48 | "\n".join( 49 | mods.BEDMETHYL_TMPLT.format( 50 | chrom=chrm, 51 | pos=pos, 52 | end=pos + 1, 53 | strand=strand, 54 | cov=cov, 55 | score=score, 56 | perc=pct_mod, 57 | ) 58 | for chrm, pos, strand, cov, score, pct_mod in zip( 59 | chrms, 60 | poss, 61 | strands, 62 | covs, 63 | np.minimum(covs, 1000), 64 | pct_mods, 65 | ) 66 | ) 67 | + "\n" 68 | ) 69 | 70 | 71 | def write_sorted_merge(in_fns, out_fp, bar, batch_size=50000): 72 | chrms, poss, strands, mod_covs, covs = [], [], [], [], [] 73 | for chrm, pos, strand, mod_cov, cov in mh.iter_merged_bedmethyl( 74 | [mh.iter_bed_methyl_recs(in_fn) for in_fn in in_fns] 75 | ): 76 | chrms.append(chrm) 77 | poss.append(pos) 78 | strands.append(strand) 79 | mod_covs.append(mod_cov) 80 | covs.append(cov) 81 | bar.update() 82 | if len(chrms) >= batch_size: 83 | write_batch(out_fp, chrms, poss, strands, mod_covs, covs) 84 | chrms, poss, strands, mod_covs, covs = [], [], [], [], [] 85 | if len(chrms) > 0: 86 | write_batch(out_fp, chrms, poss, strands, mod_covs, covs) 87 | 88 | 89 | def _main(args):
logging.init_logger() 91 | with open(args.output_bed_methyl_file, "w") as out_fp, tqdm( 92 | desc="Records Written", smoothing=0 93 | ) as bar: 94 | if args.sorted_inputs: 95 | write_sorted_merge(args.bed_methyl_files, out_fp, bar) 96 | else: 97 | write_unsorted_merge(args.bed_methyl_files, out_fp, bar) 98 | 99 | 100 | if __name__ == "__main__": 101 | _main(get_parser_merge_aggregated_modified_bases().parse_args()) 102 | -------------------------------------------------------------------------------- /docs/variant_phasing.rst: -------------------------------------------------------------------------------- 1 | *************** 2 | Variant Phasing 3 | *************** 4 | 5 | This page walks through the steps to use megalodon in conjunction with `whatshap `_ to produce the highest quality phased variant calls. 6 | 7 | This pipeline produces the ``variants.haploid_merged.vcf`` file containing high quality phased variant calls. 8 | The intermediate ``variant_mappings.haplotagged.bam`` file can be of particular interest to investigate variant calls at the per-read level. 9 | This file contains the reference sequence for each read annotated only with the proposed variant calls, including quality scores for SNVs. 10 | Thus random read errors are masked allowing for more accurate analysis on proposed variants. 11 | See an example of this per-read variant genome browser visualization below. 12 | 13 | ---- 14 | 15 | .. figure:: _images/whatshap_haplotagged_variant_viz.png 16 | :align: center 17 | :width: 600 18 | 19 | Genome browser visualization. Megalodon variant_mappings haplotagged with whatshap (upper panel) and raw read mappings (lower panel). 20 | 21 | ---- 22 | 23 | -------- 24 | Workflow 25 | -------- 26 | 27 | :: 28 | 29 | reads_dir="fast5s" 30 | ref="reference.fasta" 31 | variants_vcf="variants.vcf.gz" 32 | out_dir="megalodon_results" 33 | nproc=16 34 | gpu_devices="0 1" 35 | 36 | # run megalodon to produce variant_mappings 37 | megalodon \ 38 | $reads_dir --outputs mappings variants variant_mappings \ 39 | --reference $ref --variant-filename $variants_vcf \ 40 | --output-directory $out_dir \ 41 | --processes $nproc --devices $gpu_devices \ 42 | --verbose-read-progress 3 43 | 44 | # filter whatshap incompatible variants and create indices 45 | megalodon_extras \ 46 | phase_variants whatshap_filter \ 47 | $out_dir/variants.sorted.vcf \ 48 | $out_dir/variants.sorted.whatshap_filt.vcf \ 49 | --filtered-records $out_dir/whatshap_filt.txt 50 | bgzip $out_dir/variants.sorted.whatshap_filt.vcf 51 | tabix $out_dir/variants.sorted.whatshap_filt.vcf.gz 52 | samtools index $out_dir/variant_mappings.sorted.bam 53 | 54 | # run whatshap with produced mappings and variants 55 | whatshap \ 56 | phase --distrust-genotypes \ 57 | -o $out_dir/variants.phased.vcf \ 58 | $out_dir/variants.sorted.whatshap_filt.vcf.gz \ 59 | $out_dir/variant_mappings.sorted.bam 60 | 61 | # assign haplotypes to reads 62 | bgzip $out_dir/variants.phased.vcf 63 | tabix $out_dir/variants.phased.vcf.gz 64 | whatshap \ 65 | haplotag $out_dir/variants.phased.vcf.gz \ 66 | $out_dir/variant_mappings.sorted.bam \ 67 | -o $out_dir/variant_mappings.haplotagged.bam 68 | 69 | # extract haplotype reads and call haploid variants 70 | megalodon_extras \ 71 | phase_variants extract_haplotype_reads \ 72 | $out_dir/variant_mappings.haplotagged.bam \ 73 | $out_dir/variant_mappings 74 | megalodon_extras \ 75 | aggregate run \ 76 | --megalodon-directory $out_dir --output-suffix haplotype_1 \ 77 | --read-ids-filename 
$out_dir/variant_mappings.haplotype_1_read_ids.txt \ 78 | --outputs variants --haploid --processes $nproc 79 | megalodon_extras \ 80 | aggregate run \ 81 | --megalodon-directory $out_dir --output-suffix haplotype_2 \ 82 | --read-ids-filename $out_dir/variant_mappings.haplotype_2_read_ids.txt \ 83 | --outputs variants --haploid --processes $nproc 84 | 85 | # merge haploid variants to produce diploid variants 86 | megalodon_extras \ 87 | phase_variants merge_haploid_variants \ 88 | $out_dir/variants.sorted.vcf.gz \ 89 | $out_dir/variants.haplotype_1.sorted.vcf.gz \ 90 | $out_dir/variants.haplotype_2.sorted.vcf.gz \ 91 | --out-vcf $out_dir/variants.haploid_merged.vcf 92 | -------------------------------------------------------------------------------- /docs/model_training.rst: -------------------------------------------------------------------------------- 1 | ************************ 2 | Megalodon Model Training 3 | ************************ 4 | 5 | This page describes how to use Megalodon to prepare training data and train a new basecalling model using Taiyaki. 6 | For modified base data preparation and model training documentation see the :doc:`modified base training documentation ` page. 7 | 8 | .. note:: 9 | 10 | Preparation of training data via Megalodon requires a basecalling model that can produce valid reference mappings. 11 | If valid reference mappings using ``minimap2`` cannot be produced for a set of reads, model training will not proceed successfully. 12 | 13 | ---------------- 14 | Data Preparation 15 | ---------------- 16 | 17 | To produce a training data ("mapped signal") file the ``--outputs signal_mappings`` argument should be added to a Megalodon call. 18 | This will produce a ``signal_mappings.hdf5`` file in the specified Megalodon output directory. 19 | For each read producing a valid reference mapping, this file contains a mapping between the raw signal and the mapped reference bases. 20 | This file can then be directly passed to the Taiyaki ``train_flipflop.py`` command for model training. 21 | 22 | :: 23 | 24 | # run megalodon; output signal mappings 25 | megalodon raw_fast5s/ \ 26 | --outputs signal_mappings \ 27 | --reference reference.fa \ 28 | --devices 0 --processes 40 29 | 30 | # run taiyaki training 31 | train_flipflop.py ./taiyaki/models/mLstm_flipflop.py \ 32 | megalodon_results/signal_mappings.hdf5 --device 0 33 | 34 | Once training completes, the ``training/model_final.checkpoint`` contains the model. 35 | This can be converted to a guppy compatible model with the ``taiyaki/bin/dump_json.py`` script. 36 | A guppy config with appropriate settings should also be produced for new models. 37 | 38 | .. note:: 39 | 40 | For optimal performance, it is recommended that the ``OMP_NUM_THREADS`` unix environment variable be set to ``1`` for the above Megalodon command and a larger value for the Taiyaki training command. 41 | 42 | 43 | ---------------------- 44 | Signal Mapping Options 45 | ---------------------- 46 | 47 | Several options are available to control the behavior of the ``signal_mappings`` output. 48 | 49 | - ``--ref-length-range`` 50 | 51 | - Only allow reads with a reference mapping length within this range into the output. 52 | - ``--ref-percent-identity-threshold`` 53 | 54 | - Only include reads with higher mapping percent identity in signal_mappings output. 55 | - ``--ref-percent-coverage-threshold`` 56 | 57 | - Only include reads with higher read alignment coverage in signal_mappings output. 
58 | - ``--ref-include-variants`` 59 | 60 | - This option replaces the reference sequence with more likely proposed alternative sequences as called in the ``per_read_variants`` output. 61 | - Cannot specify both this option and ``--ref-include-mods``. 62 | 63 | 64 | --------------------- 65 | Megalodon Calibration 66 | --------------------- 67 | 68 | When a new model is trained, the produced scores must be calibrated to achieve optimal aggregated results (over reads). 69 | Once produced, calibration files can be passed to Megalodon via the ``--variant-calibration-filename`` and ``--mod-calibration-filename`` arguments. 70 | 71 | Sequence variant calibration requires a ground truth against which to compute scores. 72 | For sequence variants, a high quality reference for a set of reads will suffice for this requirement. 73 | Random sequence variants are proposed and scored in order to create distributions over which to calibrate the produced scores. 74 | In order to create a sequence variant calibration file, run ``megalodon/scripts/generate_ground_truth_variant_llr_scores.py`` followed by ``megalodon/scripts/calibrate_variant_llr_scores.py``. 75 | The optional ``--out-pdf`` provides visualization of the likelihood ratio score correction. 76 | -------------------------------------------------------------------------------- /test/test_api.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from megalodon import backends, logging, megalodon_helper as mh 4 | 5 | 6 | LOGGER = logging.get_logger() 7 | 8 | 9 | def open_pyguppy_backend(args): 10 | args.do_not_use_guppy_server = False 11 | try: 12 | mh.mkdir(args.output_directory, False) 13 | except mh.MegaError: 14 | LOGGER.warning( 15 | "Guppy logs output directory exists. Potentially overwriting guppy " 16 | "logs." 17 | ) 18 | backend_params = backends.parse_backend_params(args) 19 | model_info = None 20 | try: 21 | model_info = backends.ModelInfo(backend_params, args.processes) 22 | # if spawning multiple workers run this inside newly spawned processes 23 | model_info.prep_model_worker() 24 | LOGGER.info(model_info.get_alphabet_str()) 25 | LOGGER.info( 26 | "Model structure:\n\tStride: {}\n\tState size: {}".format( 27 | model_info.stride, model_info.output_size 28 | ) 29 | ) 30 | # use model_info.iter_basecalled_reads to basecall reads and return 31 | # relevant signal anchored information. 32 | model_info.client.disconnect() 33 | finally: 34 | # ensure guppy server is closed in finally block 35 | if model_info is not None: 36 | model_info.close() 37 | 38 | 39 | def get_parser(): 40 | parser = argparse.ArgumentParser() 41 | 42 | parser.add_argument( 43 | "--log-directory", 44 | default=".", 45 | help="Directory to output megalodon log. Default: current " 46 | + "working directory", 47 | ) 48 | 49 | pyg_grp = parser.add_argument_group("Guppy Backend Arguments") 50 | pyg_grp.add_argument( 51 | "--guppy-config", 52 | default=mh.DEFAULT_GUPPY_CFG, 53 | help="Guppy config. Default: %(default)s", 54 | ) 55 | pyg_grp.add_argument( 56 | "--guppy-server-path", 57 | default=mh.DEFAULT_GUPPY_SERVER_PATH, 58 | help="Path to guppy server executable. Default: %(default)s", 59 | ) 60 | pyg_grp.add_argument( 61 | "--guppy-server-port", 62 | type=int, 63 | help="Guppy server port. Default: Guppy auto", 64 | ) 65 | pyg_grp.add_argument( 66 | "--guppy-params", 67 | help="Extra guppy server parameters. Main purpose for optimal " 68 | + "performance based on compute environment. 
Quote parameters to " 69 | + "avoid them being parsed by megalodon.", 70 | ) 71 | pyg_grp.add_argument( 72 | "--guppy-concurrent-reads", 73 | type=int, 74 | default=mh.DEFAULT_GUPPY_CONCURRENT_READS, 75 | help="Number of reads to process concurrently within each worker " 76 | "process. Default: %(default)d", 77 | ) 78 | pyg_grp.add_argument( 79 | "--guppy-timeout", 80 | type=float, 81 | default=mh.DEFAULT_GUPPY_TIMEOUT, 82 | help="Timeout to wait for guppy server to call a single read in " 83 | + "seconds. Default: %(default)f", 84 | ) 85 | pyg_grp.add_argument( 86 | "--output-directory", 87 | default="guppy_logs", 88 | help="Directory to output guppy logs. Default: %(default)s", 89 | ) 90 | pyg_grp.add_argument( 91 | "--devices", 92 | nargs="+", 93 | help="GPU devices for guppy basecalling backend.", 94 | ) 95 | pyg_grp.add_argument( 96 | "--processes", 97 | type=int, 98 | default=1, 99 | help="Number of parallel processes. Default: %(default)d", 100 | ) 101 | 102 | return parser 103 | 104 | 105 | def main(args): 106 | logging.init_logger(args.log_directory) 107 | open_pyguppy_backend(args) 108 | 109 | 110 | if __name__ == "__main__": 111 | main(get_parser().parse_args()) 112 | -------------------------------------------------------------------------------- /megalodon_extras/calibrate_generate_modified_base_stats.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | 4 | from megalodon import logging, megalodon_helper as mh, mods 5 | from ._extras_parsers import get_parser_calibrate_generate_modified_base_stats 6 | 7 | 8 | LOGGER = logging.get_logger() 9 | 10 | 11 | def output_mods_data( 12 | all_mod_llrs, all_can_llrs, mod_base_set, exclude_mod_bases, out_fn 13 | ): 14 | LOGGER.info("Merging modified base data") 15 | all_mod_bases = list( 16 | set(all_mod_llrs.keys()).intersection(all_can_llrs.keys()) 17 | ) 18 | if len(set(all_mod_llrs.keys()).difference(all_mod_bases)) > 0: 19 | LOGGER.warning( 20 | "Modified base(s) found in modified dataset which were not " 21 | + "found in canonical dataset: {}".format( 22 | ",".join(set(all_mod_llrs.keys()).difference(all_mod_bases)) 23 | ) 24 | ) 25 | if len(set(all_can_llrs.keys()).difference(all_mod_bases)) > 0: 26 | LOGGER.warning( 27 | "Modified base(s) found in canonical dataset which were " 28 | + "not found in modified dataset: {}".format( 29 | ",".join(set(all_can_llrs.keys()).difference(all_mod_bases)) 30 | ) 31 | ) 32 | if mod_base_set is not None: 33 | all_mod_bases = list(set(all_mod_bases).intersection(mod_base_set)) 34 | if len(all_mod_bases) == 0: 35 | LOGGER.error( 36 | ( 37 | "No modified bases to process.\n\tModified bases from " 38 | + "results: {}\n\tModified base set: {}" 39 | ).format(",".join(all_mod_bases), ",".join(mod_base_set)) 40 | ) 41 | if exclude_mod_bases is not None: 42 | all_mod_bases = list(set(all_mod_bases).difference(exclude_mod_bases)) 43 | if len(all_mod_bases) == 0: 44 | LOGGER.error( 45 | ( 46 | "No modified bases to process.\n\tModified bases from " 47 | + "results: {}\n\tExcluded modified bases: {}" 48 | ).format(",".join(all_mod_bases), ",".join(exclude_mod_bases)) 49 | ) 50 | mod_base_stats = {mods.GT_ALL_MOD_BASE_STR: all_mod_bases} 51 | for mod_base in all_mod_bases: 52 | mod_base_stats[mods.GT_MOD_LLR_STR.format(mod_base)] = all_mod_llrs[ 53 | mod_base 54 | ] 55 | mod_base_stats[mods.GT_CAN_LLR_STR.format(mod_base)] = all_can_llrs[ 56 | mod_base 57 | ] 58 | np.savez(out_fn, **mod_base_stats) 59 | 60 | 61 | def _main(args): 62
| logging.init_logger(quiet=args.quiet) 63 | 64 | if ( 65 | args.ground_truth_data is None 66 | and args.control_megalodon_results_dir is None 67 | ): 68 | LOGGER.error( 69 | "Must provide either --control-megalodon-results-dir or " 70 | + "--ground-truth-data" 71 | ) 72 | sys.exit() 73 | 74 | db_fn = mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME) 75 | if args.ground_truth_data is not None: 76 | LOGGER.info("Parsing ground truth data") 77 | gt_mod_pos, gt_can_pos = mh.parse_ground_truth_file( 78 | args.ground_truth_data, include_strand=args.strand_specific_sites 79 | ) 80 | LOGGER.info( 81 | ( 82 | "Loaded ground truth data with {} modified sites and {} " 83 | + "canonical sites." 84 | ).format(len(gt_mod_pos), len(gt_can_pos)) 85 | ) 86 | LOGGER.info( 87 | "Reading ground truth modified base statistics from " + "database." 88 | ) 89 | all_mod_llrs, all_can_llrs = mods.extract_stats_at_valid_sites( 90 | db_fn, 91 | [gt_mod_pos, gt_can_pos], 92 | include_strand=args.strand_specific_sites, 93 | ) 94 | else: 95 | LOGGER.info( 96 | "Reading modified sample statistics from database" 97 | ) 98 | all_mod_llrs = mods.extract_all_stats(db_fn) 99 | LOGGER.info( 100 | "Reading canonical sample statistics from " 101 | + "control sample database" 102 | ) 103 | all_can_llrs = mods.extract_all_stats( 104 | mh.get_megalodon_fn( 105 | args.control_megalodon_results_dir, mh.PR_MOD_NAME 106 | ) 107 | ) 108 | 109 | mod_summary = [ 110 | ( 111 | mod, 112 | len(all_mod_llrs[mod]) if mod in all_mod_llrs else 0, 113 | len(all_can_llrs[mod]) if mod in all_can_llrs else 0, 114 | ) 115 | for mod in set(all_mod_llrs).union(all_can_llrs) 116 | ] 117 | LOGGER.info( 118 | "Data summary:\n\tmod\tmod_N\tcan_N\n" 119 | + "\n".join("\t" + "\t".join(map(str, x)) for x in mod_summary) 120 | ) 121 | output_mods_data( 122 | all_mod_llrs, 123 | all_can_llrs, 124 | args.modified_bases_set, 125 | args.exclude_modified_bases, 126 | args.out_filename, 127 | ) 128 | 129 | 130 | if __name__ == "__main__": 131 | _main(get_parser_calibrate_generate_modified_base_stats().parse_args()) 132 | -------------------------------------------------------------------------------- /megalodon_extras/calibrate_modified_bases.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | 4 | if True: 5 | # Agg appears to be the most robust backend when only saving plots.
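    # the `if True:` guard keeps automated import sorting from hoisting the
    # pyplot import above this backend selection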
6 | matplotlib.use("Agg") 7 | import matplotlib.pyplot as plt 8 | from matplotlib.backends.backend_pdf import PdfPages 9 | 10 | from megalodon import calibration, logging, megalodon_helper as mh, mods 11 | from ._extras_parsers import get_parser_calibrate_modified_bases 12 | 13 | 14 | LOGGER = logging.get_logger() 15 | PROB_COLORS = ("#bcbddc", "#807dba", "#6a51a3") 16 | 17 | 18 | def plot_calib( 19 | pdf_fp, 20 | mod_base, 21 | smooth_ls, 22 | s_ref, 23 | sm_ref, 24 | s_alt, 25 | sm_alt, 26 | mono_prob, 27 | prob_alt, 28 | prob_threshs, 29 | add_prob_thresh, 30 | ): 31 | f, axarr = plt.subplots(3, sharex=True, figsize=(11, 7)) 32 | axarr[0].plot(smooth_ls, s_ref, color="orange") 33 | axarr[0].plot(smooth_ls, sm_ref, color="red") 34 | axarr[0].plot(smooth_ls, s_alt, color="grey") 35 | axarr[0].plot(smooth_ls, sm_alt, color="blue") 36 | axarr[0].set_ylabel( 37 | "Probability Density\nred/orange=canonical\nblue/grey=modified" 38 | ) 39 | axarr[0].set_title(mod_base + " Calibration") 40 | axarr[1].plot(smooth_ls, mono_prob, color="orange") 41 | axarr[1].plot(smooth_ls, 1 / (np.exp(smooth_ls) + 1), color="purple") 42 | axarr[1].set_ylabel( 43 | "Empirical Modified\nProbability\norange=calibrated\npurple=raw" 44 | ) 45 | axarr[2].plot(smooth_ls, np.log((1 - prob_alt) / prob_alt), color="red") 46 | axarr[2].plot( 47 | smooth_ls, np.log((1 - mono_prob) / mono_prob), color="orange" 48 | ) 49 | axarr[2].set_ylabel("Calibrated LLR\norange=monotonic") 50 | axarr[2].set_xlabel("Theoretical LLR (NN Score)") 51 | if add_prob_thresh: 52 | # indicate the cutoff points for several common cutoff locations 53 | thresh_f = np.log((1 - mono_prob) / mono_prob) 54 | for p, col in zip(prob_threshs, PROB_COLORS): 55 | llr_x = np.log(p / (1 - p)) 56 | thresh_val = np.argmin(np.abs(thresh_f - llr_x)) 57 | nthresh_val = np.argmin(np.abs(thresh_f + llr_x)) 58 | prop_filt = ( 59 | sum(sm_ref[nthresh_val:thresh_val]) 60 | + sum(sm_alt[nthresh_val:thresh_val]) 61 | ) / (sum(sm_ref) + sum(sm_alt)) 62 | for i in range(2): 63 | axarr[i].axvline(x=smooth_ls[thresh_val], color=col) 64 | axarr[i].axvline(x=smooth_ls[nthresh_val], color=col) 65 | axarr[2].axvline(x=smooth_ls[thresh_val], color=col) 66 | axarr[2].axvline( 67 | x=smooth_ls[nthresh_val], 68 | color=col, 69 | label=("--mod-binary-threshold={} (filters {:.0f}%)").format( 70 | p, 100 * prop_filt 71 | ), 72 | ) 73 | axarr[2].legend(fontsize="small") 74 | 75 | pdf_fp.savefig(bbox_inches="tight") 76 | plt.close() 77 | 78 | 79 | def extract_llrs(llr_fn): 80 | llrs_data = np.load(llr_fn) 81 | mod_bases = llrs_data[mods.GT_ALL_MOD_BASE_STR] 82 | mod_base_llrs = {} 83 | for mod_base in mod_bases: 84 | mod_base_llrs[mod_base] = ( 85 | llrs_data[mods.GT_MOD_LLR_STR.format(mod_base)], 86 | llrs_data[mods.GT_CAN_LLR_STR.format(mod_base)], 87 | ) 88 | 89 | return mod_base_llrs 90 | 91 | 92 | def _main(args): 93 | logging.init_logger() 94 | mh.prep_out_fn(args.out_filename, args.overwrite) 95 | 96 | LOGGER.info("Parsing log-likelihood ratios") 97 | mod_base_llrs = extract_llrs(args.ground_truth_llrs) 98 | 99 | pdf_fp = None if args.out_pdf is None else PdfPages(args.out_pdf) 100 | save_kwargs = {} 101 | for mod_base, (mod_llrs, can_llrs) in mod_base_llrs.items(): 102 | LOGGER.info("Computing {} modified base calibration.".format(mod_base)) 103 | mod_calib, mod_llr_range, plot_data = calibration.compute_calibration( 104 | can_llrs, 105 | mod_llrs, 106 | args.max_input_llr, 107 | args.num_calibration_values, 108 | args.smooth_bandwidth, 109 | args.min_density, 110 |
args.diff_epsilon, 111 | args.llr_clip_buffer, 112 | pdf_fp is not None, 113 | num_proc=args.processes, 114 | ) 115 | save_kwargs[mod_base + calibration.LLR_RANGE_SUFFIX] = mod_llr_range 116 | save_kwargs[mod_base + calibration.CALIB_TABLE_SUFFIX] = mod_calib 117 | if pdf_fp is not None: 118 | plot_calib( 119 | pdf_fp, 120 | mod_base, 121 | *plot_data, 122 | args.pdf_prob_thresholds, 123 | not args.plot_without_prob_thresholds, 124 | ) 125 | if pdf_fp is not None: 126 | pdf_fp.close() 127 | 128 | # save calibration table for reading into mod calibration table 129 | LOGGER.info("Saving calibrations to file.") 130 | mod_bases = list(mod_base_llrs.keys()) 131 | np.savez( 132 | args.out_filename, 133 | stratify_type=calibration.MOD_BASE_STRAT_TYPE, 134 | smooth_nvals=args.num_calibration_values, 135 | mod_bases=mod_bases, 136 | **save_kwargs, 137 | ) 138 | 139 | 140 | if __name__ == "__main__": 141 | _main(get_parser_calibrate_modified_bases().parse_args()) 142 | -------------------------------------------------------------------------------- /megalodon_extras/variants_heterozygous_factor.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import pysam 4 | import numpy as np 5 | 6 | from megalodon import logging 7 | from ._extras_parsers import get_parser_variants_heterozygous_factor 8 | 9 | 10 | LOGGER = logging.get_logger() 11 | 12 | HOM_REF_TXT = "hom_ref" 13 | HET_TXT = "het" 14 | HOM_ALT_TXT = "hom_alt" 15 | 16 | SNP_TXT = "SNP" 17 | DEL_TXT = "DEL" 18 | INS_TXT = "INS" 19 | 20 | STAT_WIDTH = 12 21 | STATS_FMT_STR = "{:<" + str(STAT_WIDTH) + "}" 22 | FLOAT_FMT_STR = "{:<" + str(STAT_WIDTH) + ".4f}" 23 | STAT_NAMES = ("HomRef", "Het", "HomAlt", "F1", "Precision", "Recall") 24 | N_STATS = len(STAT_NAMES) 25 | N_INT_STATS = 3 26 | N_FLOAT_STATS = N_STATS - N_INT_STATS 27 | HEADER_TMPLT = STATS_FMT_STR * (N_STATS + 1) 28 | STATS_TMPLT = STATS_FMT_STR * (N_INT_STATS + 1) + FLOAT_FMT_STR * N_FLOAT_STATS 29 | 30 | 31 | def _main(args): 32 | def conv_call_str(gt_vals): 33 | gt_set = set(gt_vals) 34 | if gt_set == set([0]): 35 | return HOM_REF_TXT 36 | elif gt_set == set([0, 1]): 37 | return HET_TXT 38 | return HOM_ALT_TXT 39 | 40 | logging.init_logger() 41 | gt_calls = defaultdict(dict) 42 | for variant in pysam.VariantFile(args.ground_truth_variants).fetch(): 43 | # skip multi-allelic sites 44 | if variant.alts is None or len(variant.alts) > 1: 45 | continue 46 | if len(variant.ref) == len(variant.alts[0]): 47 | gt_calls[SNP_TXT][ 48 | (variant.contig, variant.pos, variant.ref, variant.alts[0]) 49 | ] = conv_call_str(variant.samples.values()[0]["GT"]) 50 | elif len(variant.ref) > len(variant.alts[0]): 51 | gt_calls[DEL_TXT][ 52 | (variant.contig, variant.pos, variant.ref, variant.alts[0]) 53 | ] = conv_call_str(variant.samples.values()[0]["GT"]) 54 | else: 55 | gt_calls[INS_TXT][ 56 | (variant.contig, variant.pos, variant.ref, variant.alts[0]) 57 | ] = conv_call_str(variant.samples.values()[0]["GT"]) 58 | mega_calls = defaultdict(dict) 59 | for variant in pysam.VariantFile(args.megalodon_variants).fetch(): 60 | # skip multi-allelic sites 61 | if len(variant.alts) > 1: 62 | continue 63 | if len(variant.ref) == len(variant.alts[0]): 64 | mega_calls[SNP_TXT][ 65 | (variant.contig, variant.pos, variant.ref, variant.alts[0]) 66 | ] = conv_call_str(variant.samples.values()[0]["GT"]) 67 | elif len(variant.ref) > len(variant.alts[0]): 68 | mega_calls[DEL_TXT][ 69 | (variant.contig, variant.pos, variant.ref, variant.alts[0])
70 | ] = conv_call_str(variant.samples.values()[0]["GT"]) 71 | else: 72 | mega_calls[INS_TXT][ 73 | (variant.contig, variant.pos, variant.ref, variant.alts[0]) 74 | ] = conv_call_str(variant.samples.values()[0]["GT"]) 75 | 76 | for var_type in (SNP_TXT, DEL_TXT, INS_TXT): 77 | counts = defaultdict(int) 78 | for chrm_pos_ref_alt in set(gt_calls[var_type]).intersection( 79 | mega_calls[var_type] 80 | ): 81 | counts[ 82 | ( 83 | gt_calls[var_type][chrm_pos_ref_alt], 84 | mega_calls[var_type][chrm_pos_ref_alt], 85 | ) 86 | ] += 1 87 | 88 | # compute F1 stat 89 | vt_stats = [] 90 | for truth_type in (HOM_REF_TXT, HET_TXT, HOM_ALT_TXT): 91 | gt_count = sum( 92 | counts[(truth_type, mega_call)] 93 | for mega_call in (HOM_REF_TXT, HET_TXT, HOM_ALT_TXT) 94 | ) 95 | mega_count = sum( 96 | counts[(gt_call, truth_type)] 97 | for gt_call in (HOM_REF_TXT, HET_TXT, HOM_ALT_TXT) 98 | ) 99 | if gt_count == 0 or mega_count == 0: 100 | vt_stats.append((np.NAN, np.NAN, np.NAN)) 101 | else: 102 | prec = counts[(truth_type, truth_type)] / mega_count 103 | recall = counts[(truth_type, truth_type)] / gt_count 104 | vt_stats.append( 105 | (2 * (prec * recall) / (prec + recall), prec, recall) 106 | ) 107 | 108 | # print output 109 | LOGGER.info(var_type) 110 | LOGGER.info(HEADER_TMPLT.format("Truth\tCalls", *STAT_NAMES)) 111 | for truth, (f1, prec, recall) in zip( 112 | (HOM_REF_TXT, HET_TXT, HOM_ALT_TXT), vt_stats 113 | ): 114 | LOGGER.info( 115 | STATS_TMPLT.format( 116 | truth, 117 | counts[(truth, HOM_REF_TXT)], 118 | counts[(truth, HET_TXT)], 119 | counts[(truth, HOM_ALT_TXT)], 120 | f1, 121 | prec, 122 | recall, 123 | ) 124 | ) 125 | mean_f1_fmt = ( 126 | "{:>" 127 | + str(STAT_WIDTH * (N_STATS - 2)) 128 | + "}" 129 | + FLOAT_FMT_STR * N_FLOAT_STATS 130 | + "\n" 131 | ) 132 | mean_stats = map(np.nanmean, zip(*vt_stats)) 133 | LOGGER.info(mean_f1_fmt.format("Mean Stats: ", *mean_stats)) 134 | 135 | 136 | if __name__ == "__main__": 137 | _main(get_parser_variants_heterozygous_factor().parse_args()) 138 | -------------------------------------------------------------------------------- /megalodon_extras/modified_bases_split_by_motif.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | import pysam 4 | from tqdm import tqdm 5 | 6 | from megalodon import backends, logging, mods, megalodon_helper as mh 7 | from ._extras_parsers import get_parser_modified_bases_split_calls_by_motif 8 | 9 | 10 | LOGGER = logging.get_logger() 11 | 12 | MOTIF_INFO = namedtuple( 13 | "MOTIF_INFO", 14 | ("bases_before", "bases_after", "raw_motif", "motif", "rc_motif"), 15 | ) 16 | 17 | 18 | ######################## 19 | # data table functions # 20 | ######################## 21 | 22 | 23 | def split_data(in_mods_db, out_mods_dbs, ref): 24 | LOGGER.info("Inserting modified base data") 25 | bar = tqdm( 26 | desc="Inserting Data", 27 | unit="per-read calls", 28 | total=in_mods_db.get_num_uniq_stats(), 29 | smoothing=0, 30 | dynamic_ncols=True, 31 | ) 32 | curr_ref_seq = curr_chrm = None 33 | # TODO multiprocess over contigs (need to implement iteration over range 34 | # of pos_dbids via chrm string) 35 | for pos_dbid, pos_mod_data in in_mods_db.iter_pos_scores(): 36 | bar.update(len(pos_mod_data)) 37 | chrm, strand, pos = in_mods_db.get_pos(pos_dbid) 38 | if chrm != curr_chrm: 39 | curr_chrm = chrm 40 | curr_ref_seq = ref.fetch(chrm) 41 | for out_mods_db, motif_info in out_mods_dbs: 42 | motif_match = ( 43 | motif_info.motif.match( 44 | curr_ref_seq[ 45 | pos 46 | 
- motif_info.bases_before : pos 47 | + motif_info.bases_after 48 | + 1 49 | ] 50 | ) 51 | if strand == 1 52 | else motif_info.rc_motif.match( 53 | curr_ref_seq[ 54 | pos 55 | - motif_info.bases_after : pos 56 | + motif_info.bases_before 57 | + 1 58 | ] 59 | ) 60 | ) 61 | if motif_match is not None: 62 | pos_insert_data = [ 63 | (lp, pos_dbid, mod_dbid, read_dbid) 64 | for read_dbid, mod_dbid, lp in pos_mod_data 65 | ] 66 | out_mods_db.insert_batch_data(pos_insert_data) 67 | break 68 | bar.close() 69 | 70 | 71 | ########## 72 | # motifs # 73 | ########## 74 | 75 | 76 | def parse_motifs(raw_motifs): 77 | motifs = [] 78 | for raw_motif, bases_before in raw_motifs: 79 | bases_before = int(bases_before) 80 | bases_after = len(raw_motif) - bases_before - 1 81 | motif = mh.compile_motif_pat(raw_motif) 82 | rc_motif = mh.compile_rev_comp_motif_pat(raw_motif) 83 | motifs.append( 84 | MOTIF_INFO( 85 | bases_before=bases_before, 86 | bases_after=bases_after, 87 | raw_motif=raw_motif, 88 | motif=motif, 89 | rc_motif=rc_motif, 90 | ) 91 | ) 92 | 93 | return motifs 94 | 95 | 96 | ######## 97 | # main # 98 | ######## 99 | 100 | 101 | def _main(args): 102 | logging.init_logger(args.megalodon_directory, out_suffix=args.output_suffix) 103 | 104 | # parse motifs 105 | motifs = parse_motifs(args.motif) 106 | # open indexed FASTA reference 107 | ref = pysam.FastaFile(args.reference) 108 | 109 | LOGGER.info("Extracting mods and chrms from input database") 110 | in_mods_db = mods.ModsDb( 111 | mh.get_megalodon_fn(args.megalodon_directory, mh.PR_MOD_NAME) 112 | ) 113 | alphabet, _, mod_long_names = in_mods_db.get_alphabet_info() 114 | ref_names_and_lens = list(zip(*in_mods_db.iter_chrms()))[1:] 115 | LOGGER.info("Extracting read uuid table") 116 | in_uuids = [uuid for _, uuid in in_mods_db.iter_uuids()] 117 | 118 | LOGGER.info("Opening new per-read modified base statistics databases") 119 | model_info = backends.DetachedModelInfo( 120 | alphabet=alphabet, mod_long_names=mod_long_names 121 | ) 122 | out_mods_dbs = [] 123 | for motif_info in motifs: 124 | out_dir = "{}.{}_{}".format( 125 | args.output_prefix, motif_info.raw_motif, motif_info.bases_before 126 | ) 127 | mh.mkdir(out_dir, overwrite=False) 128 | mods_info = mods.ModInfo(model_info, out_dir=out_dir) 129 | mods.init_mods_db(mods_info, ref_names_and_lens) 130 | out_mods_dbs.append( 131 | (mods.ModsDb(mods_info.mods_db_fn, read_only=False), motif_info) 132 | ) 133 | out_mods_dbs[-1][0].insert_uuids(in_uuids) 134 | out_mods_dbs[-1][0].commit() 135 | 136 | # commit so read uuids are available to worker processes 137 | LOGGER.info("Inserting per-read calls from input databases") 138 | split_data(in_mods_db, out_mods_dbs, ref) 139 | 140 | # TODO do this in separate processes 141 | LOGGER.info( 142 | "Creating data covering indices for efficient iteration by position" 143 | ) 144 | for out_mods_db, _ in out_mods_dbs: 145 | out_mods_db.create_data_covering_index() 146 | out_mods_db.commit() 147 | out_mods_db.close() 148 | LOGGER.info("Finished indexing {}".format(out_mods_db.fn)) 149 | 150 | 151 | if __name__ == "__main__": 152 | _main(get_parser_modified_bases_split_calls_by_motif().parse_args()) 153 | -------------------------------------------------------------------------------- /docs/extras_variants.rst: -------------------------------------------------------------------------------- 1 | ***************************** 2 | ``megalodon_extras variants`` 3 | ***************************** 4 | 5 | The ``megalodon_extras variants`` command group contains various
commands related to sequence variant processing within Megalodon. 6 | 7 | ------------------------------------- 8 | ``megalodon_extras variants atomize`` 9 | ------------------------------------- 10 | 11 | This command processes each variant in a VCF file to convert the variants to their atomic form. 12 | When the variant file produced by this command is used in the ``megalodon`` command with the ``--variants-are-atomized`` flag, processing speed can be drastically increased. 13 | This is especially true for high-density variant files. 14 | 15 | For each variant processed within Megalodon, the "atomic" form is required to produce the highest quality results. 16 | For single nucleotide variants (SNVs) the atomic form simply reduces any multi-nucleotide variant into its component single-nucleotide swaps. 17 | For insertions and deletions (indels) the atomic form removes context bases and expands the indel to include all unambiguous positions. 18 | For example, consider the reference sequence ``AGCGCA``. 19 | An insertion of ``GC`` after the first base could be validly annotated in a VCF file as reference allele ``A`` and alternative allele ``AGC``. 20 | But the atomic form of this variant would be reference allele ``GCGC`` and alternative allele ``GCGCGC``. 21 | In this way atomic variants capture all reference bases impacted by a variant and no more. 22 | Processing variants in this way ensures that the correct stretch of sequence and raw signal is considered when determining the correct allele within Megalodon. 23 | 24 | The processing of each variant to its atomic form must be completed each time a variant overlaps a read since VCF files can often be too large to store in RAM. 25 | Thus the computational cost for high coverage or dense variant files can be quite high for this step of the processing. 26 | This command allows this atomizing step to be performed once before starting a ``megalodon`` run. 27 | When passing a variant file processed with this command, the ``--variants-are-atomized`` flag should be specified. 28 | Note that this flag should not be used with VCF files originating from other sources. 29 | Specifically, this command adds a non-standard ``CB`` (context base) VCF flag in order to indicate that a context base added to a variant is not part of the atomic form of that variant. 30 | 31 | ------------------------------------- 32 | ``megalodon_extras variants resolve`` 33 | ------------------------------------- 34 | 35 | The ``megalodon_extras variants resolve`` command resolves conflicting variants and removes variants determined to be the reference allele. 36 | 37 | Megalodon processes sequence variants to make a call for each variant individually (taking nearby variants into account). 38 | Nearby variants are often overlapping and thus incompatible. 39 | The primary use for this command is to resolve situations where multiple overlapping variants are called. 40 | At each site with overlapping variants, the one with the highest probability is selected for the output file. 41 | 42 | In addition, this command has options to filter variants which were called as the reference allele (``--max-likelihood-ratio``), filter for coverage (``--min-depth``), and revert atomic variant notation (``--trim-variants``). 43 | 44 | There are also a number of options to inspect potential systematic bias in identified sequence variants. 45 | Providing a VCF file called only from reverse strand mapping reads via the ``--reverse-strand-variants`` argument activates this output.
46 | The main VCF provided is then assumed to be derived from forward strand mapping reads only. 47 | In this mode, variants are output when they are identified only on one strand and not the other to allow analysis of potential bias in basecalling models. 48 | This feature is experimental and does not have defined pipelines for downstream use. 49 | 50 | ------------------------------------------------- 51 | ``megalodon_extras variants heterozygous_factor`` 52 | ------------------------------------------------- 53 | 54 | Determine the effect of the heterozygous factor on the balance of homozygous and heterozygous variant calls. 55 | 56 | This command can assist in setting an optimal value for the ``--heterozygous-factors`` argument. 57 | The default value is intended to minimize the number of false homozygous calls. 58 | The recommended phased variant pipeline for Megalodon processes variants such that if a variant is not initially called heterozygous, a heterozygous call cannot be made, but heterozygous calls can be converted to homozygous calls. 59 | Since this phased variant pipeline is recommended in order to obtain the highest quality variants, false homozygous calls are minimized. 60 | 61 | If the aim is to achieve a balance of homozygous and heterozygous calls, this command can be used to evaluate this balance for a particular ``--heterozygous-factors`` setting. 62 | This command will output the number of homozygous and heterozygous calls compared to their ground truth from a provided set of variants. 63 | As a guide, previous Megalodon versions used a default of ``--heterozygous-factors 2.1 1.6``, which achieves a better balance than the current default of ``--heterozygous-factors 1.0 1.0``, which minimizes false homozygous calls. 64 | 65 | -------------------------------------------- 66 | ``megalodon_extras variants index_database`` 67 | -------------------------------------------- 68 | 69 | This command is not currently implemented, but will be in a future release. 70 | -------------------------------------------------------------------------------- /docs/common_arguments.rst: -------------------------------------------------------------------------------- 1 | **************** 2 | Common Arguments 3 | **************** 4 | 5 | ----------------- 6 | Required Argument 7 | ----------------- 8 | 9 | - ``fast5s_dir`` 10 | 11 | - Path to directory containing raw FAST5-format nanopore reads. 12 | - Both single and multi FAST5 formats are supported. 13 | - Default searches recursively for fast5 read files. To search only one level, specify ``--not-recursive``. 14 | 15 | ---------------------- 16 | Guppy Backend Argument 17 | ---------------------- 18 | 19 | - ``--guppy-config`` 20 | 21 | - Guppy config. 22 | - Default: ``dna_r9.4.1_450bps_modbases_5mc_hac.cfg`` 23 | 24 | - ``--guppy-server-path`` 25 | 26 | - Path to guppy server executable. 27 | - Default: ``./ont-guppy/bin/guppy_basecall_server`` 28 | 29 | ---------------- 30 | Output Arguments 31 | ---------------- 32 | 33 | - ``--live-processing`` 34 | 35 | - As of version 2.2, Megalodon now supports live run processing. 36 | - Activate live processing mode by simply adding the ``--live-processing`` argument and specifying the MinKNOW output directory as the input FAST5 directory. 37 | - Megalodon will continue to search for FAST5s until the ``final_summary*`` file is created by MinKNOW, indicating data production has completed. 38 | - ``--outputs`` 39 | 40 | - Specify desired outputs.
41 | - Options are ``basecalls``, ``mod_basecalls``, ``mappings``, ``variant_mappings``, ``mod_mappings``, ``per_read_variants``, ``per_read_mods``, ``variants``, and ``mods``. 42 | 43 | - ``mod_basecalls`` are output in a BAM file via the ``Mm`` and ``Ml`` tags `described by hts-specs here `_. 44 | - ``variant_mappings`` are intended for obtaining highly accurate phased variant genotypes, but also provide a nice genome browser visualization of per-read variant calls. 45 | 46 | - These mappings contain reference sequence at all positions except for per-read called variants. The base quality scores encode the likelihood for that reference anchored variant for use in the ``whatshap`` phasing algorithm. 47 | - ``mod_mappings`` provide reference-anchored per-read modified base calls. 48 | 49 | - As of version 2.2, the default output uses the ``Mm`` and ``Ml`` hts-specs tags (see above) with all modified bases in one output file. 50 | - Specify the ``--mod-map-emulate-bisulfite`` option to output one BAM per modified base with modified bases converted using ``--mod-map-base-conv`` 51 | 52 | - This file is useful for visualizing per-read modified base calls (e.g. IGV bisulfite mode for CpG calls). 53 | - This file may also allow a port to standard bisulfite pipelines that are capable of processing long reads. 54 | - Default output is ``basecalls`` only. 55 | - ``--output-directory`` 56 | 57 | - Specify the directory to output results. 58 | Default: ``megalodon_results`` 59 | - ``--overwrite`` 60 | 61 | - Overwrite the ``--output-directory`` if it exists. 62 | - Note that this is a recursive file deletion and should be used with caution. 63 | 64 | ----------------- 65 | Mapping Arguments 66 | ----------------- 67 | 68 | - ``--mappings-format`` 69 | 70 | - Format for ``mapping`` output. 71 | - Options include ``bam`` (default), ``cram``, and ``sam``. 72 | - As of version 2.2, mappings are no longer sorted by default. 73 | 74 | - Set ``--sort-mappings`` to sort mappings. If ``samtools`` is not in ``$PATH`` provide path to executable via the ``--samtools-executable`` argument. 75 | - ``--reference`` 76 | 77 | - Reference genome or transcriptome in FASTA or minimap2 index format. 78 | 79 | - If ``--reference`` is a minimap2 index and ``--mappings-format`` is ``cram``, provide FASTA reference via ``--cram-reference``. 80 | 81 | -------------------------- 82 | Sequence Variant Arguments 83 | -------------------------- 84 | 85 | - ``--haploid`` 86 | 87 | - Compute sequence variants assuming a haploid reference. Default: diploid 88 | - ``--variant-filename`` 89 | 90 | - File containing putative variants in VCF/BCF format. 91 | 92 | - Variants file must be sorted. 93 | - If variant file is not compressed and indexed this will be performed before further processing. 94 | - Variants must be matched to the ``--reference`` provided. 95 | 96 | ----------------------- 97 | Modified Base Arguments 98 | ----------------------- 99 | 100 | - ``--mod-motif`` 101 | 102 | - Restrict modified base results to the specified motifs. 103 | - This argument takes 3 values representing: 104 | 105 | 1. Modified base single letter codes (see ``megalodon_extras modified_bases describe_alphabet`` command) 106 | 2. Canonical sequence motif (may contain `ambiguity codes `_) 107 | 3. Relative position (0-based) of the modified base within the canonical sequence motif 108 | - Multiple ``--mod-motif`` arguments can be provided to a single ``megalodon`` command.
109 | - If not provided (and ``per_read_mods`` or ``mods`` outputs requested) all relevant sites are tested (e.g. all ``C`` bases for ``5mC``). 110 | 111 | - Note that restricting calls to motifs of interest saves computationally expensive processing; the restriction is applied during calling and is more than a simple post-processing filter. 112 | 113 | -------------------------- 114 | Compute Resource Arguments 115 | -------------------------- 116 | 117 | - ``--processes`` 118 | 119 | - Number of CPU read-processing workers to spawn. 120 | - ``--devices`` 121 | 122 | - GPU devices to use for basecalling acceleration. 123 | - If not provided CPU basecalling will be performed. 124 | - Device names can be provided in the following formats: ``0``, ``cuda0`` or ``cuda:0``. 125 | - Multiple devices can be specified separated by a space. 126 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ************************************* 2 | Welcome to Megalodon's documentation! 3 | ************************************* 4 | 5 | Megalodon is a research command line tool to extract high accuracy modified base and sequence variant calls from raw nanopore reads by anchoring the information rich basecalling neural network output to a reference genome/transcriptome. 6 | 7 | Raw nanopore reads are processed by a single command to produce basecalls (FASTA/Q), reference mappings (SAM/BAM/CRAM), modified base calls (per-read and aggregated per-reference site), sequence variant calls (per-read and aggregated per-reference site) and more. 8 | 9 | ------------- 10 | Prerequisites 11 | ------------- 12 | 13 | The primary Megalodon run mode requires the Guppy basecaller (version >= 4.0). 14 | See the `community page for download/installation instructions [login required] `_. 15 | 16 | Megalodon is a python-based command line software package. 17 | Given a python (version >= 3.5) installation, all other requirements are handled by ``pip`` or ``conda``. 18 | 19 | .. 20 | 21 | `Taiyaki `_ is no longer required to run Megalodon, but installation is required for two specific run modes: 22 | 23 | 1) output mapped signal files (for basecall model training) 24 | 25 | 2) running the Taiyaki basecalling backend (for neural network designs including experimental layers) 26 | 27 | ------------ 28 | Installation 29 | ------------ 30 | 31 | ``pip`` is recommended for Megalodon installation. 32 | 33 | :: 34 | 35 | pip install megalodon 36 | 37 | ``conda`` installation is available, but not fully supported. 38 | ``ont_pyguppy_client_lib`` is not available on conda and thus must be installed with ``pip``. 39 | 40 | :: 41 | 42 | conda install megalodon 43 | pip install ont_pyguppy_client_lib 44 | 45 | To install from github source for development, the following commands can be run. 46 | 47 | :: 48 | 49 | git clone https://github.com/nanoporetech/megalodon 50 | pip install -e megalodon/ 51 | 52 | It is recommended that Megalodon be installed in a controlled compute environment. 53 | See `the python documentation for preparing virtual environments `_. 54 | 55 | =========== 56 | Quick Start 57 | =========== 58 | 59 | Megalodon must obtain the intermediate output from the basecall neural network. 60 | Guppy (production nanopore basecalling software) is the recommended backend to obtain this output from raw nanopore signal (from FAST5 files).
61 | Nanopore basecalling is compute intensive and thus it is highly recommended that GPU resources are specified (``--devices``) for optimal Megalodon performance. 62 | 63 | Megalodon is accessed via the ``megalodon`` command line interface. 64 | 65 | :: 66 | 67 | # megalodon help (common args) 68 | megalodon -h 69 | # megalodon help (advanced args) 70 | megalodon --help-long 71 | 72 | # Example command to output basecalls, mappings, and 5mC CpG methylation in both per-read (``mod_mappings``) and aggregated (``mods``) formats 73 | # Compute settings: GPU devices 0 and 1 with 40 CPU cores 74 | megalodon \ 75 | raw_fast5s/ \ 76 | --outputs basecalls mappings mod_mappings mods \ 77 | --reference reference.fa --mod-motif m CG 0 \ 78 | --devices 0 1 --processes 40 79 | 80 | This command produces the ``megalodon_results`` output directory containing all requested output files and logs. 81 | The format for common outputs is described briefly below and in more detail in the `full documentation `_. 82 | 83 | The above command uses the modified base model included in Guppy. 84 | As of the ``2.3.0`` Megalodon release (March 2021), the models included with Guppy (``4.5.2``) are the most accurate modified base models available. 85 | As more accurate basecalling models are trained, they are first released into the `Rerio repository for research models `_. 86 | Once training pipelines are more thoroughly standardized and tested, models will be transferred into Guppy. 87 | The code below shows how to obtain and run the R9.4.1, MinION/GridION, 5mC CpG model from Rerio. 88 | Note that this is the same model now included in Guppy ``4.5.2``. 89 | 90 | :: 91 | 92 | # Obtain and run R9.4.1, MinION, 5mC CpG model from Rerio 93 | git clone https://github.com/nanoporetech/rerio 94 | rerio/download_model.py rerio/basecall_models/res_dna_r941_min_modbases_5mC_CpG_v001 95 | megalodon \ 96 | raw_fast5s/ \ 97 | --guppy-params "-d ./rerio/basecall_models/" \ 98 | --guppy-config res_dna_r941_min_modbases_5mC_CpG_v001.cfg \ 99 | --outputs basecalls mappings mod_mappings mods \ 100 | --reference reference.fa --mod-motif m CG 0 \ 101 | --devices 0 1 --processes 40 102 | 103 | .. 104 | 105 | The path to the ``guppy_basecall_server`` executable is required to run Megalodon. 106 | By default, Megalodon assumes Guppy (Linux GPU) is installed in the current working directory (i.e. ``./ont-guppy/bin/guppy_basecall_server``). 107 | Use the ``--guppy-server-path`` argument to specify a different path. 108 | 109 | -------- 110 | Contents 111 | -------- 112 | 113 | .. toctree:: 114 | :maxdepth: 2 115 | 116 | algorithm_details 117 | common_arguments 118 | advanced_arguments 119 | computing_considerations 120 | variant_phasing 121 | file_formats 122 | model_training 123 | modbase_training 124 | extras_aggregate 125 | extras_calibrate 126 | extras_merge 127 | extras_modified_bases 128 | extras_phase_variants 129 | extras_per_read_text 130 | extras_validate 131 | extras_variants 132 | -------------------------------------------------------------------------------- /docs/extras_modified_bases.rst: -------------------------------------------------------------------------------- 1 | *********************************** 2 | ``megalodon_extras modified_bases`` 3 | *********************************** 4 | 5 | The ``megalodon_extras modified_bases`` command group contains various commands related to modified base processing within Megalodon.
6 | 7 | ----------------------------------------------------- 8 | ``megalodon_extras modified_bases describe_alphabet`` 9 | ----------------------------------------------------- 10 | 11 | Describe the alphabet, including modified bases, found in a given model. 12 | 13 | This command is useful to determine the syntax for specifying arguments related to modified base detection. 14 | 15 | Note that originally modified bases were specified by arbitrary values, but recent and future models will attempt to follow single letter codes specified by `samtools hts-spec (currently an open issue) `_. 16 | 17 | A minimal subset of the model specifications from the main ``megalodon`` command is available to specify the model exactly as in the main command. 18 | The model will be loaded as normal and the alphabet used will be printed. 19 | 20 | ------------------------------------------------------ 21 | ``megalodon_extras modified_bases estimate_threshold`` 22 | ------------------------------------------------------ 23 | 24 | Estimate the optimal global modified base score threshold such that the estimated proportion of bases modified is achieved when all sites are called. 25 | This command is useful when producing the ``signal_mappings`` or ``per_read_refs`` outputs with modified bases annotated for basecaller training with Taiyaki. 26 | 27 | This command works by estimating the proportion of bases modified from a sample using the most extreme calls (by default the most extreme 8%; set with the ``--mod-percentile`` option) as a truth set. 28 | This method assumes that the distribution of modified base and canonical scores is approximately balanced. 29 | This may not be true for all models. 30 | The plots produced by the ``megalodon_extras calibrate modified_bases`` command can assist in making this determination. 31 | 32 | ---------------------------------------------------------------- 33 | [DEPRECATED] ``megalodon_extras modified_bases update_database`` 34 | ---------------------------------------------------------------- 35 | 36 | This method has not been updated for the most recent modified base schema and thus is currently deprecated. 37 | 38 | ------------------------------------------------------- 39 | ``megalodon_extras modified_bases create_ground_truth`` 40 | ------------------------------------------------------- 41 | 42 | Create a set of ground truth sites from a bedmethyl file. 43 | This is a convenience command to apply a threshold to observed fractions of modified bases from bedmethyl files produced by Megalodon or other software. 44 | 45 | The ``--strand-offset`` option is provided to allow calls on opposite strands to be combined. 46 | For example, forward and reverse strand CpG calls can be merged by setting ``--strand-offset 1`` since the reverse strand position ``1`` downstream of the forward strand position corresponds to the same biological methylation event. 47 | 48 | ---------------------------------------------------- 49 | ``megalodon_extras modified_bases create_motif_bed`` 50 | ---------------------------------------------------- 51 | 52 | This is a helper command to take a reference FASTA file and produce a BED file with all the locations of a motif of interest. 53 | This can be useful for a number of modified base pipelines.
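
As a rough sketch of the kind of output this command produces, the snippet below scans a reference with ``pysam`` and writes one 0-based BED record per motif occurrence on each strand. This is only an illustration under stated assumptions (a truncated IUPAC table and invented helper names), not Megalodon's implementation::

    import re
    import pysam

    # subset of IUPAC ambiguity codes, sufficient for e.g. CG or GATC motifs
    IUPAC = {"A": "A", "C": "C", "G": "G", "T": "T", "R": "[AG]",
             "Y": "[CT]", "S": "[CG]", "W": "[AT]", "N": "[ACGT]"}
    COMP = str.maketrans("ACGTRYSWN", "TGCAYRSWN")

    def write_motif_bed(fasta_fn, motif, rel_pos, out_fp):
        """Write a BED line for each (possibly overlapping) motif match."""
        # zero-width lookahead so overlapping matches are all reported
        fwd = re.compile("(?=" + "".join(IUPAC[b] for b in motif) + ")")
        rc_motif = motif.translate(COMP)[::-1]
        rev = re.compile("(?=" + "".join(IUPAC[b] for b in rc_motif) + ")")
        rev_rel = len(motif) - rel_pos - 1
        with pysam.FastaFile(fasta_fn) as ref:
            for chrm in ref.references:
                seq = ref.fetch(chrm).upper()
                for m in fwd.finditer(seq):
                    p = m.start() + rel_pos
                    out_fp.write(f"{chrm}\t{p}\t{p + 1}\t.\t.\t+\n")
                for m in rev.finditer(seq):
                    p = m.start() + rev_rel
                    out_fp.write(f"{chrm}\t{p}\t{p + 1}\t.\t.\t-\n")

For a CpG motif (``motif="CG"``, ``rel_pos=0``) the reverse-complement pattern is again ``CG``, so each CpG site yields one forward and one reverse strand record offset by one base.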
54 | 55 | ------------------------------------------------------- 56 | ``megalodon_extras modified_bases per_site_thresholds`` 57 | ------------------------------------------------------- 58 | 59 | See the `ground truth aided bootstrap modified base annotation `_ tutorial for more complete instructions on using this command. 60 | 61 | This command is targeted at creating higher accuracy modified base training data sets (mapped signal files) from a preliminary modified base model and a fractionally (at each reference site) modified sample. 62 | This command takes as input a Megalodon run with ``--outputs per_read_mods mappings`` and a ground truth bedmethyl file and produces a custom modified base threshold at each reference site. 63 | These modified base thresholds will match the Megalodon modified base statistics with the ground truth fraction of modified bases at each reference site. 64 | The derived thresholds should then be supplied to Megalodon via the ``--mod-per-site-threshold`` argument along with ``--outputs signal_mappings``. 65 | 66 | This command will also output a low coverage BED file containing reference sites covered by either too few ground truth or nanopore reads. 67 | This should be used to filter reads for the final training mapped signal data set. 68 | A read covering any low coverage sites will not be annotated as accurately and thus should not be included in model training. 69 | 70 | -------------------------------------------------- 71 | ``megalodon_extras modified_bases index_database`` 72 | -------------------------------------------------- 73 | 74 | In certain instances a Megalodon run may end unexpectedly (e.g. out of memory error). 75 | In most cases the modified base database is not corrupted by such an unexpected termination, 76 | but the database will be left without the indexing step completed, which is required for most downstream uses. 77 | This command will produce the index as a separate step in such instances. 78 | 79 | -------------------------------------------------- 80 | ``megalodon_extras modified_bases split_by_motif`` 81 | -------------------------------------------------- 82 | 83 | Split an input modified base database into smaller databases based on reference sequence motifs. 84 | This command enables the computationally expensive ``megalodon`` command to be run only once, while still allowing motif specific analyses.
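
A simplified sketch of the per-site check applied by this command, mirroring the ``split_data`` function from ``megalodon_extras/modified_bases_split_by_motif.py`` shown earlier (``motif_info`` is a ``MOTIF_INFO`` namedtuple as built by ``parse_motifs``; chromosome-edge effects are ignored here)::

    def site_matches_motif(ref_seq, pos, strand, motif_info):
        """Return True if the reference context around pos matches the motif.

        Forward-strand calls are tested against the motif pattern over the
        window from bases_before upstream through bases_after downstream;
        reverse-strand calls use a mirrored window tested against the
        reverse-complement pattern.
        """
        if strand == 1:
            window = ref_seq[
                pos - motif_info.bases_before : pos + motif_info.bases_after + 1
            ]
            return motif_info.motif.match(window) is not None
        window = ref_seq[
            pos - motif_info.bases_after : pos + motif_info.bases_before + 1
        ]
        return motif_info.rc_motif.match(window) is not None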
85 | -------------------------------------------------------------------------------- /megalodon/megalodon_multiprocessing.py: -------------------------------------------------------------------------------- 1 | from time import sleep 2 | import multiprocessing as mp 3 | from collections import namedtuple 4 | from multiprocessing.connection import wait 5 | from multiprocessing.queues import Queue as mpQueue 6 | 7 | from megalodon import logging, megalodon_helper as mh 8 | 9 | 10 | # fix error `TypeError: cannot pickle '_thread.lock' object` on Mac + python3.8 11 | try: 12 | mp.set_start_method("fork") 13 | except RuntimeError: 14 | pass 15 | 16 | _FULL_SLEEP_TIME = 1 17 | 18 | GETTER_QPC = namedtuple("getter_qpc", ("queue", "proc", "conn")) 19 | 20 | LOGGER = logging.get_logger() 21 | 22 | 23 | ########################### 24 | # Multi-processing Helper # 25 | ########################### 26 | 27 | 28 | class CountingMPQueue(mpQueue): 29 | """Minimal version of multiprocessing queue maintaining a queue size 30 | counter 31 | """ 32 | 33 | def __init__(self, **kwargs): 34 | self.name = None 35 | if "name" in kwargs: 36 | self.name = kwargs["name"] 37 | del kwargs["name"] 38 | super().__init__(ctx=mp.get_context(), **kwargs) 39 | self._size = mp.Value("i", 0) 40 | self.maxsize = None 41 | if "maxsize" in kwargs: 42 | self.maxsize = kwargs["maxsize"] 43 | 44 | def put(self, *args, **kwargs): 45 | super().put(*args, **kwargs) 46 | with self._size.get_lock(): 47 | self._size.value += 1 48 | 49 | def get(self, *args, **kwargs): 50 | rval = super().get(*args, **kwargs) 51 | with self._size.get_lock(): 52 | self._size.value -= 1 53 | return rval 54 | 55 | def qsize(self): 56 | qsize = max(0, self._size.value) 57 | if self.maxsize is not None: 58 | return min(self.maxsize, qsize) 59 | return qsize 60 | 61 | def empty(self): 62 | return self.qsize() <= 0 63 | 64 | 65 | def create_getter_qpc( 66 | getter_func, args, max_size=mh._MAX_QUEUE_SIZE, name=None 67 | ): 68 | """Spawn a new "getter" process. This process will use target=getter_func. 69 | A new queue and pipe connection will be passed to this function as the 70 | first two arguments, followed by *args. A mega_mp.GETTER_QPC will be 71 | returned containing the created mp.Queue, the mp.Process object and the 72 | other end of the mp.Pipe connection. 73 | 74 | Note the connection object is intended to communicate to the getter process 75 | that worker processes have concluded. Send True or any value to the 76 | connection to trigger the getter process to exit after exhausting the queue. 77 | """ 78 | if max_size is None: 79 | q = CountingMPQueue(name=name) 80 | else: 81 | q = CountingMPQueue(maxsize=max_size, name=name) 82 | main_conn, conn = mp.Pipe() 83 | p = mp.Process( 84 | target=getter_func, daemon=True, args=(q, conn, *args), name=name 85 | ) 86 | p.start() 87 | return GETTER_QPC(q, p, main_conn) 88 | 89 | 90 | class ConnWithSize: 91 | def __init__( 92 | self, 93 | conn, 94 | size, 95 | max_size=mh._MAX_QUEUE_SIZE, 96 | name="ConnWithSize", 97 | full_sleep_time=_FULL_SLEEP_TIME, 98 | ): 99 | if not isinstance(conn, mp.connection.Connection): 100 | raise mh.MegaError( 101 | ( 102 | "ConnWithSize initialized with non-connection object. " 103 | + "Object type: {}" 104 | ).format(type(conn)) 105 | ) 106 | if not isinstance(size, mp.sharedctypes.Synchronized) or not isinstance( 107 | size.value, int 108 | ): 109 | raise mh.MegaError( 110 | ( 111 | "ConnWithSize initialized with non-synchronized size " 112 | + "object. 
Object type: {}" 113 | ).format(type(size)) 114 | ) 115 | self._conn = conn 116 | self._size = size 117 | self.max_size = max_size 118 | self.full_sleep_time = full_sleep_time 119 | self.name = name 120 | 121 | def qsize(self): 122 | return max(min(self._size.value, mh._MAX_QUEUE_SIZE), 0) 123 | 124 | def full(self): 125 | if self.max_size is None: 126 | return False 127 | return self.qsize() >= self.max_size 128 | 129 | def put(self, value): 130 | # enforce artificial queue max size with dulplex pipes 131 | if self.full(): 132 | LOGGER.debug("ThrottlingSimplexQueue") 133 | sleep(self.full_sleep_time) 134 | with self._size.get_lock(): 135 | self._size.value += 1 136 | self._conn.send(value) 137 | 138 | def close(self): 139 | self._conn.close() 140 | del self._conn 141 | 142 | 143 | class SimplexManyToOneQueue: 144 | """This object is a more efficient version of a multiprocessing.Queue for 145 | use when many connections will send information in one direction to a 146 | single connection. 147 | 148 | The get_conn class function will return a ConnWithSize object which can 149 | send information to be recieved by the wait_recv function of this class. 150 | """ 151 | 152 | def __init__( 153 | self, 154 | return_conns=True, 155 | max_size=mh._MAX_QUEUE_SIZE, 156 | name="SimplexQueue", 157 | ): 158 | self.return_conns = return_conns 159 | self._conns = [] 160 | self._size = mp.Value("i", 0) 161 | self.max_size = max_size 162 | self.name = name 163 | 164 | def get_conn(self): 165 | if not self.return_conns: 166 | return 167 | _my_conn, r_conn = mp.Pipe(duplex=False) 168 | self._conns.append(_my_conn) 169 | return ConnWithSize(r_conn, self._size, self.max_size, self.name) 170 | 171 | def qsize(self): 172 | return max(min(self._size.value, mh._MAX_QUEUE_SIZE), 0) 173 | 174 | def empty(self): 175 | return self.qsize() <= 0 176 | 177 | @property 178 | def has_valid_conns(self): 179 | return len(self._conns) > 0 180 | 181 | def wait_recv(self): 182 | for conn in wait(self._conns): 183 | try: 184 | r_val = conn.recv() 185 | with self._size.get_lock(): 186 | self._size.value -= 1 187 | except EOFError: 188 | # when connection is closed in worker process EOFError is 189 | # triggered, so remove that connection 190 | self._conns.remove(conn) 191 | else: 192 | yield r_val 193 | -------------------------------------------------------------------------------- /docs/extras_validate.rst: -------------------------------------------------------------------------------- 1 | ***************************** 2 | ``megalodon_extras validate`` 3 | ***************************** 4 | 5 | The ``megalodon_extras validate`` command group contains commands to validate mapping and modified base outputs from Megalodon. 6 | Note that scripts to validate sequence variants are not provided here. 7 | Other tools including `vcfeval `_ and `hapy.py `_ are recommended for validation of sequence variant results. 8 | 9 | ------------------------------------- 10 | ``megalodon_extras validate results`` 11 | ------------------------------------- 12 | 13 | Validate per-read mapping and modified base results. 14 | 15 | This command produces text and graphical summaries of mapping and modified base performance. 16 | 17 | Mapping results include distributional statistics for each sample provided (output determined by ``--out-filename`` default ``stdout``), as well as a plot showing the distribution of mapping accuracy for each sample (see ``--out-pdf``). 18 | 19 | ---- 20 | 21 | .. 
figure:: _images/mapping_validate_results.png 22 | :align: center 23 | :width: 600 24 | 25 | Example ``validate results`` per-read mapping plot. 26 | 27 | ---- 28 | 29 | Per-read modified base results require a per-read ground truth for modified and canonical bases. 30 | This can be provided by either 1) supplying a control sample via the ``--control-megalodon-results-dirs`` argument (assumes all modified base calls at ``--valid-sites`` in main Megalodon results are modified) or 2) providing a ground truth set of sites containing modified and canonical bases via the ``--ground-truth-data`` argument. 31 | See the ``megalodon_extras modified_bases create_ground_truth`` command for help generating a ground truth file. 32 | 33 | Per-read modified base results are analyzed to produce several metrics including the optimal `F1-score `_, `mean average precision `_ and `ROC AUC `_ among others. 34 | By default, modified and canonical ground truth sites are filtered to contain the same number of statistics for these statistic computations. 35 | It is highly recommended that this not be changed (via ``--allow-unbalance-classes``) as class imbalance can have a large effect on the statistics, thus affecting their comparison between runs and/or models. 36 | Below are example graphical representations produced for per-read modified base validation. 37 | 38 | ---- 39 | 40 | .. figure:: _images/mod_pr_validate_results.png 41 | :align: center 42 | :width: 600 43 | 44 | Example ``validate results`` per-read modified base precision-recall curve plot. 45 | 46 | .. figure:: _images/mod_roc_validate_results.png 47 | :align: center 48 | :width: 600 49 | 50 | Example ``validate results`` per-read modified base ROC curve plot. 51 | 52 | .. figure:: _images/mod_dist_validate_results.png 53 | :align: center 54 | :width: 600 55 | 56 | Example ``validate results`` per-read modified base score distribution plot. 57 | 58 | ---- 59 | 60 | ------------------------------------------------------- 61 | ``megalodon_extras validate aggregated_modified_bases`` 62 | ------------------------------------------------------- 63 | 64 | Compute validation metrics and visualizations from aggregated modified base calls. 65 | 66 | Similar to the ``megalodon_extras validate results`` command, modified base results are compared to a ground truth provided either by 1) a control sample or 2) a ground truth positions CSV file. 67 | A set of metrics is also reported and stored as described by the ``--out-filename`` argument (default ``stdout``). 68 | These metrics include the optimal F1-score, mean average precision and ROC AUC. 69 | This command outputs several visualizations similar to the per-read modified base validation including modified base percent distributions as well as precision-recall and ROC curves. 70 | 71 | ---- 72 | 73 | .. figure:: _images/mod_agg_dist_results.png 74 | :align: center 75 | :width: 600 76 | 77 | Example ``validate aggregated_modified_bases`` modified base percentage distribution plot. 78 | 79 | ---- 80 | 81 | ---------------------------------------------------- 82 | ``megalodon_extras validate compare_modified_bases`` 83 | ---------------------------------------------------- 84 | 85 | Compare two sets of bedmethyl files and report a standard set of metrics and visualizations. 86 | 87 | The two sets or individual bedmethyl files provided will be compared at all overlapping sites with sufficient coverage (defined by ``--coverage-threshold``; default all sites).
88 | To aggregate forward and reverse strand methylation calls, set the ``--strand-offset`` argument. 89 | For example, to aggregate CpG calls, add the ``--strand-offset 1`` argument to the command. 90 | 91 | The first metrics reported concern the coverage over the two samples before and after the overlap and coverage filters have been applied. 92 | Overlapping percent modified values are then compared to produce the correlation coefficient, R^2 and RMSE (for the model y=x). 93 | The correlation coefficient has previously been reported as the standard metric for modified base detection performance, but the RMSE is recommended for purposes of model selection or general modified base detection performance. 94 | This is because a modified base model can have low accuracy but high precision, which can still produce a high correlation. 95 | Specifically, some models have a tendency to call a low proportion of ground truth modified sites as canonical, likely due to training set imbalance. 96 | 97 | This command also produces a standard set of visualizations for the comparison of these aggregated results. 98 | Shown below are plots comparing the percent modified bases between nanopore and ENCODE bisulfite runs (with log- and linear-scaled shading) as well as a comparison of the coverage for the two samples. 99 | 100 | ---- 101 | 102 | .. figure:: _images/mod_agg_comp_log.png 103 | :align: center 104 | :width: 600 105 | 106 | Example ``validate compare_modified_bases`` percent modified comparison with log-10 scaled shading. 107 | 108 | .. figure:: _images/mod_agg_comp_linear.png 109 | :align: center 110 | :width: 600 111 | 112 | Example ``validate compare_modified_bases`` percent modified comparison with clipped linear scaled shading. 113 | 114 | .. figure:: _images/mod_agg_comp_cov.png 115 | :align: center 116 | :width: 600 117 | 118 | Example ``validate compare_modified_bases`` modified base sample read coverage comparison. 119 | 120 | ---- 121 | -------------------------------------------------------------------------------- /megalodon/banding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from megalodon import megalodon_helper as mh, logging 4 | 5 | LOGGER = logging.get_logger() 6 | 7 | 8 | def compute_sig_band(bps, levels, bhw=mh.DEFAULT_CONSTRAINED_HALF_BW): 9 | """Compute band over which to explore possible paths. Band is represented 10 | in sequence/level coordinates at each signal position. 11 | 12 | Args: 13 | bps (np.ndarray): Integer array containing breakpoints 14 | levels (np.ndarray): float array containing expected signal levels. May 15 | contain np.NAN values. Band will be constructed to maintain path 16 | through NAN regions. 17 | bhw (int): Band half width. If None, full matrix is used. 18 | 19 | Returns: 20 | int32 np.ndarray with shape (2, sig_len = bps[-1] - bps[0]). The first 21 | row contains the lower band boundaries in sequence coordinates and the 22 | second row contains the upper boundaries in sequence coordinates. 23 | """ 24 | seq_len = levels.shape[0] 25 | if bps.shape[0] - 1 != seq_len: 26 | raise mh.MegaError("Breakpoints must be one longer than levels.") 27 | sig_len = bps[-1] - bps[0] 28 | seq_indices = np.repeat(np.arange(seq_len), np.diff(bps)) 29 | 30 | # Calculate bands 31 | # The 1st row consists of the start indices (inc) and the 2nd row 32 | # consists of the end indices (exc) of the valid rows for each col.
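    # Worked example (illustrative values only):
    #   bps = [0, 2, 4, 6] (seq_len = 3, sig_len = 6) with bhw = 1 gives
    #   seq_indices = [0, 0, 1, 1, 2, 2] and thus
    #   band = [[0, 0, 0, 0, 1, 1],   <- lower bounds (inclusive)
    #           [2, 2, 3, 3, 3, 3]]   <- upper bounds (exclusive)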
33 | band = np.empty((2, sig_len), dtype=np.int32) 34 | if bhw is None: 35 | # specify entire input matrix 36 | band[0, :] = 0 37 | band[1, :] = seq_len 38 | else: 39 | # use specific band defined by bhw 40 | band[0, :] = np.maximum(seq_indices - bhw, 0) 41 | band[1, :] = np.minimum(seq_indices + bhw + 1, seq_len) 42 | 43 | # Modify bands based on invalid levels 44 | nan_mask = np.isin(seq_indices, np.nonzero(np.isnan(levels))) 45 | nan_sig_indices = np.where(nan_mask)[0] 46 | nan_seq_indices = seq_indices[nan_mask] 47 | band[0, nan_sig_indices] = nan_seq_indices 48 | band[1, nan_sig_indices] = nan_seq_indices + 1 49 | # Modify bands close to invalid levels so monotonically increasing 50 | band[0, :] = np.maximum.accumulate(band[0, :]) 51 | band[1, :] = np.minimum.accumulate(band[1, ::-1])[::-1] 52 | 53 | # expand band around large deletions to ensure valid paths 54 | invalid_indices = np.where(band[0, 1:] >= band[1, :-1])[0] 55 | while invalid_indices.shape[0] > 0: 56 | band[0, invalid_indices + 1] = np.maximum( 57 | band[0, invalid_indices + 1] - 1, 0 58 | ) 59 | band[1, invalid_indices] = np.minimum( 60 | band[1, invalid_indices] + 1, seq_len 61 | ) 62 | invalid_indices = np.where(band[0, 1:] >= band[1, :-1])[0] 63 | 64 | return band 65 | 66 | 67 | def convert_to_seq_band(sig_band): 68 | """Convert band with sig_len entries containing upper and lower band 69 | boundaries in base coordinates to seq_len entries containing upper and 70 | lower band boundaries in signal space. 71 | 72 | Args: 73 | sig_band (np.array): int32 array with shape (2, sig_len). The first row 74 | contains the lower band boundaries in sequence coordinates and the 75 | second row contains the upper boundaries in sequence coordinates. 76 | 77 | Returns: 78 | int32 np.ndarray with shape (2, seq_len = sig_band[1, -1]). The first 79 | row contains the lower band boundaries in signal coordinates and the 80 | second row contains the upper boundaries in signal coordinates. 81 | """ 82 | sig_len = sig_band.shape[1] 83 | seq_len = sig_band[1, -1] 84 | seq_band = np.zeros((2, seq_len), dtype=np.int32) 85 | seq_band[1, :] = sig_len 86 | 87 | # upper signal coordinates define lower sequence boundaries 88 | lower_sig_pos = np.nonzero(np.ediff1d(sig_band[1, :], to_begin=0))[0] 89 | lower_base_pos = sig_band[1, lower_sig_pos - 1] 90 | seq_band[0, lower_base_pos] = lower_sig_pos 91 | seq_band[0, :] = np.maximum.accumulate(seq_band[0, :]) 92 | 93 | upper_sig_pos = np.nonzero(np.ediff1d(sig_band[0, :], to_begin=0))[0] 94 | upper_base_pos = sig_band[0, upper_sig_pos] 95 | seq_band[1, upper_base_pos - 1] = upper_sig_pos 96 | seq_band[1, :] = np.minimum.accumulate(seq_band[1, ::-1])[::-1] 97 | 98 | return seq_band 99 | 100 | 101 | def validate_band(band, sig_len=None, seq_len=None, is_sig_band=True): 102 | """Validate that band is valid and agrees with input data. 103 | 104 | Args: 105 | band (np.array): int32 array with shape (2, sig_len or seq_len). The 106 | first row contains the lower band boundaries and the second row 107 | contains the upper boundaries. 108 | sig_len (int): Length of signal associated with band 109 | seq_len (int): Length of sequence/levels associated with band 110 | is_sig_band (bool): Does the provided band specify sequence/level 111 | positions for each signal position? If not it is assumed that the 112 | band contains signal positions for each sequence/level position. 113 | 114 | Raises: 115 | MegaError if any portion of the band is determined to be invalid.
116 | """ 117 | # first coordinate 0, last coordinate signal length 118 | if band[0, 0] != 0: 119 | raise mh.MegaError("Band does not start with 0 coordinate.") 120 | 121 | # ends all greater than starts 122 | if np.diff(band, axis=0)[0].min() <= 0: 123 | raise mh.MegaError("Band contains 0-length region") 124 | # monotonic start and end postions 125 | if np.diff(band[0]).min() < 0: 126 | raise mh.MegaError( 127 | "Band start positions are not monotonically increasing" 128 | ) 129 | if np.diff(band[1]).min() < 0: 130 | raise mh.MegaError( 131 | "Band end positions are not monotonically increasing" 132 | ) 133 | 134 | # if provided check that start and end coordinates agree with signal and 135 | # levels. 136 | if is_sig_band: 137 | if sig_len is not None and band.shape[1] != sig_len: 138 | LOGGER.debug(f"Invalid sig_band length: {band.shape[1]} {sig_len}") 139 | raise mh.MegaError("Invalid sig_band length") 140 | if seq_len is not None and band[1, -1] != seq_len: 141 | LOGGER.debug( 142 | f"Invalid sig_band end coordinate: {band[1, -1]} {seq_len}" 143 | ) 144 | raise mh.MegaError("Invalid sig_band end coordinate") 145 | else: 146 | if sig_len is not None and band[1, -1] != sig_len: 147 | LOGGER.debug( 148 | f"Invalid seq_band end coordinate: {band[1, -1]} {sig_len}" 149 | ) 150 | raise mh.MegaError("Invalid seq_band end coordinate") 151 | if seq_len is not None and band.shape[1] != seq_len: 152 | LOGGER.debug(f"Invalid sig_band length: {band.shape[1]} {seq_len}") 153 | raise mh.MegaError("Invalid sig_band length") 154 | -------------------------------------------------------------------------------- /megalodon/signal_mapping.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import queue 3 | import traceback 4 | from collections import namedtuple 5 | 6 | import numpy as np 7 | 8 | from megalodon import fast5_io, megalodon_helper as mh, logging 9 | 10 | try: 11 | import taiyaki 12 | 13 | # appease flake8 14 | taiyaki 15 | except ImportError: 16 | raise mh.MegaError( 17 | "Taiyaki installation required for signal mapping not found." 18 | ) 19 | try: 20 | from taiyaki import ( 21 | alphabet, 22 | fast5utils, 23 | signal_mapping as tai_mapping, 24 | prepare_mapping_funcs, 25 | signal as tai_signal, 26 | ) 27 | except ImportError: 28 | raise mh.MegaError( 29 | "Taiyaki modules could not be loaded. Signal mappings require " 30 | + 'Taiyaki version >= 5.2. 
Full error:\n"""\n{}\n"""'.format( 31 | traceback.format_exc() 32 | ) 33 | ) 34 | 35 | 36 | LOGGER = logging.get_logger() 37 | SIG_MAP_RESULT = namedtuple( 38 | "SIG_MAP_RESULT", 39 | ( 40 | "pass_filts", 41 | "fast5_fn", 42 | "dacs", 43 | "scale_params", 44 | "ref_seq", 45 | "stride", 46 | "read_id", 47 | "r_to_q_poss", 48 | "rl_cumsum", 49 | "ref_pos", 50 | "ref_out_info", 51 | ), 52 | ) 53 | 54 | 55 | def set_all_motif_mods(int_ref, ref_mods_all_motifs): 56 | ref_mod_pos, ref_mods = [], [] 57 | for mod_base, int_mod_base, mln, int_motif, rel_pos in ref_mods_all_motifs: 58 | for pos in np.where( 59 | np.all( 60 | mh.rolling_window(int_ref, int_motif.shape[0]) == int_motif, 61 | axis=1, 62 | ) 63 | )[0]: 64 | ref_mod_pos.append(pos + rel_pos) 65 | ref_mods.append(int_mod_base) 66 | if len(ref_mod_pos) > 0: 67 | int_ref[ref_mod_pos] = ref_mods 68 | return int_ref 69 | 70 | 71 | def get_remapping( 72 | sig_fn, 73 | dacs, 74 | scale_params, 75 | ref_seq, 76 | stride, 77 | read_id, 78 | r_to_q_poss, 79 | rl_cumsum, 80 | r_ref_pos, 81 | ref_out_info, 82 | ): 83 | read = fast5_io.get_read(sig_fn, read_id) 84 | channel_info = dict(fast5utils.get_channel_info(read).items()) 85 | read_params = { 86 | "trim_start": 0, 87 | "trim_end": 0, 88 | "shift": scale_params[0], 89 | "scale": scale_params[1], 90 | } 91 | sig = tai_signal.Signal( 92 | dacs=dacs, 93 | channel_info=channel_info, 94 | read_id=read_id, 95 | read_params=read_params, 96 | ) 97 | 98 | ref_to_sig = np.empty(len(ref_seq) + 1, dtype=np.int32) 99 | # skip last value since this is where the two seqs end 100 | for ref_pos, q_pos in enumerate(r_to_q_poss): 101 | ref_to_sig[ref_pos] = rl_cumsum[q_pos + r_ref_pos.q_trim_start] * stride 102 | try: 103 | int_ref = tai_mapping.SignalMapping.get_integer_reference( 104 | ref_seq, ref_out_info.alphabet_info.alphabet 105 | ) 106 | except Exception: 107 | raise mh.MegaError("Invalid reference sequence encountered") 108 | sig_mapping = tai_mapping.SignalMapping(ref_to_sig, int_ref, signalObj=sig) 109 | 110 | # annotate mod motifs 111 | if ref_out_info.ref_mods_all_motifs is not None: 112 | # annotate all mod base motif positions with alts 113 | int_ref = set_all_motif_mods(int_ref, ref_out_info.ref_mods_all_motifs) 114 | # set new Reference with mods annotated 115 | sig_mapping.Reference = int_ref 116 | 117 | return ( 118 | sig_mapping.get_read_dictionary(), 119 | prepare_mapping_funcs.RemapResult.SUCCESS, 120 | ) 121 | 122 | 123 | def get_alphabet_info_from_model(model_info): 124 | flat_alphabet = model_info.output_alphabet[0] 125 | can_base = model_info.output_alphabet[0] 126 | for base in model_info.output_alphabet[1:]: 127 | if base in model_info.can_alphabet: 128 | can_base = base 129 | flat_alphabet += can_base 130 | mod_long_names = ( 131 | [] 132 | if len(model_info.mod_long_names) == 0 133 | else list(zip(*model_info.mod_long_names))[1] 134 | ) 135 | return alphabet.AlphabetInfo( 136 | model_info.output_alphabet, 137 | flat_alphabet, 138 | mod_long_names, 139 | do_reorder=True, 140 | ) 141 | 142 | 143 | def get_alphabet_info(output_alphabet, collapse_alphabet, mod_long_names): 144 | return alphabet.AlphabetInfo( 145 | output_alphabet, collapse_alphabet, mod_long_names, do_reorder=True 146 | ) 147 | 148 | 149 | def write_signal_mappings(sig_map_q, sig_map_conn, ref_out_info, aux_failed_q): 150 | def apply_sig_map_offset(read_mapping): 151 | """Apply signal mapping shift to center coarse mappings to a registered 152 | signal based mapping. 
153 | """ 154 | if ( 155 | ref_out_info.sig_map_offset is not None 156 | and ref_out_info.sig_map_offset != 0 157 | ): 158 | if ref_out_info.sig_map_offset > 0: 159 | # clip beginning of signal mapping and end of reference to 160 | # shift signal assignments to the left 161 | read_mapping[0]["Ref_to_signal"] = read_mapping[0][ 162 | "Ref_to_signal" 163 | ][ref_out_info.sig_map_offset :] 164 | read_mapping[0]["Reference"] = read_mapping[0]["Reference"][ 165 | : -ref_out_info.sig_map_offset 166 | ] 167 | else: 168 | # clip end of signal mapping and beginning of reference to 169 | # shift signal assignments to the right 170 | read_mapping[0]["Ref_to_signal"] = read_mapping[0][ 171 | "Ref_to_signal" 172 | ][: ref_out_info.sig_map_offset] 173 | read_mapping[0]["Reference"] = read_mapping[0]["Reference"][ 174 | -ref_out_info.sig_map_offset : 175 | ] 176 | return read_mapping 177 | 178 | def iter_mappings(): 179 | workers_active = True 180 | LOGGER.debug("GetterInitComplete") 181 | while workers_active or not sig_map_q.empty(): 182 | try: 183 | read_mapping = sig_map_q.get(timeout=0.1) 184 | yield apply_sig_map_offset(read_mapping) 185 | except queue.Empty: 186 | if sig_map_conn.poll(): 187 | workers_active = False 188 | 189 | try: 190 | LOGGER.debug("GetterStarting") 191 | prepare_mapping_funcs.generate_output_from_results( 192 | iter_mappings(), 193 | mh.get_megalodon_fn(ref_out_info.out_dir, mh.SIG_MAP_NAME), 194 | ref_out_info.alphabet_info, 195 | verbose=False, 196 | ) 197 | LOGGER.debug("GetterClosing") 198 | except Exception as e: 199 | aux_failed_q.put( 200 | ("SigMapProcessingError", str(e), traceback.format_exc()) 201 | ) 202 | 203 | 204 | if __name__ == "__main__": 205 | sys.stderr.write("This is a module. See commands with `megalodon -h`") 206 | sys.exit(1) 207 | -------------------------------------------------------------------------------- /docs/algorithm_details.rst: -------------------------------------------------------------------------------- 1 | *************************** 2 | Megalodon Algorithm Details 3 | *************************** 4 | 5 | This page describes the details of how megalodon processes the raw nanopore signal to produce highly-accurate modified base and sequence variant calls. 6 | 7 | ------------ 8 | Base Calling 9 | ------------ 10 | 11 | Basecalling is performed exactly as in Guppy. 12 | Raw nanopore signal is normalized, chunked, processed with a recurrent neural network and decoded using Viterbi decoding. 13 | Currently Megalodon is only compatible with flip-flop basecalling networks (excluding RLE and Bonito models) 14 | See `guppy documentation on the community page (login required) `_ for more details. 15 | 16 | ------------------- 17 | Reference Anchoring 18 | ------------------- 19 | 20 | Megalodon's functionality centers on the anchoring of the high-information neural network basecalling output to a reference sequence. 21 | Given anchored neural network output, alternatives to the reference (either modified bases or canonical bases) are proposed and scored to produce the highest accuracy results. 22 | 23 | The neural network output is anchored to the reference via standard read mapping of produced basecalls to the reference sequence (maintaining the link to the neural network outputs). 24 | If no reference mapping is produced (using ``minimap2`` via the ``mappy`` python interface) that read is not processed further (basecalls will be output if requested). 
25 | This standard read mapping is processed to produce a matching of each basecall with a reference position. 26 | Reference positions within an insertion or deletion are assigned to the previous mapped read position (left justified; this behavior may change in future versions). 27 | This constitutes the reference anchoring used for modified base and sequence variant calling steps. 28 | 29 | ------------------------ 30 | Sequence Variant Calling 31 | ------------------------ 32 | 33 | Megalodon currently filters alleles over a certain maximum size (default ``50``) as performance on larger indels has not yet been validated. 34 | Note also that variants are converted into an "atomic" form (containing minimal unique variant sequence for indels). 35 | Thus atomic variants do not contain context sequence and are expanded to include regions of ambiguity (e.g. an indel within a repetitive region). 36 | 37 | At each valid variant a region of context sequence around the variant is extracted. 38 | The context sequence allows the scoring algorithm to traverse slightly different paths through the local neural network output. 39 | The width of this sequence of interest is defined by the ``--variant-context-bases`` argument (specified individually for single base and insertion/deletion variants; defaults ``10`` and ``30`` respectively). 40 | 41 | Next the neural network output corresponding to the reference sequence of interest is extracted. 42 | The fuzzy reference anchoring described above identifies the range of the neural network output containing the sequence of interest. 43 | 44 | The sequence scoring function performs the forward-backward algorithm and Viterbi decoding over the neural network output to produce a score for the reference and proposed alternative sequence. 45 | The difference between these two scores is the assigned score for the proposed variant. 46 | Lower (negative) scores are evidence for the alternative sequence and higher (positive) scores are evidence for the reference sequence. 47 | 48 | These raw scores are softmax values over potential states, to match characteristics of a probability distribution. 49 | In practice, these scores do not match empirical probabilities for a variant given a truth data set. 50 | Thus a calibration step is applied to convert these scores to estimated empirical probabilities. 51 | This enables more accurate aggregation across reads. 52 | 53 | As of version 1.0.0, Megalodon performs a second round of variant detection taking nearby variants into account. 54 | Variants from the first round (considering each variant in isolation) are filtered by a minimal probability of evidence for the variant allele (default ``0.05``; set with ``--context-min-alt-prob`` argument). 55 | In the second pass, variants within a set region are considered when estimating the probability of a particular variant (up to a set maximum number of context variants in order to reduce compute). 56 | Scores for each potential context are combined statistically (using logsumexp) and these are the final scores reported for each variant. 57 | This process reduces the number of false positives where a true variant is adjacent to another proposed variant. 58 | 59 | Finally, calls across reads at each reference location are aggregated in order to make a sample-level call. 60 | These results will be output into a VCF format file. 61 | 62 | Currently ``diploid`` (default) and ``haploid`` variant aggregation modes are available.
59 | Finally, calls across reads at each reference location are aggregated in order to make a sample-level call.
60 | These results will be output into a VCF format file.
61 | 
62 | Currently ``diploid`` (default) and ``haploid`` variant aggregation modes are available.
63 | In ``haploid`` mode the probabilities of the reference and alternative alleles are simply the normalized (via Bayes' theorem) products of the individual read probabilities.
64 | In ``diploid`` mode the probability of each genotype (homozygous reference, heterozygous and homozygous alternative) is computed.
65 | The probabilities for the homozygous alleles are as in ``haploid`` mode, while the heterozygous probability is given by the weighted sum of the maximal probabilities taken over the sampling distribution (binomial with ``p=0.5``) given a true diploid heterozygous allele.
66 | These probabilities are then normalized via Bayes' theorem.
67 | 
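A simplified sketch of the diploid aggregation idea follows.
It models the heterozygous genotype as an equal mixture over reads, which approximates, but is not identical to, the binomial-based computation described above.

::

    import numpy as np

    def diploid_genotype_log_probs(ref_lps, alt_lps):
        """Toy genotype scoring for one site from per-read log probabilities.

        ref_lps/alt_lps: per-read log P(read | ref) and log P(read | alt).
        Returns normalized log probabilities for (hom_ref, het, hom_alt).
        """
        hom_ref = np.sum(ref_lps)
        hom_alt = np.sum(alt_lps)
        # heterozygous: each read drawn from either allele with probability 0.5
        het = np.sum(np.logaddexp(ref_lps + np.log(0.5), alt_lps + np.log(0.5)))
        lps = np.array([hom_ref, het, hom_alt])
        # normalize via Bayes' theorem with a uniform genotype prior
        return lps - np.logaddexp.reduce(lps)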
68 | ---------------------
69 | Modified Base Calling
70 | ---------------------
71 | 
72 | Modified base calling is performed largely in the same manner as variant calling above in terms of sequence and associated neural network output extraction.
73 | The main difference is that instead of proposing alternative canonical bases in the sequence, a modified base is proposed.
74 | This means that in order to identify a particular modification the model must have been trained to detect that modification.
75 | Training models for particular modifications of interest is described in `Megalodon documentation here `_.
76 | 
77 | Use the ``--mod-motif`` argument in order to restrict tested locations to certain relevant motifs (e.g. ``--mod-motif m CG 0`` to test only at CpG locations).
78 | Per-read modified base calls can be output in either a text table format or into a BAM file.
79 | There are two options to output per-read modified base calls into the BAM format.
80 | The default option when ``--outputs mod_mappings`` is specified is the `hts-specs proposed format `_.
81 | The second option emulates bisulfite sequencing (since this provides visualization options in some genome browsers).
82 | Specify the ``--mod-map-emulate-bisulfite`` option to select this output.
83 | See the ``--mod-map-base-conv`` option (``megalodon --help-long``) for further specification of this output.
84 | 
85 | Modified bases can also be output anchored to the basecalls as in Guppy, but these calls are generally not as accurate as the reference-anchored calls.
86 | These ``mod_basecalls`` are output in the BAM ``Mm`` and ``Ml`` tags as specified by the hts-specs proposed format.
87 | 
--------------------------------------------------------------------------------
/megalodon/validation.py:
--------------------------------------------------------------------------------
1 | import sys
2 | 
3 | import numpy as np
4 | import seaborn as sns
5 | import matplotlib
6 | 
7 | if True:
8 |     # Agg appears to be the most robust backend when only saving plots.
9 |     matplotlib.use("Agg")
10 | import matplotlib.pyplot as plt
11 | from sklearn.metrics import (
12 |     roc_curve,
13 |     auc,
14 |     precision_recall_curve,
15 |     average_precision_score,
16 | )
17 | 
18 | from megalodon import logging, megalodon_helper as mh
19 | 
20 | LOGGER = logging.get_logger()
21 | 
22 | # MOD_BANDWIDTH2 supports seaborn<0.11, before bw_adjust was introduced
23 | MOD_BANDWIDTH = 0.9
24 | MOD_BANDWIDTH2 = 0.2
25 | GRIDSIZE = 1000
26 | 
27 | MOD_VAL_METRICS_HEADER = (
28 |     "{: <12}{: <19}{: <20}{: <9}{: <20}{: <19}{: <10}{} {}\n".format(
29 |         "Optimal_F1",
30 |         "Optimal_Threshold",
31 |         "Mean_Avg_Precision",
32 |         "ROC_AUC",
33 |         "Num_Modified_Stats",
34 |         "Num_Control_Stats",
35 |         "Mod_Base",
36 |         "Sample_Label",
37 |         "Valid_Sites_Label",
38 |     )
39 | )
40 | MOD_VAL_METRICS_TMPLT = (
41 |     "{: <12.6f}{: <19.4f}{: <20.6f}{: <9.6f}{: <20d}{: <19d}{: <10}{} {}\n"
42 | )
43 | 
44 | 
45 | def plot_pr(pdf_fp, pr_data):
46 |     for mod_base, mod_pr_data in pr_data.items():
47 |         LOGGER.info("Plotting {} precision-recall curves".format(mod_base))
48 |         plt.figure(figsize=(8, 7))
49 |         for lab, prec, recall in mod_pr_data:
50 |             plt.step(recall, prec, label=lab, where="post")
51 |         plt.ylim([-0.05, 1.05])
52 |         plt.xlim([-0.05, 1.05])
53 |         plt.xlabel("Recall")
54 |         plt.ylabel("Precision")
55 |         plt.title(
56 |             ('Modified Base "{}" Precision-Recall Curves').format(mod_base)
57 |         )
58 |         plt.legend()
59 |         pdf_fp.savefig(bbox_inches="tight")
60 |         plt.close()
61 | 
62 | 
63 | def plot_roc(pdf_fp, roc_data):
64 |     for mod_base, mod_roc_data in roc_data.items():
65 |         LOGGER.info("Plotting {} ROC curves".format(mod_base))
66 |         plt.figure(figsize=(8, 7))
67 |         for lab, fpr, tpr in mod_roc_data:
68 |             plt.step(fpr, tpr, label=lab)
69 |         plt.ylim([-0.05, 1.05])
70 |         plt.xlim([-0.05, 1.05])
71 |         plt.xlabel("False Positive Rate")
72 |         plt.ylabel("True Positive Rate")
73 |         plt.title(('Modified Base "{}" ROC Curves').format(mod_base))
74 |         plt.legend()
75 |         pdf_fp.savefig(bbox_inches="tight")
76 |         plt.close()
77 | 
78 | 
79 | def plot_kde(pdf_fp, kde_data):
80 |     for samp_lab, mod_stats, ctrl_stats in kde_data:
81 |         LOGGER.info(
82 |             "Plotting {} modified base statistics densities".format(samp_lab)
83 |         )
84 |         plt.figure(figsize=(8, 5))
85 |         try:
86 |             sns.kdeplot(
87 |                 mod_stats,
88 |                 shade=True,
89 |                 bw_adjust=MOD_BANDWIDTH,
90 |                 gridsize=GRIDSIZE,
91 |                 label="Yes",
92 |             )
93 |             sns.kdeplot(
94 |                 ctrl_stats,
95 |                 shade=True,
96 |                 bw_adjust=MOD_BANDWIDTH,
97 |                 gridsize=GRIDSIZE,
98 |                 label="No",
99 |             )
100 |         except AttributeError:
101 |             sns.kdeplot(
102 |                 mod_stats,
103 |                 shade=True,
104 |                 bw=MOD_BANDWIDTH2,
105 |                 gridsize=GRIDSIZE,
106 |                 label="Yes",
107 |             )
108 |             sns.kdeplot(
109 |                 ctrl_stats,
110 |                 shade=True,
111 |                 bw=MOD_BANDWIDTH2,
112 |                 gridsize=GRIDSIZE,
113 |                 label="No",
114 |             )
115 |         plt.legend(prop={"size": 16}, title="Is Modified")
116 |         plt.xlabel(
117 |             "Log Likelihood Ratio\nMore Likely Modified <--> "
118 |             + "More Likely Canonical"
119 |         )
120 |         plt.ylabel("Density")
121 |         plt.title(samp_lab)
122 |         pdf_fp.savefig(bbox_inches="tight")
123 |         plt.close()
124 | 
125 | 
126 | def compute_mod_sites_stats(
127 |     mod_stats, ctrl_stats, balance_classes, mod_base, samp_lab, vs_lab, out_fp
128 | ):
129 |     if balance_classes:
130 |         # randomly downsample the sample with more observations
131 |         if mod_stats.shape[0] > ctrl_stats.shape[0]:
132 |             mod_stats = np.random.choice(
133 |                 mod_stats, ctrl_stats.shape[0], replace=False
134 |             )
135 |         elif mod_stats.shape[0] < ctrl_stats.shape[0]:
136 |             ctrl_stats = np.random.choice(
137 |                 ctrl_stats, mod_stats.shape[0], replace=False
138 |             )
139 | 
140 |     is_can = np.repeat([0, 1], [mod_stats.shape[0], ctrl_stats.shape[0]])
141 |     all_stats = np.concatenate([mod_stats, ctrl_stats])
142 |     if any(np.isnan(all_stats)):
143 |         LOGGER.warning(
144 |             ("Encountered {} NaN modified base scores.").format(
145 |                 sum(np.isnan(all_stats))
146 |             )
147 |         )
148 |         is_can, all_stats = is_can[~np.isnan(all_stats)], all_stats[~np.isnan(all_stats)]
149 |     if all_stats.shape[0] == 0:
150 |         raise mh.MegaError("All modified base scores are NaN")
151 |     inf_idx = np.isposinf(all_stats)
152 |     if any(inf_idx):
153 |         all_stats[inf_idx] = np.max(all_stats[~inf_idx])
154 |     neginf_idx = np.isneginf(all_stats)
155 |     if any(neginf_idx):
156 |         all_stats[neginf_idx] = np.min(all_stats[~neginf_idx])
157 |     LOGGER.info(
158 |         "Computing PR/ROC for {} from {} at {}".format(
159 |             mod_base, samp_lab, vs_lab
160 |         )
161 |     )
162 |     # compute ROC and precision-recall curves
163 |     precision, recall, thresh = precision_recall_curve(is_can, all_stats)
164 |     prec_recall_sum = precision + recall
165 |     valid_idx = np.where(prec_recall_sum > 0)
166 |     all_f1 = (
167 |         2
168 |         * precision[valid_idx]
169 |         * recall[valid_idx]
170 |         / prec_recall_sum[valid_idx]
171 |     )
172 |     optim_f1_idx = np.argmax(all_f1)
173 |     optim_f1 = all_f1[optim_f1_idx]
174 |     optim_thresh = thresh[optim_f1_idx]
175 |     avg_prcn = average_precision_score(is_can, all_stats)
176 | 
177 |     fpr, tpr, _ = roc_curve(is_can, all_stats)
178 |     roc_auc = auc(fpr, tpr)
179 | 
180 |     out_fp.write(
181 |         MOD_VAL_METRICS_TMPLT.format(
182 |             optim_f1,
183 |             optim_thresh,
184 |             avg_prcn,
185 |             roc_auc,
186 |             mod_stats.shape[0],
187 |             ctrl_stats.shape[0],
188 |             mod_base,
189 |             samp_lab,
190 |             vs_lab,
191 |         )
192 |     )
193 |     pr_data = (
194 |         "{} at {} mAP={:0.2f}".format(samp_lab, vs_lab, avg_prcn),
195 |         precision,
196 |         recall,
197 |     )
198 |     roc_data = (
199 |         "{} at {} AUC={:0.2f}".format(samp_lab, vs_lab, roc_auc),
200 |         fpr,
201 |         tpr,
202 |     )
203 |     kde_data = (
204 |         "{} from {} at {}".format(mod_base, samp_lab, vs_lab),
205 |         mod_stats,
206 |         ctrl_stats,
207 |     )
208 | 
209 |     return pr_data, roc_data, kde_data
210 | 
211 | 
212 | if __name__ == "__main__":
213 |     sys.stderr.write("This is a module. See commands with `megalodon -h`")
214 |     sys.exit(1)
215 | 
--------------------------------------------------------------------------------
/megalodon_extras/__main__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import argparse
3 | from importlib import import_module
4 | 
5 | from megalodon import __version__
6 | from megalodon_extras import _extras_parsers as ep
7 | 
8 | 
9 | NESTED_COMMANDS = [
10 |     (
11 |         ep.GRP_AGG,
12 |         "Aggregate per-read modified base and/or variant statistics",
13 |         [
14 |             (ep.CMD_AGG_RUN, "Run aggregation"),
15 |         ],
16 |     ),
17 |     (
18 |         ep.GRP_CALIB,
19 |         "Calibrate model results with ground truth modified bases or variants",
20 |         [
21 |             (ep.CMD_CALIB_MODS, "Calibrate modified base statistics"),
22 |             (ep.CMD_CALIB_VARS, "Calibrate sequence variant statistics"),
23 |             (ep.CMD_CALIB_GEN_MODS, "Generate stats for mod calibration"),
24 |             (
25 |                 ep.CMD_CALIB_GEN_MODS_MSF,
26 |                 "Generate stats for mod calibration from mapped signal file",
27 |             ),
28 |             (ep.CMD_CALIB_GEN_VARS, "Generate stats for variant calibration"),
29 |             (ep.CMD_CALIB_MERGE_MODS, "Merge modified base calibration files"),
30 |             (ep.CMD_CALIB_MERGE_MODS_STATS, "Merge mod calibration stat files"),
31 |         ],
32 |     ),
33 |     (
34 |         ep.GRP_MERGE,
35 |         "Merge per-read databases or aggregated files",
36 |         [
37 |             (ep.CMD_MERGE_MODS, "Merge per-read modified base database"),
38 |             (
39 |                 ep.CMD_MERGE_AGG_MODS,
40 |                 "Merge aggregated modified base bedmethyl files",
41 |             ),
42 |             (ep.CMD_MERGE_VARS, "Merge per-read sequence variants database"),
43 |         ],
44 |     ),
45 |     (
46 |         ep.GRP_MODS,
47 |         "Miscellaneous modified base operations",
48 |         [
49 |             (ep.CMD_MODS_ALPHABET, "Print the alphabet for a chosen model"),
50 |             (
51 |                 ep.CMD_MODS_EST_THRESH,
52 |                 "Estimate optimal global modified base threshold for sequence markup",
53 |             ),
54 |             (
55 |                 ep.CMD_MODS_UPDATE_DB,
56 |                 "Update modified base database from older versions of Megalodon",
57 |             ),
58 |             (
59 |                 ep.CMD_MODS_GT,
60 |                 "Create ground truth modified base file from bedmethyl files",
61 |             ),
62 |             (ep.CMD_MODS_MOTIF, "Create BED file of motif sites"),
63 |             (
64 |                 ep.CMD_MODS_PER_SITE,
65 |                 "Extract per-site modified base thresholds for signal mapping "
66 |                 + "sequence markup",
67 |             ),
68 |             (ep.CMD_MODS_INDEX, "Create per-read modified base database index"),
69 |             (ep.CMD_MODS_SPLIT, "Split modified base database by motif"),
70 |         ],
71 |     ),
72 |     (
73 |         ep.GRP_PHASE,
74 |         "Phase variants",
75 |         [
76 |             (
77 |                 ep.CMD_PHASE_FILT_WHATSHAP,
78 |                 "Filter variants not compatible with whatshap",
79 |             ),
80 |             (
81 |                 ep.CMD_PHASE_GET_HAP_READS,
82 |                 "Extract read ids from haplotypes determined by whatshap",
83 |             ),
84 |             (ep.CMD_PHASE_MERGE_HAP, "Merge variants from haploid calls"),
85 |         ],
86 |     ),
87 |     (
88 |         ep.GRP_TXT,
89 |         "Output per-read text files",
90 |         [
91 |             (
92 |                 ep.CMD_TXT_MODS,
93 |                 "Output per-read modified base statistics text file",
94 |             ),
95 |             (
96 |                 ep.CMD_TXT_VARS,
97 |                 "Output per-read sequence variant statistics text file",
98 |             ),
99 |         ],
100 |     ),
101 |     (
102 |         ep.GRP_VAL,
103 |         "Validate per-read mapping and modified base results",
104 |         [
105 |             (
106 |                 ep.CMD_VAL_RES,
107 |                 "Validate per-read mappings and modified bases (if available)",
108 |             ),
109 |             (ep.CMD_VAL_AGG_MODS, "Validate aggregated modified base results"),
110 |             (
111 |                 ep.CMD_VAL_COMP_MODS,
112 |                 "Compare aggregated modified base results (bedMethyl files)",
113 |             ),
114 |             (
115 |                 ep.CMD_VAL_MODS_CALIB,
116 |                 "Validate per-read modified bases from calibration file",
117 |             ),
118 |         ],
119 |     ),
120 |     (
121 |         ep.GRP_VARS,
122 |         "Miscellaneous sequence variant operations",
123 |         [
124 |             (ep.CMD_VAR_ATOM, "Atomize variants for faster processing"),
125 |             (ep.CMD_VAR_RESOLVE, "Resolve potentially conflicting variants"),
126 |             (
127 |                 ep.CMD_VAR_HET_FACTOR,
128 |                 "Estimate optimal heterozygous factors for diploid variant calling",
129 |             ),
130 |             # TODO variant database API does not allow opening for writing once
131 |             # database exists.
132 |             (ep.CMD_VAR_INDEX, "***** Stub for future implementation *****"),
133 |         ],
134 |     ),
135 | ]
136 | 
137 | 
138 | class SubcommandHelpFormatter(argparse.RawDescriptionHelpFormatter):
139 |     def _format_action(self, action):
140 |         parts = super(SubcommandHelpFormatter, self)._format_action(action)
141 |         if action.nargs == argparse.PARSER:
142 |             parts = "\n".join(parts.split("\n")[1:])
143 |         return parts
144 | 
145 | 
146 | def _main():
147 |     """The main routine."""
148 |     desc = (
149 |         "Megalodon extras command groups (additional help available "
150 |         + "within each command group):\n"
151 |         + "\n".join(
152 |             [
153 |                 "\t{0: <25}{1}".format(grp_name, grp_help)
154 |                 for grp_name, grp_help, _ in NESTED_COMMANDS
155 |             ]
156 |         )
157 |     )
158 |     parser = argparse.ArgumentParser(
159 |         prog="megalodon_extras",
160 |         description="********** Megalodon Extras *********\n\n"
161 |         + "Commands to perform operations related to the main Megalodon command "
162 |         + "including aggregation, variant phasing, validation, and more.\n\n"
163 |         + desc,
164 |         formatter_class=SubcommandHelpFormatter,
165 |     )
166 |     parser.add_argument(
167 |         "-v",
168 |         "--version",
169 |         action="version",
170 |         version="Megalodon version: {}".format(__version__),
171 |         help="Show Megalodon version and exit.",
172 |     )
173 | 
174 |     # add megalodon_extras command groups
175 |     # a service_command is a grouping of other commands
176 |     # an action_command is an executable command with detailed argument help
177 |     service_subparsers = parser.add_subparsers(dest="service_command")
178 |     for grp_name, grp_help, grp_sub_cmds in NESTED_COMMANDS:
179 |         grp_desc = "\n".join(
180 |             [
181 |                 "\t{0: <30}{1}".format(cmd_name, cmd_help)
182 |                 for cmd_name, cmd_help in grp_sub_cmds
183 |             ]
184 |         )
185 |         grp_parser = service_subparsers.add_parser(
186 |             grp_name,
187 |             formatter_class=SubcommandHelpFormatter,
188 |             description=grp_desc,
189 |         )
190 |         grp_subparser = grp_parser.add_subparsers(
191 |             title=grp_name, dest="action_command"
192 |         )
193 |         for cmd_name, cmd_help in grp_sub_cmds:
194 |             # add each action parser to this service parser group
195 |             grp_subparser.add_parser(
196 |                 cmd_name,
197 |                 add_help=False,
198 |                 parents=[
199 |                     ep.PARSERS[grp_name][cmd_name](),
200 |                 ],
201 |             )
202 | 
203 |     args = parser.parse_args()
204 |     # if no service parser was provided print help and return
205 |     if args.service_command is None:
206 |         sys.stderr.write(
207 |             "********** Please provide a megalodon_extras command group "
208 |             + "for further help. **********\n"
209 |         )
210 |         parser.print_help()
211 |         sys.exit(2)
212 | 
213 |     # if no action parser is provided print that command group's help
214 |     if args.action_command is None:
215 |         sys.stderr.write(
216 |             "********** Please provide a command for further help. "
" 217 | + "**********\n" 218 | ) 219 | parser.parse_args([args.service_command, "-h"]) 220 | 221 | module = import_module( 222 | ".{}_{}".format(args.service_command, args.action_command), 223 | "megalodon_extras", 224 | ) 225 | module._main(args) 226 | 227 | 228 | if __name__ == "__main__": 229 | _main() 230 | -------------------------------------------------------------------------------- /docs/extras_calibrate.rst: -------------------------------------------------------------------------------- 1 | ****************************** 2 | ``megalodon_extras calibrate`` 3 | ****************************** 4 | 5 | The ``megalodon_extras calibrate`` command group contains commands to produce Megalodon modified base and sequence variant calibration files for basecalling models. 6 | When a new basecalling model is trained a calibration file must be produced in order to obtain the most accurate aggregated modified base and sequence variant calls. 7 | Without a calibration file the ``--disable-mod-calibration`` or ``--disable-variant-calibration`` flags may be set, but aggregated results will likely be much less accurate. 8 | 9 | Calibration file estimation is broken down into two steps: 10 | 11 | 1. Ground truth statistic generation (``megalodon_extras calibrate generate_modified_base_stats`` and ``megalodon_extras calibrate generate_variant_stats`` commands) 12 | 13 | - This step processes completed Megalodon runs to extract ground truth positive and negative statistics. 14 | 2. Calibration estimation (``megalodon_extras calibrate modified_bases`` and ``megalodon_extras calibrate variants`` commands) 15 | 16 | - This step estimates the emperical probability of a modified base or sequence variant given the ground truth statistics from the first step. 17 | 18 | Note that the plots produced by the calibration procedure (with examples shown below) are stored in the GitHub repository for each released model (``megalodon/model_data/``). 19 | 20 | ----------------------------------------------------------- 21 | ``megalodon_extras calibrate generate_modified_base_stats`` 22 | ----------------------------------------------------------- 23 | 24 | Generate ground truth modified base statistics. 25 | 26 | The ground truth modified base composition for a run can be specified in two ways: 27 | 28 | 1. Control Megalodon results 29 | 30 | - Specify ``--control-megalodon-results-dir`` 31 | - Using this option assumes that all modified base statistics in ``--control-megalodon-results-dir`` represent canonical bases and all statistics in the main Megalodon results directory represent modified bases. 32 | 33 | - This respects the ``--mod-motif`` options specified in the main Megalodon commands. 34 | 2. Ground truth reference locations 35 | 36 | - Specify ``--ground-truth-data`` 37 | - See the ``megalodon_extras modified_bases create_ground_truth`` command for help producing a ground truth CSV file. 38 | 39 | ---------------------------------------------------------- 40 | ``megalodon_extras calibrate generate_mod_stats_from_msf`` 41 | ---------------------------------------------------------- 42 | 43 | In some situations ground truth control samples or reference locations are not available for calibration. 44 | The ``generate_mod_stats_from_msf`` sub-command uses the mapped signal file (``msf``) used for Taiyaki model training to produce Megalodon calibration statistics. 45 | This command uses the ground truth sequence including modified base annotation in order to extract modified base scores as computed in Megalodon. 
49 | 
50 | -----------------------------------------------------
51 | ``megalodon_extras calibrate generate_variant_stats``
52 | -----------------------------------------------------
53 | 
54 | Generate ground truth sequence variant statistics.
55 | 
56 | This method produces ground truth sequence variant statistics by proposing alternatives to a reference sequence.
57 | It is thus assumed that the mapping location for each read contains the correct reference sequence.
58 | It is advised to select a set of reads with high quality mappings to a high quality reference for the sample.
59 | 
60 | This command performs basecalling and read mapping as in the main Megalodon command.
61 | Variants are then randomly proposed and scored for a random set of sites across each read.
62 | "Correct" variants are not produced by default due to the computational overhead required to map full reads to the "incorrect" reference.
63 | This functionality is provided on an experimental basis via the ``--compute-false-reference-scores`` flag, but these scores are not currently accepted by the ``megalodon_extras calibrate variants`` command.
64 | 
65 | ---------------------------------------------
66 | ``megalodon_extras calibrate modified_bases``
67 | ---------------------------------------------
68 | 
69 | Estimate modified base calibration file.
70 | 
71 | Given a set of ground truth modified bases and raw Megalodon called statistics, compute empirical probabilities for a modified base.
72 | The ground truth statistics are generated by the ``megalodon_extras calibrate generate_modified_base_stats`` command, described above, and supplied via the ``--ground-truth-llrs`` argument.
73 | This command computes the empirical log-likelihood ratio over windows of observed modified base scores.
74 | This process involves several steps to ensure certain characteristics of the generating distributions (e.g. monotonicity).
75 | A separate calibration will be computed and stored in the output calibration file for each modified base found in the ground truth file.
76 | 
77 | These steps are visualized in the example plot below, which can be produced for any new calibration file by providing the ``--out-pdf`` argument.
78 | The top facet of this plot shows the distribution of theoretical modified base log-likelihood ratios produced by the basecalling model.
79 | These distributions are smoothed such that they are monotonic from either extreme to the peak of the densities.
80 | The middle facet shows the inferred empirical probability that a base is modified given the theoretical modified base score produced by the basecaller.
81 | The final facet shows the same probabilities, but in log-likelihood space.
82 | A constraint is enforced on this function such that the value is monotonically increasing (red - before monotonic constraint; yellow - after monotonic constraint).
83 | The three vertical lines indicate common threshold values for modified base aggregation.
84 | Note that the fraction of data ignored at each threshold level is annotated in the figure legend.
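A heavily simplified sketch of the empirical calibration idea is shown below, using plain histogram densities and a monotonicity constraint; Megalodon's actual smoothing and windowing differ.

::

    import numpy as np

    def empirical_llr_calibration(mod_llrs, can_llrs, n_bins=200):
        """Map raw LLR bins to empirical LLRs, enforcing monotonicity."""
        lims = (
            min(mod_llrs.min(), can_llrs.min()),
            max(mod_llrs.max(), can_llrs.max()),
        )
        mod_dens, edges = np.histogram(
            mod_llrs, bins=n_bins, range=lims, density=True
        )
        can_dens, _ = np.histogram(
            can_llrs, bins=n_bins, range=lims, density=True
        )
        eps = 1e-10  # avoid division by zero in empty bins
        emp_llr = np.log((can_dens + eps) / (mod_dens + eps))
        # enforce a monotonically increasing calibrated LLR
        emp_llr = np.maximum.accumulate(emp_llr)
        bin_centers = (edges[:-1] + edges[1:]) / 2
        return bin_centers, emp_llr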
85 | 
86 | ----
87 | 
88 | .. figure:: _images/modified_base_calibration.png
89 |    :align: center
90 |    :width: 600
91 | 
92 |    Visualization of modified base calibration method.
93 | 
94 | ----
95 | 
96 | ---------------------------------------------------
97 | ``megalodon_extras calibrate merge_modified_bases``
98 | ---------------------------------------------------
99 | 
100 | Merge modified base calibration files.
101 | 
102 | In some cases the ground truth for one modified base may come from a different source than that for another modified base.
103 | In this case calibration files can be computed separately and combined with this command.
104 | If multiple calibration files contain calibration for the same modified base, the calibration from the file listed first will be stored.
105 | 
106 | ---------------------------------------------------------
107 | ``megalodon_extras calibrate merge_modified_bases_stats``
108 | ---------------------------------------------------------
109 | 
110 | Merge modified base calibration statistics files.
111 | 
112 | In some cases the ground truth statistics may be extracted from several sources (unmodified and modified samples) and merged afterwards.
113 | This command enables this workflow.
114 | 
115 | ---------------------------------------
116 | ``megalodon_extras calibrate variants``
117 | ---------------------------------------
118 | 
119 | Estimate sequence variant calibration file.
120 | 
121 | Given a set of ground truth sequence variant statistics, via the ``--ground-truth-llrs`` argument, compute empirical probabilities of a sequence variant.
122 | This command computes the empirical log-likelihood ratio over windows of observed sequence variant scores.
123 | This process involves several steps to ensure certain characteristics of the generating distributions.
124 | This procedure is largely the same as the modified base calibration step, but the variants are grouped into categories based on the type of ground truth sequence variant.
125 | Note that the vertical bars are not present in these plots as sequence variant per-read statistics are combined in a probabilistic fashion and not based on a hard threshold.
126 | 
127 | ----
128 | 
129 | .. figure:: _images/sequence_variant_calibration.png
130 |    :align: center
131 |    :width: 600
132 | 
133 |    Visualization of sequence variant calibration method.
134 | 
135 | ----
136 | 
--------------------------------------------------------------------------------
/docs/file_formats.rst:
--------------------------------------------------------------------------------
1 | ************
2 | File Formats
3 | ************
4 | 
5 | This page describes the output file formats produced by ``megalodon``.
6 | Note that all outputs are unsorted (by reference position) unless specified in the output filename.
7 | 
8 | ------------
9 | Base Calling
10 | ------------
11 | 
12 | Basecalling produces either FASTQ or FASTA formats.
13 | Basecalls will be output into the ``basecalls.fastq`` or ``basecalls.fasta`` file within the ``--output-directory``.
14 | 
15 | As of version 2.2, basecall anchored modified base calls (``mod_basecalls``) are output in an unmapped BAM file via the ``Mm`` and ``Ml`` tags `described by hts-specs here `_.
16 | 
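These tags can be inspected with standard tools; for example, a minimal ``pysam`` sketch (the file path is illustrative, and newer writers use the renamed ``MM``/``ML`` tags):

::

    import pysam

    # the mod_basecalls BAM is unmapped, so skip the reference check
    with pysam.AlignmentFile("mod_basecalls.bam", check_sq=False) as bam:
        for read in bam:
            mm = read.get_tag("Mm") if read.has_tag("Mm") else read.get_tag("MM")
            ml = read.get_tag("Ml") if read.has_tag("Ml") else read.get_tag("ML")
            # Mm encodes modified positions as base offsets; Ml holds probabilities
            print(read.query_name, mm, list(ml)[:5])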
17 | -------
18 | Mapping
19 | -------
20 | 
21 | Mapped reads can be output in SAM, BAM or CRAM formats.
22 | Mappings will be output into the ``mappings.sam``, ``mappings.bam``, or ``mappings.cram`` file within the ``--output-directory``.
23 | 
24 | ~~~~~~~~~~~~~~~
25 | Mapping Summary
26 | ~~~~~~~~~~~~~~~
27 | 
28 | When ``--outputs mappings`` is requested the ``mappings.summary.txt`` file is produced.
29 | This file contains the following fields:
30 | 
31 | #. read_id
32 | 
33 |    - Unique read identifier (from FAST5)
34 | #. pct_identity
35 | 
36 |    - Mapping/reference percent identity (computed as ``100 * num_match / num_align``)
37 | #. num_align
38 | 
39 |    - Length of full alignment (``num_match + num_mismatch + num_ins + num_del``)
40 | #. num_match
41 | 
42 |    - Number of basecalls aligned to a matching reference base
43 | #. num_del
44 | 
45 |    - Number of deleted reference bases implied by the alignment
46 | #. num_ins
47 | 
48 |    - Number of inserted reference bases implied by the alignment
49 | #. read_pct_coverage
50 | 
51 |    - Percentage of read basecalls included in reference alignment
52 | #. chrom
53 | 
54 |    - Reference contig name for mapping
55 | #. strand
56 | 
57 |    - Strand for mapping
58 | #. start
59 | 
60 |    - Reference coordinate for start of mapping (0-based closed interval coordinate)
61 | #. end
62 | 
63 |    - Reference coordinate for end of mapping (0-based open interval coordinate)
64 | #. query_start
65 | 
66 |    - Basecall coordinate for start of mapping (0-based closed interval coordinate)
67 | #. query_end
68 | 
69 |    - Basecall coordinate for end of mapping (0-based open interval coordinate)
70 | #. map_sig_start
71 | 
72 |    - Raw signal coordinate for start of mapping (0-based closed interval coordinate). Note that this coordinate is as stored in the FAST5 file, so for RNA reads (5' to 3' read direction) the start coordinate will be greater than the end coordinate.
73 | #. map_sig_end
74 | 
75 |    - Raw signal coordinate for end of mapping (0-based open interval coordinate)
76 | #. sig_len
77 | 
78 |    - Length of signal for complete read
79 | #. map_num
80 | 
81 |    - Mapping number to distinguish multiple mappings from the same read. Should always be ``0`` when ``--allow-supplementary-alignments`` is not set.
82 | 
83 | ~~~~~~~~~~~~~~~~~~~~~
84 | Modified Base Mapping
85 | ~~~~~~~~~~~~~~~~~~~~~
86 | 
87 | As of version 2.2, the default output for the ``mod_mappings`` output type will be a single BAM file with modified base probabilities stored via the ``Mm`` and ``Ml`` tags, as in ``mod_basecalls`` above.
88 | This format can be output in SAM, BAM or CRAM format as specified by the ``--mappings-format`` argument (which also applies to the ``mappings`` and ``mod_basecalls`` outputs).
89 | 
90 | In order to obtain ``mod_mappings`` in the same format as Megalodon versions < 2.2, use the ``--mod-map-emulate-bisulfite`` flag.
91 | This option will output a file for each modified base represented in the basecalling model.
92 | The mapped reads in this output represent only the information about modified bases contained within each read.
93 | Each read includes the mapped reference bases with only the called modified bases annotated.
94 | The quality score for each called base (whether called as modified or canonical) represents the probability of a modified status and not the canonical base probability (as specified by the SAM format).
95 | Bases without a proposed modified base will contain a quality score of ``40``.
96 | 
97 | In addition, the ``--mod-map-base-conv`` option is provided to modulate the bases output by this format.
98 | This option is useful since the BAM and CRAM formats do not support modified bases and will convert all alternative bases to ``N`` for storage.
99 | For example, to mimic bisulfite output use ``--mod-map-base-conv C T --mod-map-base-conv Z C``.
100 | This can then be visualized by a genome browser as with standard bisulfite data.
101 | 
102 | ----
103 | 
104 | .. figure:: _images/mod_mapping_viz.png
105 |    :align: center
106 |    :width: 600
107 | 
108 |    Genome browser visualization of Megalodon mod_mappings output.
109 | 
110 | ----
111 | 
112 | ~~~~~~~~~~~~~~~
113 | Variant Mapping
114 | ~~~~~~~~~~~~~~~
115 | 
116 | In addition to standard mapping files, megalodon includes a special mapping-style output with specific relevance to the variant calling pipeline.
117 | This format can be output as a SAM, BAM or CRAM file as with the standard mapping format (as specified by the ``--mappings-format`` argument).
118 | The mapped reads in this output represent only the information about proposed variants contained within each read.
119 | Each read includes the mapped reference bases with only the called variants annotated.
120 | The score for each call is encoded in the base quality scores for each read.
121 | Bases without a proposed variant will contain a quality score of ``40``.
122 | Note that storage of insertion probabilities is not supported by the SAM/BAM format, so these scores are lost in this format.
123 | This output is useful for 1) producing more accurate variant phasing and read haplotagging via whatshap and 2) visualizing per-read variant calls in a genome browser.
124 | 
125 | -----------------------
126 | Per-read Modified Bases
127 | -----------------------
128 | 
129 | ~~~~~~~~
130 | Database
131 | ~~~~~~~~
132 | 
133 | The primary output for per-read modified base results is an `sqlite database `_.
134 | This database contains an indexed table with per-read, per-position, modified base scores, as well as auxiliary tables with read, modification type and reference chromosome/record information.
135 | The read table (``read``) contains the read UUID.
136 | The modification type table (``mod``) contains the single letter modified base code, the modified base long name and the associated canonical base.
137 | 
138 | As of version 2.2, the ``pos`` table has been dropped from the modified base schema.
139 | In place of the ``pos`` table, the ``chrm`` table contains the name and length of each chromosome/record in the reference sequence.
140 | The ``score_pos`` field in the ``data`` table then contains an integer encoding of the ``(chrm, pos, strand)`` tuple (see the ``megalodon.mods.ModsDb.get_pos_dbid`` and ``megalodon.mods.ModsDb.get_pos`` functions).
141 | This allows more efficient access to position information without requiring additional interaction with the database.
142 | 
143 | The ``data`` table then contains the links between these tables along with the per-read log probability for each modified base at each called reference position in the ``score`` column.
144 | This table is indexed at the end of the run by the ``score_pos`` field such that iteration over the table (via ``megalodon.mods.ModsDb.iter_pos_scores``) occurs in reference sorted order.
145 | 
146 | This database may be accessed via the ``megalodon.mods.ModsDb`` object.
147 | More documentation on the usage of the ``megalodon.mods.ModsDb`` interface will be added in a future release.
148 | 
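In the meantime, the schema can be explored directly with Python's built-in ``sqlite3`` module; the database path below is illustrative and table layouts may differ between Megalodon versions:

::

    import sqlite3

    conn = sqlite3.connect("megalodon_results/per_read_modified_base_calls.db")
    # list the tables described above along with their row counts
    for (name,) in conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table'"
    ).fetchall():
        n_rows = conn.execute(
            "SELECT COUNT(*) FROM {}".format(name)
        ).fetchone()[0]
        print(name, n_rows)
    conn.close()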
149 | ~~~~~~~~~~~~~
150 | Tab-delimited
151 | ~~~~~~~~~~~~~
152 | 
153 | Modified base results are also available via tab-delimited text output.
154 | This output can be requested via the ``--write-mods-text`` flag or obtained after a run via the ``megalodon_extras per_read_text modified_bases`` command.
155 | This output contains the following fields: ``read_id``, ``chrm``, ``strand``, ``pos``, ``mod_log_prob``, ``can_log_prob``, and ``mod_base``.
156 | 
157 | -------------------------
158 | Aggregated Modified Bases
159 | -------------------------
160 | 
161 | The default aggregated modified base output is the bedMethyl format (`description here `_).
162 | Alternative formats are `wiggle `_ (variableStep) and VCF (treating the modified base as if it were a sequence variant).
163 | 
164 | --------------------------
165 | Per-read Sequence Variants
166 | --------------------------
167 | 
168 | As with the modified base results, the primary output for per-read sequence variant results is an `sqlite database `_.
169 | This database contains an indexed table with per-read, per-position, variant scores, as well as auxiliary tables with read, reference location and alternative allele information.
170 | 
171 | The reference location table (``loc``) contains the mapped 0-based position, strand (1=forward, -1=reverse) and chromosome (via a final ``chrm`` table which contains the chromosome text).
172 | The ``loc`` table also contains the location for the start and end of the tested positions (applicable for insertions/deletions).
173 | For example, insertions generally require a context base for downstream processing, but within megalodon only the inserted position is considered (without context).
174 | Each reference location is linked to the IDs associated with this location from the input variants file.
175 | Finally the reference sequence for the location is included in this table.
176 | In the related ``alt`` table, each alternative sequence is stored.
177 | Links between alternative sequences and reference locations are made via the main ``data`` table.
178 | 
179 | The ``read`` table contains the read UUID as well as the mapped strand for each read.
180 | 
181 | ----------------------------
182 | Aggregated Sequence Variants
183 | ----------------------------
184 | 
185 | Sequence variant calls are output in standard VCF format (version 4.1).
186 | The sample format field includes the following standard VCF fields: ``gt``, ``gq``, ``gp``, ``gl``, and ``pl``.
187 | In addition the non-standard ``log_probs`` field, containing the per-read contributions to the variant call, can be added to the VCF file by setting the ``--write-vcf-log-probs`` flag.
188 | 
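Such a VCF can be read with ``pysam``; a minimal sketch (the output path is illustrative, and a single-sample VCF is assumed):

::

    import pysam

    with pysam.VariantFile("megalodon_results/variants.vcf") as vcf:
        for rec in vcf:
            # one sample per Megalodon run is assumed; fields as described above
            sample = next(iter(rec.samples.values()))
            print(rec.chrom, rec.pos, rec.ref, rec.alts, sample["GT"], sample["GQ"])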
--------------------------------------------------------------------------------
/docs/modbase_training.rst:
--------------------------------------------------------------------------------
1 | **************************************
2 | Megalodon Modified Base Model Training
3 | **************************************
4 | 
5 | This page describes the process to generate modified base training data and train a basecalling model capable of also detecting modified bases.
6 | The options provided by Megalodon for modified base training data annotation are each linked to a specific sample type.
7 | If the provided options do not perform sufficiently for a particular sample type, please open an issue on the `Megalodon issues page `_ with details for the sample type and intended training procedure.
8 | 
9 | Currently two sample types are supported for modified base training data markup:
10 | 
11 | 1. Modified base in known sequence context (e.g. bacterial methylation)
12 | 2. Native (fractionally modified) sample with existing modified base basecalling model
13 | 
14 | Given the first sample above, a model will never be presented with a modified and canonical base in close proximity (within the same training chunk).
15 | Thus the second sample type is often required to produce a final model with sufficient performance across biologically relevant samples.
16 | 
17 | Highly accurate markup of the native sample is imperative to achieving the best performing modified base basecalling model.
18 | In version 2.3, an adaptation was added to allow for markup informed by a reference methylation ground truth (fraction of modified reads at each reference site).
19 | This ground truth can come from a number of sources and technologies.
20 | Find more details below on how to use this method.
21 | 
22 | ------------------------------
23 | Modified Base in Known Context
24 | ------------------------------
25 | 
26 | Given a sample with all locations matching a particular sequence motif modified (e.g. bacterial methylation), the ``--ref-mods-all-motifs`` option can be specified.
27 | The ``--ref-mods-all-motifs`` argument takes ``4`` values.
28 | These values are:
29 | 
30 | 1. Single letter code for the modified base
31 | 2. Long name for the modified base
32 | 3. Sequence motif (using `IUPAC ambiguity codes `_)
33 | 4. Relative position of the modified base within the sequence motif (0-based coordinate)
34 | 
35 | The first two values are simply stored in the training data file.
36 | When used for model training, these values will be saved for annotation of output files where appropriate (e.g. Megalodon or Guppy output files).
37 | 
38 | Using this mode of modified base annotation, the specified basecalling model can be either a standard (canonical bases only) model or a modified base model.
39 | Thus this method of annotation can be specified for modifications with no previous modeling (as long as the current basecalls map to the provided reference).
40 | If the basecalling model specified is a modified base model, the single letter, long name, and corresponding canonical base attributes must be in agreement with the basecalling model.
41 | 
42 | As an example, a native E. coli K12 sample contains 5mC methylation at a single motif, ``CCWGG`` at the second position, and 6mA methylation at several motifs: ``GATC`` at the second position, ``AACNNNNNNGTGC`` at the second position, and ``GCACNNNNNNGTT`` at the third position (`source `_).
43 | In order to annotate all of these modifications in a training data set the following command would be run:
44 | 
45 | ::
46 | 
47 |     megalodon ecoli_k12_fast5s/ \
48 |         --outputs signal_mappings \
49 |         --reference ecoli_k12_reference.fa \
50 |         --devices 0 --processes 40 \
51 |         --ref-mods-all-motifs m 5mC CCWGG 1 \
52 |         --ref-mods-all-motifs a 6mA GATC 1 \
53 |         --ref-mods-all-motifs a 6mA AACNNNNNNGTGC 1 \
54 |         --ref-mods-all-motifs a 6mA GCACNNNNNNGTT 2
55 | 
56 | Note that the single letter codes, ``m`` and ``a``, can be set to any value desired by the user.
57 | It is recommended that the values follow `specifications found in hts-specs `_.
58 | These values will be stored in the trained model for outputs where appropriate.
59 | 
60 | ----------------------------------
61 | Bootstrap Modified Base Annotation
62 | ----------------------------------
63 | 
64 | Once a modified base model is trained (see above) and a calibration file computed (see below), further models can be trained by marking modified base training data with this model.
65 | 
66 | .. warning::
67 | 
68 |    Great care should be taken when training a modified base basecalling model, especially with this method.
69 |    The accuracy of reference modified base markup is strongly indicative of the final modified base detection performance for a trained model.
70 | 
71 | The following example assumes a trained model to detect 5mC (``m``) in ``CG`` sequence contexts (model specified in ``model_final.cfg``).
72 | In order to annotate 5mC sites in a modified base training data set (``signal_mappings``) using a modified base model, the following command would be run:
73 | 
74 | ::
75 | 
76 |     megalodon native_human_fast5s/ \
77 |         --outputs signal_mappings \
78 |         --reference reference.fa.mmi \
79 |         --devices 0 --processes 40 \
80 |         --mod-motif m CG 0 \
81 |         --ref-include-mods \
82 |         --guppy-config model_final.cfg
83 | 
84 | The ``--ref-mod-threshold`` argument is provided to adjust the annotation based on modeling results.
85 | By default the threshold to annotate a base as modified is a log likelihood ratio of ``0`` (i.e. the modified base is more likely than the canonical base based on empirically calibrated statistics).
86 | In some samples this value may not be optimal.
87 | The ``megalodon_extras modified_bases estimate_threshold`` command is provided for assistance in determining a reasonable value for this parameter.
88 | 
89 | -----------------------------------------------------
90 | Ground Truth Aided Bootstrap Modified Base Annotation
91 | -----------------------------------------------------
92 | 
93 | To further improve modified base training data markup, a reference anchored ground truth can be leveraged.
94 | This method sets the modified base markup threshold at each reference position, informed by the provided ground truth.
95 | This is similar to the ``--ref-mod-threshold`` argument, but that threshold is global across all reference positions.
96 | 
97 | The first step in this method is to call per-read modified bases on the native sample of interest.
98 | This sample should contain sufficient depth, such that the identified modified base threshold at each position will be robust.
99 | 50X coverage is a rough target for sufficient coverage with this method.
100 | 
101 | Given a completed Megalodon run (``megalodon_results`` with ``--outputs mappings per_read_mods``) and a ground truth bedmethyl file (``ground_truth_methylation.CG.bed``), the following commands will compute per-site modified base thresholds and identify low coverage sites.
102 | 
103 | ::
104 | 
105 |     # Compute per-site thresholds
106 |     megalodon_extras \
107 |         modified_bases per_site_thresholds \
108 |         megalodon_results \
109 |         ground_truth_methylation.CG.bed \
110 |         --strand-offset 1 \
111 |         --ground-truth-coverage-pdf gt_cov.CG.pdf \
112 |         --ground-truth-cov-min 50 \
113 |         --nanopore-cov-min 50 \
114 |         --out-blacklist-sites low_coverage_sites.CG.bed \
115 |         --out-per-site-mod-thresholds site_mod_thresholds.CG.bed
116 |     # sort low coverage sites for faster bedtools filtering
117 |     sort -S 25% --parallel=56 -T /tmp/ \
118 |         -k1,1V -k2,2n low_coverage_sites.CG.bed \
119 |         -o low_coverage_sites.CG.sorted.bed
120 |     # filter and sort first round mappings (% ident>90, read coverage>90%, length between 1000 and 20000)
121 |     awk '$2 > 90 && $7 > 90 && $3 - $6 > 1000 && $3 - $6 < 20000 {print $8"\t"$10"\t"$11"\t"$1"\t.\t"$9}' \
122 |         megalodon_results/mappings.summary.txt | \
123 |         sort -S 25% --parallel=56 -T /tmp/ -k1,1V -k2,2n > \
124 |         mappings.filtered.sorted.bed
125 |     intersectBed \
126 |         -a mappings.filtered.sorted.bed \
127 |         -b low_coverage_sites.CG.sorted.bed -s -sorted -v | \
128 |         awk '{print $4}' > train_read_ids.txt
129 | 
130 | Finally, the ground truth aided modified base markup training data set is produced with the following command.
131 | 
132 | ::
133 | 
134 |     megalodon \
135 |         native_human_fast5s/ \
136 |         --reference reference.fa.mmi \
137 |         --output-directory per_site_markup_mega_res \
138 |         --outputs per_read_mods signal_mappings \
139 |         --mod-motif m CG 0 \
140 |         --guppy-config model_final.cfg \
141 |         --processes 40 --devices 0 \
142 |         --ref-include-mods \
143 |         --mod-per-site-threshold site_mod_thresholds.CG.bed \
144 |         --read-ids-filename train_read_ids.txt
145 | 
146 | ----------------------------
147 | Modified Base Model Training
148 | ----------------------------
149 | 
150 | Given any of the above modified base annotated mapped signal files, a new modified base model can be trained with `Taiyaki `_.
151 | Below is an example command to train a modified base model from the data prepared above and convert the final model for use with Guppy or Megalodon.
152 | 
153 | ::
154 | 
155 |     train_flipflop.py ./taiyaki/models/mLstm_cat_mod_flipflop.py \
156 |         megalodon_results/signal_mappings.hdf5 --device 0
157 |     # dump model to json format for use by guppy
158 |     dump_json.py training/model_final.checkpoint \
159 |         --output model_final.jsn
160 | 
161 | The produced model should be referenced from a new Guppy config file.
162 | The easiest way to obtain this would be to copy and edit the closest existing Guppy config file in the ``data`` directory of Guppy.
163 | 
164 | ---------------------
165 | Megalodon Calibration
166 | ---------------------
167 | 
168 | In order to produce the most accurate aggregated modified base calls, Megalodon requires the computation of a calibration file.
169 | A ground truth sample containing known modified reference sites as well as known canonical base sites is required.
170 | This can be the same as the model training data.
171 | A modified base calibration file is created with the ``megalodon_extras calibrate generate_modified_base_stats`` and ``megalodon_extras calibrate modified_bases`` commands.
172 | Please see the `calibration documentation page `_ for more details about this process.
173 | 
--------------------------------------------------------------------------------