├── megalodon_extras
│   ├── __init__.py
│   ├── phase_variants_extract_haplotype_reads.py
│   ├── modified_bases_index_database.py
│   ├── variants_index_database.py
│   ├── modified_bases_describe_alphabet.py
│   ├── calibrate_merge_modified_bases_stats.py
│   ├── modified_bases_estimate_threshold.py
│   ├── aggregate_run.py
│   ├── validate_mod_bases_from_calibration.py
│   ├── modified_bases_create_motif_bed.py
│   ├── per_read_text_variants.py
│   ├── per_read_text_modified_bases.py
│   ├── merge_variants.py
│   ├── calibrate_merge_modified_bases.py
│   ├── phase_variants_whatshap_filter.py
│   ├── modified_bases_create_ground_truth.py
│   ├── variants_atomize.py
│   ├── modified_bases_update_database.py
│   ├── merge_aggregated_modified_bases.py
│   ├── calibrate_generate_modified_base_stats.py
│   ├── calibrate_modified_bases.py
│   ├── variants_heterozygous_factor.py
│   ├── modified_bases_split_by_motif.py
│   └── __main__.py
├── megalodon
│   ├── __init__.py
│   ├── model_data
│   │   ├── dna_r9.4.1_450bps_modbases_5mc_hac.cfg
│   │   ├── dna_r10.3_450bps_modbases_5mc_hac_prom.cfg
│   │   ├── dna_r9.4.1_450bps_modbases_5mc_hac_prom.cfg
│   │   ├── dna_r10.3_450bps_fast.cfg
│   │   │   ├── megalodon_variant_calibration.npz
│   │   │   └── megalodon_variant_calibration.pdf
│   │   ├── dna_r10.3_450bps_hac.cfg
│   │   │   ├── megalodon_variant_calibration.npz
│   │   │   └── megalodon_variant_calibration.pdf
│   │   ├── dna_r9.4.1_450bps_hac.cfg
│   │   │   ├── megalodon_variant_calibration.npz
│   │   │   └── megalodon_variant_calibration.pdf
│   │   ├── dna_r9.4.1_450bps_fast.cfg
│   │   │   ├── megalodon_variant_calibration.npz
│   │   │   └── megalodon_variant_calibration.pdf
│   │   ├── dna_r10.3_450bps_fast_prom.cfg
│   │   │   ├── megalodon_variant_calibration.npz
│   │   │   └── megalodon_variant_calibration.pdf
│   │   ├── dna_r10.3_450bps_hac_prom.cfg
│   │   │   ├── megalodon_variant_calibration.npz
│   │   │   └── megalodon_variant_calibration.pdf
│   │   ├── dna_r9.4.1_450bps_hac_prom.cfg
│   │   │   ├── megalodon_variant_calibration.npz
│   │   │   └── megalodon_variant_calibration.pdf
│   │   ├── dna_r9.4.1_450bps_fast_prom.cfg
│   │   │   ├── megalodon_variant_calibration.npz
│   │   │   └── megalodon_variant_calibration.pdf
│   │   ├── res_dna_r103_prom_modbases_5mC_v001.cfg
│   │   │   ├── megalodon_mod_calibration.npz
│   │   │   ├── megalodon_mod_calibration.pdf
│   │   │   ├── megalodon_variant_calibration.npz
│   │   │   └── megalodon_variant_calibration.pdf
│   │   ├── res_dna_r941_min_modbases_5mC_v001.cfg
│   │   │   ├── megalodon_mod_calibration.npz
│   │   │   ├── megalodon_mod_calibration.pdf
│   │   │   ├── megalodon_variant_calibration.npz
│   │   │   └── megalodon_variant_calibration.pdf
│   │   ├── res_dna_r941_prom_modbases_5mC_v001.cfg
│   │   │   ├── megalodon_mod_calibration.npz
│   │   │   ├── megalodon_mod_calibration.pdf
│   │   │   ├── megalodon_variant_calibration.npz
│   │   │   └── megalodon_variant_calibration.pdf
│   │   ├── res_dna_r941_min_modbases_5mC_5hmC_v001.cfg
│   │   │   ├── megalodon_mod_calibration.npz
│   │   │   └── megalodon_mod_calibration.pdf
│   │   ├── res_dna_r941_min_modbases_5mC_CpG_v001.cfg
│   │   │   ├── megalodon_mod_calibration.npz
│   │   │   ├── megalodon_mod_calibration.pdf
│   │   │   ├── megalodon_variant_calibration.npz
│   │   │   └── megalodon_variant_calibration.pdf
│   │   ├── res_dna_r941_prom_modbases_5mC_CpG_v001.cfg
│   │   │   ├── megalodon_mod_calibration.npz
│   │   │   ├── megalodon_mod_calibration.pdf
│   │   │   ├── megalodon_variant_calibration.npz
│   │   │   └── megalodon_variant_calibration.pdf
│   │   ├── dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg
│   │   │   ├── megalodon_mod_calibration.npz
│   │   │   ├── megalodon_mod_calibration.pdf
│   │   │   ├── megalodon_variant_calibration.npz
│   │   │   └── megalodon_variant_calibration.pdf
│   │   ├── res_dna_r941_min_modbases-all-context_v001.cfg
│   │   │   ├── megalodon_mod_calibration.npz
│   │   │   ├── megalodon_mod_calibration.pdf
│   │   │   ├── megalodon_variant_calibration.npz
│   │   │   └── megalodon_variant_calibration.pdf
│   │   ├── res_dna_r941_min_modbases_5mC_5hmC_CpG_v001.cfg
│   │   │   ├── megalodon_mod_calibration.npz
│   │   │   ├── megalodon_mod_calibration.pdf
│   │   │   ├── megalodon_variant_calibration.npz
│   │   │   └── megalodon_variant_calibration.pdf
│   │   └── dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac_prom.cfg
│   │       ├── megalodon_mod_calibration.npz
│   │       ├── megalodon_mod_calibration.pdf
│   │       ├── megalodon_variant_calibration.npz
│   │       └── megalodon_variant_calibration.pdf
│   ├── constrained_decoding.py
│   ├── logging.py
│   ├── megalodon_multiprocessing.py
│   ├── banding.py
│   ├── signal_mapping.py
│   └── validation.py
├── MANIFEST.in
├── ONT_logo_deprecate.png
├── docs
│   ├── _images
│   │   ├── mod_mapping_viz.png
│   │   ├── mod_agg_comp_cov.png
│   │   ├── mod_agg_comp_log.png
│   │   ├── mod_agg_comp_linear.png
│   │   ├── mod_agg_dist_results.png
│   │   ├── mod_pr_validate_results.png
│   │   ├── mapping_validate_results.png
│   │   ├── mod_dist_validate_results.png
│   │   ├── mod_roc_validate_results.png
│   │   ├── modified_base_calibration.png
│   │   ├── sequence_variant_calibration.png
│   │   └── whatshap_haplotagged_variant_viz.png
│   ├── extras_per_read_text.rst
│   ├── extras_aggregate.rst
│   ├── extras_merge.rst
│   ├── extras_phase_variants.rst
│   ├── computing_considerations.rst
│   ├── variant_phasing.rst
│   ├── model_training.rst
│   ├── extras_variants.rst
│   ├── common_arguments.rst
│   ├── index.rst
│   ├── extras_modified_bases.rst
│   ├── extras_validate.rst
│   ├── algorithm_details.rst
│   ├── extras_calibrate.rst
│   ├── file_formats.rst
│   └── modbase_training.rst
├── pyproject.toml
├── .gitlab-ci.yml
├── setup.py
├── .travis.yml
├── setup.cfg
└── test
    └── test_api.py
/megalodon_extras/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/megalodon/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "2.5.0"
2 |
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_modbases_5mc_hac.cfg:
--------------------------------------------------------------------------------
1 | res_dna_r941_min_modbases_5mC_v001.cfg/
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 | include LICENCE.txt
3 | include megalodon/model_data/*/*.npz
4 |
--------------------------------------------------------------------------------
/ONT_logo_deprecate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/ONT_logo_deprecate.png
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r10.3_450bps_modbases_5mc_hac_prom.cfg:
--------------------------------------------------------------------------------
1 | res_dna_r103_prom_modbases_5mC_v001.cfg
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_modbases_5mc_hac_prom.cfg:
--------------------------------------------------------------------------------
1 | res_dna_r941_prom_modbases_5mC_v001.cfg
--------------------------------------------------------------------------------
/docs/_images/mod_mapping_viz.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/mod_mapping_viz.png
--------------------------------------------------------------------------------
/docs/_images/mod_agg_comp_cov.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/mod_agg_comp_cov.png
--------------------------------------------------------------------------------
/docs/_images/mod_agg_comp_log.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/mod_agg_comp_log.png
--------------------------------------------------------------------------------
/docs/_images/mod_agg_comp_linear.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/mod_agg_comp_linear.png
--------------------------------------------------------------------------------
/docs/_images/mod_agg_dist_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/mod_agg_dist_results.png
--------------------------------------------------------------------------------
/docs/_images/mod_pr_validate_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/mod_pr_validate_results.png
--------------------------------------------------------------------------------
/docs/_images/mapping_validate_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/mapping_validate_results.png
--------------------------------------------------------------------------------
/docs/_images/mod_dist_validate_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/mod_dist_validate_results.png
--------------------------------------------------------------------------------
/docs/_images/mod_roc_validate_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/mod_roc_validate_results.png
--------------------------------------------------------------------------------
/docs/_images/modified_base_calibration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/modified_base_calibration.png
--------------------------------------------------------------------------------
/docs/_images/sequence_variant_calibration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/sequence_variant_calibration.png
--------------------------------------------------------------------------------
/docs/_images/whatshap_haplotagged_variant_viz.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/docs/_images/whatshap_haplotagged_variant_viz.png
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r10.3_450bps_fast.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r10.3_450bps_fast.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r10.3_450bps_fast.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r10.3_450bps_fast.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r10.3_450bps_hac.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r10.3_450bps_hac.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r10.3_450bps_hac.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r10.3_450bps_hac.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_hac.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_hac.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_hac.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_hac.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_fast.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_fast.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_fast.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_fast.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r10.3_450bps_fast_prom.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r10.3_450bps_fast_prom.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r10.3_450bps_fast_prom.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r10.3_450bps_fast_prom.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r10.3_450bps_hac_prom.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r10.3_450bps_hac_prom.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r10.3_450bps_hac_prom.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r10.3_450bps_hac_prom.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_hac_prom.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_hac_prom.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_hac_prom.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_hac_prom.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_fast_prom.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_fast_prom.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_fast_prom.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_fast_prom.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r103_prom_modbases_5mC_v001.cfg/megalodon_mod_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r103_prom_modbases_5mC_v001.cfg/megalodon_mod_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r103_prom_modbases_5mC_v001.cfg/megalodon_mod_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r103_prom_modbases_5mC_v001.cfg/megalodon_mod_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_v001.cfg/megalodon_mod_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_v001.cfg/megalodon_mod_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_v001.cfg/megalodon_mod_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_v001.cfg/megalodon_mod_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_prom_modbases_5mC_v001.cfg/megalodon_mod_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_prom_modbases_5mC_v001.cfg/megalodon_mod_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_prom_modbases_5mC_v001.cfg/megalodon_mod_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_prom_modbases_5mC_v001.cfg/megalodon_mod_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r103_prom_modbases_5mC_v001.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r103_prom_modbases_5mC_v001.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r103_prom_modbases_5mC_v001.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r103_prom_modbases_5mC_v001.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_v001.cfg/megalodon_mod_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_v001.cfg/megalodon_mod_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_v001.cfg/megalodon_mod_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_v001.cfg/megalodon_mod_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_CpG_v001.cfg/megalodon_mod_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_CpG_v001.cfg/megalodon_mod_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_CpG_v001.cfg/megalodon_mod_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_CpG_v001.cfg/megalodon_mod_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_v001.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_v001.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_v001.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_v001.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_prom_modbases_5mC_CpG_v001.cfg/megalodon_mod_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_prom_modbases_5mC_CpG_v001.cfg/megalodon_mod_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_prom_modbases_5mC_CpG_v001.cfg/megalodon_mod_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_prom_modbases_5mC_CpG_v001.cfg/megalodon_mod_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_prom_modbases_5mC_v001.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_prom_modbases_5mC_v001.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_prom_modbases_5mC_v001.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_prom_modbases_5mC_v001.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg/megalodon_mod_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg/megalodon_mod_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg/megalodon_mod_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg/megalodon_mod_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases-all-context_v001.cfg/megalodon_mod_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases-all-context_v001.cfg/megalodon_mod_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases-all-context_v001.cfg/megalodon_mod_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases-all-context_v001.cfg/megalodon_mod_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_CpG_v001.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_CpG_v001.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_CpG_v001.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_CpG_v001.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_CpG_v001.cfg/megalodon_mod_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_CpG_v001.cfg/megalodon_mod_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_CpG_v001.cfg/megalodon_mod_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_CpG_v001.cfg/megalodon_mod_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_prom_modbases_5mC_CpG_v001.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_prom_modbases_5mC_CpG_v001.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_prom_modbases_5mC_CpG_v001.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_prom_modbases_5mC_CpG_v001.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac_prom.cfg/megalodon_mod_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac_prom.cfg/megalodon_mod_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac_prom.cfg/megalodon_mod_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac_prom.cfg/megalodon_mod_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases-all-context_v001.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases-all-context_v001.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases-all-context_v001.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases-all-context_v001.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_CpG_v001.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_CpG_v001.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_CpG_v001.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/res_dna_r941_min_modbases_5mC_5hmC_CpG_v001.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac_prom.cfg/megalodon_variant_calibration.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac_prom.cfg/megalodon_variant_calibration.npz
--------------------------------------------------------------------------------
/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac_prom.cfg/megalodon_variant_calibration.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/megalodon/HEAD/megalodon/model_data/dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac_prom.cfg/megalodon_variant_calibration.pdf
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.black]
2 | line-length = 80
3 | target-version = ['py36']
4 | include = '\.pyi?$'
5 | exclude = '''
6 |
7 | (
8 | /(
9 | \.eggs # exclude a few common directories in the
10 | | \.git # root of the project
11 | | \.hg
12 | | \.mypy_cache
13 | | \.tox
14 | | \.venv
15 | | _build
16 | | buck-out
17 | | build
18 | | dist
19 | | venv
20 | )/
21 | )
22 | '''
23 |
--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
1 | image: python:3.6
2 |
3 | stages:
4 | - format
5 |
6 | before_script:
7 | - python -V # Print out python version for debugging
8 | - pip install virtualenv
9 | - virtualenv venv
10 | - source venv/bin/activate
11 | - pip install --upgrade pip
12 | - pip install .[testing]
13 |
14 | black:
15 | stage: format
16 | script:
17 | - source venv/bin/activate
18 | - black --check .
19 |
20 | docs:
21 | stage: format
22 | script:
23 | - source venv/bin/activate
24 | - sphinx-build -b html docs builddir && tar -zcf docs_build.tgz builddir
25 | artifacts:
26 | paths:
27 | - docs_build.tgz
28 | expire_in: 1 week
29 | only:
30 | - master
31 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from setuptools import setup, Extension
3 |
4 |
5 | if __name__ == "__main__":
6 | # Note that using setup_requires cython allows users to install megalodon
7 | # without first installing cython (as required when using cythonize)
8 | extra_compile_args = ["-std=c99"]
9 | if sys.platform == "darwin":
10 | extra_compile_args.append("-mmacosx-version-min=10.9")
11 | print("Using macOS clang args")
12 | ext_modules = [
13 | Extension(
14 | "megalodon.decode",
15 | sources=["megalodon/decode.pyx"],
16 | extra_compile_args=extra_compile_args,
17 | language="c",
18 | ),
19 | ]
20 | setup(
21 | use_pyscaffold=True,
22 | setup_requires=["setuptools>=38.3", "cython"],
23 | ext_modules=ext_modules,
24 | )
25 |
--------------------------------------------------------------------------------
/megalodon_extras/phase_variants_extract_haplotype_reads.py:
--------------------------------------------------------------------------------
1 | import pysam
2 |
3 | from ._extras_parsers import get_parser_phase_variants_extract_haplotype_reads
4 |
5 |
6 | def _main(args):
7 | out_fps = {}
8 | for rec in pysam.AlignmentFile(args.alignment_filename):
9 | try:
10 | hp = dict(rec.tags)["HP"]
11 | except KeyError:
12 | # skip un-tagged reads
13 | continue
14 | if hp not in out_fps:
15 | out_fps[hp] = open(
16 | "{}.haplotype_{}_read_ids.txt".format(args.out_basename, hp),
17 | "w",
18 | )
19 | out_fps[hp].write(rec.qname + "\n")
20 |
21 | for fp in out_fps.values():
22 | fp.close()
23 |
24 | return
25 |
26 |
27 | if __name__ == "__main__":
28 | _main(get_parser_phase_variants_extract_haplotype_reads().parse_args())
29 |
--------------------------------------------------------------------------------
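The per-haplotype read id lists written by this script feed into downstream filtering. A minimal sketch of one such use, splitting a BAM by haplotype (file names are illustrative; only the "{out_basename}.haplotype_{hp}_read_ids.txt" naming comes from the script above):

    import pysam

    # Load the read ids extracted for haplotype 1.
    with open("phased.haplotype_1_read_ids.txt") as fp:
        hap1_ids = set(line.strip() for line in fp)

    # Write a BAM containing only haplotype-1 reads.
    with pysam.AlignmentFile("mappings.bam") as bam_in, pysam.AlignmentFile(
        "mappings.haplotype_1.bam", "wb", template=bam_in
    ) as bam_out:
        for rec in bam_in:
            if rec.query_name in hap1_ids:
                bam_out.write(rec)

--------------------------------------------------------------------------------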
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "3.6"
4 | dist: trusty
5 | sudo: required
6 |
7 | addons:
8 | apt:
9 | sources:
10 | - ubuntu-toolchain-r-test
11 | packages:
12 | - libhdf5-dev
13 |
14 |
15 | install:
16 | - echo $TRAVIS_PYTHON_VERSION
17 | - pip install --upgrade pip setuptools wheel
18 | - pip install --only-binary=numpy,scipy,cython numpy scipy cython
19 | - pip install .
20 | - pip install sphinx sphinx_rtd_theme sphinx-argparse
21 |
22 |
23 | script:
24 | - echo "No testing implemented"
25 |
26 |
27 | before_deploy:
28 | - cd docs
29 | - sphinx-build -b html -d _build/doctrees . _build/html
30 | - cd ../
31 | - touch docs/_build/html/.nojekyll
32 |
33 | deploy:
34 | provider: pages
35 | skip_cleanup: true
36 | github_token: $GHPAGES_TOKEN
37 | local_dir: docs/_build/html
38 | target_branch: gh-pages
39 | on:
40 | branch: master
41 | python: "3.6"
42 |
--------------------------------------------------------------------------------
/docs/extras_per_read_text.rst:
--------------------------------------------------------------------------------
1 | **********************************
2 | ``megalodon_extras per_read_text``
3 | **********************************
4 |
5 | The ``megalodon_extras per_read_text`` command group contains commands to convert per-read modified base or sequence variant database statistics to text files.
6 | These files will be TSV files with headers describing the fields contained within the file.
7 |
8 | Note that these scripts are single threaded and can be quite slow for reasonably sized runs.
9 |
10 | -------------------------------------------------
11 | ``megalodon_extras per_read_text modified_bases``
12 | -------------------------------------------------
13 |
14 | Extract text format per-read modified base scores from a Megalodon per-read modified base database.
15 |
16 | -------------------------------------------
17 | ``megalodon_extras per_read_text variants``
18 | -------------------------------------------
19 |
20 | Extract text format per-read sequence variant scores from a Megalodon per-read sequence variant database.
21 |
--------------------------------------------------------------------------------
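A minimal sketch of consuming one of these TSV outputs (the file name matches Megalodon's per-read modified base text output; the column names used here are assumptions for illustration, as the real header is written from the database field names):

    import csv

    # Stream a per-read modified base TSV and collect scores per read.
    per_read_scores = {}
    with open("per_read_modified_base_calls.txt") as fp:
        for row in csv.DictReader(fp, delimiter="\t"):
            per_read_scores.setdefault(row["read_id"], []).append(
                (int(row["pos"]), float(row["mod_log_prob"]))
            )

--------------------------------------------------------------------------------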
/megalodon_extras/modified_bases_index_database.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from megalodon import logging, mods, megalodon_helper as mh
3 | from ._extras_parsers import get_parser_modified_bases_index_database
4 |
5 |
6 | LOGGER = logging.get_logger()
7 |
8 |
9 | def _main(args):
10 | logging.init_logger(args.megalodon_directory, out_suffix=args.output_suffix)
11 | LOGGER.debug('Command: """' + " ".join(sys.argv) + '"""')
12 |
13 | mods_db_fn = mh.get_megalodon_fn(args.megalodon_directory, mh.PR_MOD_NAME)
14 | mods_db = mods.ModsDb(mods_db_fn, read_only=False)
15 | try:
16 | mods_db.check_data_covering_index_exists()
17 | LOGGER.info("Modified bases database index already exists")
18 | except mh.MegaError:
19 | LOGGER.info("Creating modified bases database index")
20 | mods_db.create_data_covering_index()
21 | LOGGER.debug("Closing database")
22 | mods_db.close()
23 |
24 |
25 | if __name__ == "__main__":
26 | _main(get_parser_modified_bases_index_database().parse_args())
27 |
--------------------------------------------------------------------------------
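For background, the "data covering index" requested above is a standard SQLite technique: an index that contains every column a query touches, so the query can be answered from the index alone. A generic sketch (the schema here is hypothetical, not Megalodon's):

    import sqlite3

    db = sqlite3.connect(":memory:")
    db.execute("CREATE TABLE data (score_pos INTEGER, score REAL)")
    # The index covers both queried columns, so the scan below never
    # needs to read the table itself.
    db.execute("CREATE INDEX data_cov_idx ON data(score_pos, score)")
    plan = db.execute(
        "EXPLAIN QUERY PLAN SELECT score FROM data ORDER BY score_pos"
    ).fetchall()
    print(plan)  # reports use of COVERING INDEX data_cov_idx

--------------------------------------------------------------------------------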
/megalodon_extras/variants_index_database.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from megalodon import logging, variants, megalodon_helper as mh
3 | from ._extras_parsers import get_parser_variants_index_database
4 |
5 |
6 | LOGGER = logging.get_logger()
7 |
8 |
9 | def _main(args):
10 | raise NotImplementedError(
11 | "Variant index creation not currently implemented."
12 | )
13 |
14 | logging.init_logger(args.megalodon_directory, out_suffix=args.output_suffix)
15 | LOGGER.debug('Command: """' + " ".join(sys.argv) + '"""')
16 |
17 | vars_db_fn = mh.get_megalodon_fn(args.megalodon_directory, mh.PR_VAR_NAME)
18 | vars_db = variants.VarsDb(vars_db_fn, read_only=False)
19 | try:
20 | vars_db.check_data_covering_index_exists()
21 | LOGGER.info("Variants database index already exists")
22 | except mh.MegaError:
23 | LOGGER.info("Creating variants database index")
24 | vars_db.create_data_covering_index()
25 | LOGGER.debug("Closing database")
26 | vars_db.close()
27 |
28 |
29 | if __name__ == "__main__":
30 | _main(get_parser_variants_index_database().parse_args())
31 |
--------------------------------------------------------------------------------
/docs/extras_aggregate.rst:
--------------------------------------------------------------------------------
1 | ******************************
2 | ``megalodon_extras aggregate``
3 | ******************************
4 |
5 | The ``megalodon_extras aggregate`` command group contains a single command, ``run``, to perform aggregation of per-read sequence variant or modified base results.
6 |
7 | ----------------------------------
8 | ``megalodon_extras aggregate run``
9 | ----------------------------------
10 |
11 | Aggregate per-read sequence variant and/or modified base results from the main ``megalodon`` command.
12 |
13 | This command can be useful for running Megalodon pipelines efficiently.
14 | It allows the ``megalodon`` command to be run on one set of computing resources and ``megalodon_extras aggregate run`` to be completed on a separate set of computing resources.
15 | The ``megalodon`` command, running the basecalling backend, generally requires GPU resources, while the aggregation step generally requires fast disk (SSDs) and many CPU cores.
16 | This split lets one perform each step on the appropriate compute resources.
17 |
18 | Additionally, this command allows adjustment of aggregation parameters without the need to repeat the computationally expensive basecalling step.
19 |
--------------------------------------------------------------------------------
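A sketch of that two-stage split, driven from Python for concreteness (the ``--outputs``/``--output-directory`` flags exist on ``megalodon``; the ``aggregate run`` argument names here are assumptions for illustration):

    import subprocess

    # Stage 1, on a GPU host: basecall and write per-read statistics only.
    subprocess.run(
        ["megalodon", "reads/", "--outputs", "per_read_mods",
         "--output-directory", "megalodon_results"],
        check=True,
    )

    # Stage 2, on a host with fast disk and many cores: aggregate the
    # per-read database (argument names assumed).
    subprocess.run(
        ["megalodon_extras", "aggregate", "run",
         "--megalodon-directory", "megalodon_results"],
        check=True,
    )

--------------------------------------------------------------------------------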
/megalodon_extras/modified_bases_describe_alphabet.py:
--------------------------------------------------------------------------------
1 | from megalodon import backends, logging, megalodon_helper as mh
2 | from ._extras_parsers import get_parser_modified_bases_describe_alphabet
3 |
4 |
5 | LOGGER = logging.get_logger()
6 |
7 |
8 | def _main(args):
9 | try:
10 | mh.mkdir(args.guppy_logs_output_directory, False)
11 | except mh.MegaError:
12 | LOGGER.warning(
13 | "Guppy logs output directory exists. Potentially overwriting "
14 | + "guppy logs."
15 | )
16 | logging.init_logger(args.log_directory)
17 | # set args that are not relevant to alphabet
18 | args.devices = None
19 |
20 | # set guppy args
21 | args.guppy_server_port = None
22 | args.guppy_timeout = mh.DEFAULT_GUPPY_TIMEOUT
23 | args.guppy_concurrent_reads = mh.DEFAULT_GUPPY_CONCURRENT_READS
24 | args.output_directory = args.guppy_logs_output_directory
25 |
26 | # set taiyaki args
27 | args.chunk_size = 1000
28 | args.chunk_overlap = 100
29 | args.max_concurrent_chunks = 200
30 | backend_params = backends.parse_backend_params(args)
31 | with backends.ModelInfo(backend_params, 1) as model_info:
32 | LOGGER.info(model_info.get_alphabet_str())
33 |
34 |
35 | if __name__ == "__main__":
36 | _main(get_parser_modified_bases_describe_alphabet().parse_args())
37 |
--------------------------------------------------------------------------------
/megalodon_extras/calibrate_merge_modified_bases_stats.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 | import numpy as np
4 |
5 | from megalodon import logging, mods
6 | from ._extras_parsers import get_parser_calibrate_merge_modified_bases_stats
7 |
8 |
9 | LOGGER = logging.get_logger()
10 |
11 |
12 | def _main(args):
13 | logging.init_logger()
14 |
15 | fn_mod_base_llrs = defaultdict(lambda: ([], []))
16 | for llr_fn in args.modified_base_calibration_stats_files:
17 | llrs_data = np.load(llr_fn)
18 | for mod_base in llrs_data[mods.GT_ALL_MOD_BASE_STR]:
19 | fn_mod_base_llrs[mod_base][0].append(
20 | llrs_data[mods.GT_MOD_LLR_STR.format(mod_base)]
21 | )
22 | fn_mod_base_llrs[mod_base][1].append(
23 | llrs_data[mods.GT_CAN_LLR_STR.format(mod_base)]
24 | )
25 |
26 | mod_base_stats = {mods.GT_ALL_MOD_BASE_STR: list(fn_mod_base_llrs)}
27 | for mod_base, (mod_llrs, can_llrs) in fn_mod_base_llrs.items():
28 | mod_base_stats[mods.GT_MOD_LLR_STR.format(mod_base)] = np.concatenate(
29 | mod_llrs
30 | )
31 | mod_base_stats[mods.GT_CAN_LLR_STR.format(mod_base)] = np.concatenate(
32 | can_llrs
33 | )
34 | np.savez(args.out_filename, **mod_base_stats)
35 |
36 |
37 | if __name__ == "__main__":
38 | _main(get_parser_calibrate_merge_modified_bases_stats().parse_args())
39 |
--------------------------------------------------------------------------------
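The merged output keeps the same npz layout as each input. A sketch of writing one compatible stats file from synthetic LLR arrays (key names come from the ``mods`` constants used above; the sign convention, with modified sites scoring negative, matches the estimate_threshold script in this package):

    import numpy as np
    from megalodon import mods

    # Synthetic per-read log-likelihood ratios for one modified base ("m").
    mod_llrs = np.random.randn(100) - 2  # ground-truth modified sites
    can_llrs = np.random.randn(100) + 2  # ground-truth canonical sites

    np.savez(
        "mod_calibration_stats.npz",
        **{
            mods.GT_ALL_MOD_BASE_STR: ["m"],
            mods.GT_MOD_LLR_STR.format("m"): mod_llrs,
            mods.GT_CAN_LLR_STR.format("m"): can_llrs,
        }
    )

--------------------------------------------------------------------------------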
/docs/extras_merge.rst:
--------------------------------------------------------------------------------
1 | **************************
2 | ``megalodon_extras merge``
3 | **************************
4 |
5 | The ``megalodon_extras merge`` command group contains commands to merge multiple per-read modified base or sequence variant databases.
6 |
7 | These commands can assist in deploying Megalodon on an array of compute resources.
8 |
9 | -----------------------------------------
10 | ``megalodon_extras merge modified_bases``
11 | -----------------------------------------
12 |
13 | Merge multiple per-read modified base databases together.
14 |
15 | This command supports multiprocessing, but may encounter disk I/O bottlenecks.
16 | Note that the full set of modified base positions must be held in memory for this command to perform well.
17 | Thus the number of processes should be set based on the amount of RAM available rather than the number of CPU cores.
18 | It is recommended that the output location be on a fast disk (e.g. a local SSD rather than NFS or other mounted drives).
19 |
20 | ----------------------------------------------------
21 | ``megalodon_extras merge aggregated_modified_bases``
22 | ----------------------------------------------------
23 |
24 | Merge multiple aggregated modified base bedmethyl files together.
25 |
26 | This command can be useful in processing pipelines and when preparing modified base training ground truth files.
27 | The ``--sorted-inputs`` option is provided to allow processing of very large files that cannot be stored in memory.
28 | Note that bedmethyl files output by Megalodon are not sorted by default.
29 | Sorting can be completed by running the unix command ``sort -k1V -k2n megalodon_results/modified_bases.5mC.bed > megalodon_results/modified_bases.5mC.sorted.bed``.
30 |
31 | -----------------------------------
32 | ``megalodon_extras merge variants``
33 | -----------------------------------
34 |
35 | Merge multiple per-read sequence variant databases together.
36 |
--------------------------------------------------------------------------------
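To illustrate what ``--sorted-inputs`` makes possible, here is a sketch of a constant-memory streaming merge of sorted bedmethyl files (not the tool's actual implementation; assumes the standard 11-column bedmethyl layout and inputs sorted consistently with Python's tuple ordering):

    import heapq

    def bed_records(path):
        # Yield ((chrom, start, strand), coverage, pct_modified) records.
        with open(path) as fp:
            for line in fp:
                f = line.rstrip("\n").split("\t")
                yield (f[0], int(f[1]), f[5]), int(f[9]), float(f[10])

    def merged_pct(mod_cov, cov):
        return 100.0 * mod_cov / cov if cov else 0.0

    def merge_sorted_bedmethyl(paths):
        # Stream all inputs in sorted order, summing coverage and
        # recomputing the modified percentage at sites shared across files.
        cur_key, cur_cov, cur_mod = None, 0, 0.0
        for key, cov, pct in heapq.merge(*(bed_records(p) for p in paths)):
            if key != cur_key:
                if cur_key is not None:
                    yield cur_key, cur_cov, merged_pct(cur_mod, cur_cov)
                cur_key, cur_cov, cur_mod = key, 0, 0.0
            cur_cov += cov
            cur_mod += cov * pct / 100.0
        if cur_key is not None:
            yield cur_key, cur_cov, merged_pct(cur_mod, cur_cov)

--------------------------------------------------------------------------------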
/megalodon_extras/modified_bases_estimate_threshold.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from tqdm import tqdm
3 |
4 | from megalodon import logging, mods, megalodon_helper as mh
5 | from ._extras_parsers import get_parser_modified_bases_estimate_threshold
6 |
7 |
8 | LOGGER = logging.get_logger()
9 |
10 |
11 | def _main(args):
12 | logging.init_logger()
13 |
14 | LOGGER.info("Loading database position statistics")
15 | mods_db = mods.ModsDb(
16 | mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME)
17 | )
18 | db_mods = set(mod_base for mod_base, _ in mods_db.get_mod_long_names())
19 | if args.mod_base not in db_mods:
20 | raise mh.MegaError("Target modified base not found in mods database.")
21 |
22 | scores = []
23 | bar = tqdm(total=args.num_statistics, smoothing=0)
24 | for (chrm, strand, pos), mod_llrs in mods_db.iter_pos_scores(
25 | convert_pos=True, compute_llrs=True
26 | ):
27 | for mod_base, reads_llrs in mod_llrs.items():
28 | if mod_base != args.mod_base:
29 | continue
30 | bar.update(len(reads_llrs))
31 | scores.extend(reads_llrs)
32 | if args.num_statistics is not None and bar.n >= args.num_statistics:
33 | break
34 |
35 | LOGGER.info("Esitmating fraction of modified bases")
36 | scores = np.array(scores)
37 | frac_mod = args.fraction_modified
38 | if frac_mod is None:
39 | thresh_vals = np.percentile(
40 | scores, (args.mod_percentile, 100 - args.mod_percentile)
41 | )
42 | thresh_val = np.abs(thresh_vals).min()
43 | n_can = np.greater_equal(scores, thresh_val).sum()
44 | n_mod = np.less_equal(scores, -thresh_val).sum()
45 | frac_mod = n_mod / (n_mod + n_can)
46 | print("Fraction mod: {}".format(frac_mod))
47 | llr_thresh = np.percentile(scores, frac_mod * 100)
48 | print("Threshold: {}".format(llr_thresh))
49 |
50 |
51 | if __name__ == "__main__":
52 | _main(get_parser_modified_bases_estimate_threshold().parse_args())
53 |
--------------------------------------------------------------------------------
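The thresholding arithmetic above is easiest to see on synthetic data (same sign convention as the script: modified reads score negative):

    import numpy as np

    # Synthetic LLRs: 30% modified (negative mode), 70% canonical (positive).
    rng = np.random.default_rng(42)
    scores = np.concatenate([rng.normal(-4, 1, 300), rng.normal(4, 1, 700)])

    # Mirror the script with mod_percentile = 10: take the 10th and 90th
    # percentiles and use the smaller magnitude as a symmetric cutoff.
    thresh_vals = np.percentile(scores, (10, 100 - 10))
    thresh_val = np.abs(thresh_vals).min()
    n_can = np.greater_equal(scores, thresh_val).sum()
    n_mod = np.less_equal(scores, -thresh_val).sum()
    frac_mod = n_mod / (n_mod + n_can)  # ~0.3 for this mixture

    # The reported single threshold is the LLR at that fraction.
    llr_thresh = np.percentile(scores, frac_mod * 100)
    print(frac_mod, llr_thresh)

--------------------------------------------------------------------------------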
/megalodon_extras/aggregate_run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import os
3 | import sys
4 |
5 | from megalodon import aggregate, logging, mods, variants, megalodon_helper as mh
6 | from ._extras_parsers import get_parser_aggregate_run
7 |
8 |
9 | # set blas library environment variables (without these the cblas calls
10 | # can completely halt processing)
11 | os.environ["OMP_NUM_THREADS"] = "1"
12 | os.environ["OPENBLAS_NUM_THREADS"] = "1"
13 |
14 | LOGGER = logging.get_logger()
15 |
16 |
17 | def _main(args):
18 | log_suffix = (
19 | "aggregation"
20 | if args.output_suffix is None
21 | else "aggregation." + args.output_suffix
22 | )
23 | logging.init_logger(args.megalodon_directory, out_suffix=log_suffix)
24 | LOGGER.debug('Command: """' + " ".join(sys.argv) + '"""')
25 |
26 | if args.mod_aggregate_method == mh.MOD_EM_NAME:
27 | mod_agg_info = mods.AGG_INFO(mh.MOD_EM_NAME, None)
28 | elif args.mod_aggregate_method == mh.MOD_BIN_THRESH_NAME:
29 | mod_agg_info = mods.AGG_INFO(
30 | mh.MOD_BIN_THRESH_NAME, args.mod_binary_threshold
31 | )
32 | valid_read_ids = mh.parse_read_ids(args.read_ids_filename)
33 | aggregate.aggregate_stats(
34 | args.outputs,
35 | args.megalodon_directory,
36 | args.processes,
37 | args.write_vcf_log_probs,
38 | args.heterozygous_factors,
39 | variants.HAPLIOD_MODE if args.haploid else variants.DIPLOID_MODE,
40 | mod_agg_info,
41 | args.write_mod_log_probs,
42 | args.mod_output_formats,
43 | args.suppress_progress,
44 | valid_read_ids,
45 | args.output_suffix,
46 | args.aggregate_batch_size,
47 | )
48 |
49 | if mh.VAR_NAME in args.outputs:
50 | LOGGER.info("Sorting output variant file")
51 | variant_fn = mh.add_fn_suffix(
52 | mh.get_megalodon_fn(args.megalodon_directory, mh.VAR_NAME),
53 | args.output_suffix,
54 | )
55 | sort_variant_fn = mh.add_fn_suffix(variant_fn, "sorted")
56 | variants.sort_variants(variant_fn, sort_variant_fn)
57 | LOGGER.info("Indexing output variant file")
58 | variants.index_variants(sort_variant_fn)
59 |
60 |
61 | if __name__ == "__main__":
62 | _main(get_parser_aggregate_run().parse_args())
63 |
--------------------------------------------------------------------------------
/docs/extras_phase_variants.rst:
--------------------------------------------------------------------------------
1 | ***********************************
2 | ``megalodon_extras phase_variants``
3 | ***********************************
4 |
5 | The ``megalodon_extras phase_variants`` command group contains commands that assist the Megalodon variant phasing pipeline in producing the highest quality phased variant calls.
6 |
7 | ---------------------------------------------------
8 | ``megalodon_extras phase_variants whatshap_filter``
9 | ---------------------------------------------------
10 |
11 | `WhatsHap <https://whatshap.readthedocs.io/>`_ (as of version ``0.18``) cannot process some complex variants.
12 | Providing such variants causes WhatsHap to exit with an error.
13 | This command is provided to remove these complex variants and allow processing to proceed without error.
14 | Note that these variants are still considered outside of the WhatsHap phasing step of the Megalodon phasing pipeline.
15 |
16 | -----------------------------------------------------------
17 | ``megalodon_extras phase_variants extract_haplotype_reads``
18 | -----------------------------------------------------------
19 |
20 | From alignment files produced by ``whatshap haplotag``, extract read ids for reads assigned to each haplotype.
21 | One file will be produced for each haplotype value in the alignment file (two for standard diploid processing).
22 |
23 | ----------------------------------------------------------
24 | ``megalodon_extras phase_variants merge_haploid_variants``
25 | ----------------------------------------------------------
26 |
27 | Merge haploid calls from original Megalodon variants and separate haplotype sets of calls.
28 |
29 | This command should only be used as recommended in the Megalodon variant phasing pipeline.
30 | Use of this command outside of this context is not recommended, as it depends upon the preceding variant processing steps of that pipeline.
31 |
32 | This command iterates over the three sets of sorted variants.
33 | If a variant is not found in the haplotype variant files, the call from the original Megalodon run is taken.
34 | If a variant is called as homozygous in the original Megalodon run, a heterozygous call cannot be output.
35 | If a variant was originally called as heterozygous and the variant is called in both haplotypes, then the output call is determined from the two haplotype calls.
36 |
--------------------------------------------------------------------------------
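A compact restatement of those merge rules (illustrative pseudologic only, not the command's implementation; genotypes are simplified to strings):

    def merge_site(orig_gt, hap1_gt=None, hap2_gt=None):
        # Variant absent from the haplotype files: keep the original call.
        if hap1_gt is None or hap2_gt is None:
            return orig_gt
        # Originally homozygous: a heterozygous call cannot be output.
        if orig_gt in ("0/0", "1/1"):
            return orig_gt
        # Originally heterozygous and called in both haplotypes: the output
        # is determined from the two haploid calls.
        return "{}/{}".format(*sorted((hap1_gt, hap2_gt)))

--------------------------------------------------------------------------------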
/megalodon_extras/validate_mod_bases_from_calibration.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from collections import defaultdict
3 |
4 | import numpy as np
5 | import matplotlib
6 |
7 | if True:
8 |     # Agg appears to be the most robust backend when only saving plots. The "if True" block keeps import-sorting tools from hoisting the following imports above this backend selection.
9 | matplotlib.use("Agg")
10 | from matplotlib.backends.backend_pdf import PdfPages
11 |
12 | from megalodon import logging, megalodon_helper as mh, mods, validation
13 | from ._extras_parsers import get_parser_calibrate_modified_bases
14 |
15 |
16 | LOGGER = logging.get_logger()
17 |
18 |
19 | def extract_llrs(llr_fn):
20 | llrs_data = np.load(llr_fn)
21 | mod_bases = llrs_data[mods.GT_ALL_MOD_BASE_STR]
22 | mod_base_llrs = {}
23 | for mod_base in mod_bases:
24 | mod_base_llrs[mod_base] = (
25 | llrs_data[mods.GT_MOD_LLR_STR.format(mod_base)],
26 | llrs_data[mods.GT_CAN_LLR_STR.format(mod_base)],
27 | )
28 |
29 | return mod_base_llrs
30 |
31 |
32 | def _main(args):
33 | logging.init_logger()
34 |
35 | LOGGER.info("Parsing log-likelihood ratios")
36 | mod_base_llrs = extract_llrs(args.ground_truth_llrs)
37 |
38 | out_fp = (
39 | sys.stdout
40 | if args.out_filename is None
41 | else open(args.out_filename, "w")
42 | )
43 | out_fp.write(validation.MOD_VAL_METRICS_HEADER)
44 | pdf_fp = None if args.out_pdf is None else PdfPages(args.out_pdf)
45 | all_pr_data, all_roc_data = defaultdict(list), defaultdict(list)
46 | all_kde_data = []
47 | for mod_base, (mod_llrs, can_llrs) in mod_base_llrs.items():
48 | LOGGER.info(f'Computing "{mod_base}" modified base validation.')
49 | try:
50 | pr_data, roc_data, kde_data = validation.compute_mod_sites_stats(
51 | mod_llrs,
52 | can_llrs,
53 | not args.allow_unbalance_classes,
54 | mod_base,
55 | "Megalodon Calibration Data",
56 | "Sample",
57 | out_fp,
58 | )
59 | all_pr_data[mod_base].append(pr_data)
60 | all_roc_data[mod_base].append(roc_data)
61 | all_kde_data.append(kde_data)
62 | except mh.MegaError as e:
63 | LOGGER.warning(str(e))
64 | validation.plot_pr(pdf_fp, all_pr_data)
65 | validation.plot_roc(pdf_fp, all_roc_data)
66 | validation.plot_kde(pdf_fp, all_kde_data)
67 | if pdf_fp is not None:
68 | pdf_fp.close()
69 |
70 |
71 | if __name__ == "__main__":
72 | _main(get_parser_calibrate_modified_bases().parse_args())
73 |
--------------------------------------------------------------------------------
/megalodon_extras/modified_bases_create_motif_bed.py:
--------------------------------------------------------------------------------
1 | from collections import namedtuple
2 |
3 | import pysam
4 | from tqdm import tqdm
5 |
6 | from megalodon import logging, megalodon_helper as mh
7 | from ._extras_parsers import get_parser_modified_bases_create_motif_bed
8 |
9 |
10 | LOGGER = logging.get_logger()
11 |
12 | MOTIF_INFO = namedtuple(
13 | "MOTIF_INFO",
14 | ("bases_before", "bases_after", "raw_motif", "motif", "rc_motif"),
15 | )
16 | BED_TMPLT = "{chrom}\t{pos}\t{end}\t.\t.\t{strand}\n"
17 |
18 |
19 | def parse_motifs(raw_motifs):
20 | motifs = []
21 | for raw_motif, bases_before in raw_motifs:
22 | bases_before = int(bases_before)
23 | bases_after = len(raw_motif) - bases_before - 1
24 | motif = mh.compile_motif_pat(raw_motif)
25 | rc_motif = mh.compile_rev_comp_motif_pat(raw_motif)
26 | motifs.append(
27 | MOTIF_INFO(
28 | bases_before=bases_before,
29 | bases_after=bases_after,
30 | raw_motif=raw_motif,
31 | motif=motif,
32 | rc_motif=rc_motif,
33 | )
34 | )
35 |
36 | return motifs
37 |
38 |
39 | def _main(args):
40 | logging.init_logger()
41 |
42 | # parse motifs
43 | motifs = parse_motifs(args.motif)
44 | # open indexed FASTA reference
45 | ref = pysam.FastaFile(args.reference)
46 |
47 | with open(args.out_filename, "w") as fp:
48 | # sort using RefName
49 | for chrm in tqdm(
50 | sorted([mh.RefName(chrm) for chrm in ref.references]),
51 | desc="Contigs",
52 | smoothing=0,
53 | dynamic_ncols=True,
54 | ):
55 | chrm_seq = ref.fetch(chrm)
56 | chrm_sites = []
57 | for motif in motifs:
58 | for m in motif.motif.finditer(chrm_seq):
59 | pos = m.start() + motif.bases_before
60 | chrm_sites.append((pos, "+"))
61 | for m in motif.rc_motif.finditer(chrm_seq):
62 | pos = m.start() + motif.bases_after
63 | chrm_sites.append((pos, "-"))
64 | fp.write(
65 | "".join(
66 | BED_TMPLT.format(
67 | chrom=chrm, pos=pos, end=pos + 1, strand=strand
68 | )
69 | for pos, strand in sorted(chrm_sites)
70 | )
71 | )
72 |
73 |
74 | if __name__ == "__main__":
75 | _main(get_parser_modified_bases_create_motif_bed().parse_args())
76 |
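77 | # Coordinate sketch (hypothetical example): for the dcm motif passed as
78 | # ``--motif CCWGG 1`` (modified base at index 1), a forward-strand match
79 | # starting at reference offset s yields a site at s + bases_before = s + 1,
80 | # while a reverse-complement match yields a site at s + bases_after = s + 3,
81 | # i.e. the base pairing with the motif position on the opposite strand.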
--------------------------------------------------------------------------------
/megalodon_extras/per_read_text_variants.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 | import numpy as np
4 | from tqdm import tqdm
5 |
6 | from megalodon import variants, megalodon_helper as mh
7 | from ._extras_parsers import get_parser_per_read_text_variants
8 |
9 |
10 | def _main(args):
11 | vars_db = variants.VarsDb(
12 | mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_VAR_NAME),
13 | uuid_strand_index_in_memory=True,
14 | )
15 | vars_txt_fp = open(
16 | mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_VAR_TXT_NAME)
17 | if args.out_filename is None
18 | else args.out_filename,
19 | "w",
20 | )
21 | vars_txt_fp.write("\t".join(vars_db.text_field_names) + "\n")
22 | for (loc_id, loc_chrm, pos, ref_seq, var_name, test_start) in tqdm(
23 | vars_db.iter_locs(), total=vars_db.get_num_uniq_var_loc(), smoothing=0
24 | ):
25 | pr_var_stats = vars_db.get_loc_stats(
26 | (loc_id, loc_chrm, pos, ref_seq, var_name, test_start)
27 | )
28 | alt_type_stats = defaultdict(dict)
29 | for r_stats in pr_var_stats:
30 | alt_type_stats[r_stats.read_id][r_stats.alt_seq] = (
31 | r_stats.score,
32 | r_stats.chrm,
33 | )
34 |
35 | var_out_text = ""
36 | for read_id, r_var_stats in alt_type_stats.items():
37 | uuid, strand = vars_db.get_uuid_strand(read_id)
38 | alt_lps = np.array(list(zip(*r_var_stats.values()))[0])
39 | with np.errstate(divide="ignore"):
40 | ref_lp = np.log1p(-np.exp(alt_lps).sum())
41 | var_out_text += (
42 | "\n".join(
43 | (
44 | (
45 | "\t".join("{}" for _ in vars_db.text_field_names)
46 | ).format(
47 | uuid,
48 | chrm,
49 | strand,
50 | pos,
51 | ref_lp,
52 | alt_lp,
53 | ref_seq,
54 | alt_seq,
55 | var_name,
56 | )
57 | for alt_seq, (alt_lp, chrm) in r_var_stats.items()
58 | )
59 | )
60 | + "\n"
61 | )
62 | vars_txt_fp.write(var_out_text)
63 |
64 | return
65 |
66 |
67 | if __name__ == "__main__":
68 | _main(get_parser_per_read_text_variants().parse_args())
69 |
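70 | # Note on the reference probability computed above: per-read variant
71 | # probabilities are normalized so that p(ref) = 1 - sum(p(alt)), and
72 | # ref_lp = np.log1p(-np.exp(alt_lps).sum()) evaluates
73 | # log(1 - sum(exp(alt_lps))) with better numerical stability than the
74 | # direct expression when the summed alternative probability is small.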
--------------------------------------------------------------------------------
/megalodon_extras/per_read_text_modified_bases.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from tqdm import tqdm
3 |
4 | from megalodon import mods, megalodon_helper as mh
5 | from ._extras_parsers import get_parser_per_read_text_modified_bases
6 |
7 |
8 | def _main(args):
9 | mods_db = mods.ModsDb(
10 | mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME),
11 | in_mem_dbid_to_uuid=True,
12 | )
13 | mods_txt_fp = open(
14 | mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_TXT_NAME)
15 | if args.out_filename is None
16 | else args.out_filename,
17 | "w",
18 | )
19 | mods_txt_fp.write("\t".join(mods_db.text_field_names) + "\n")
20 | rec_tmplt = "\t".join("{}" for _ in mods_db.text_field_names) + "\n"
21 | bar = tqdm(
22 | desc="Processing Per-read Data",
23 | unit="per-read calls",
24 | total=mods_db.get_num_uniq_stats(),
25 | smoothing=0,
26 | dynamic_ncols=True,
27 | )
28 | for (chrm, strand, pos), pos_lps in mods_db.iter_pos_scores(
29 | convert_pos=True
30 | ):
31 | bar.update(len(pos_lps))
32 | str_strand = mh.int_strand_to_str(strand)
33 | mod_out_text = ""
34 | prev_dbid = None
35 | mod_bs, r_lps = [], []
36 | for read_dbid, mod_dbid, lp in sorted(pos_lps):
37 | if prev_dbid != read_dbid and prev_dbid is not None:
38 | uuid = mods_db.get_uuid(prev_dbid)
39 | # compute and store log likelihood ratios
40 | with np.errstate(divide="ignore"):
41 | can_lp = np.log1p(-np.exp(r_lps).sum())
42 | for mod_b, r_lp in zip(mod_bs, r_lps):
43 | mod_out_text += rec_tmplt.format(
44 | uuid, chrm, str_strand, pos, r_lp, can_lp, mod_b
45 | )
46 | mod_bs, r_lps = [], []
47 | prev_dbid = read_dbid
48 | mod_bs.append(mods_db.get_mod_base(mod_dbid))
49 | r_lps.append(lp)
50 | uuid = mods_db.get_uuid(prev_dbid)
51 | # compute and store log likelihood ratios
52 | with np.errstate(divide="ignore"):
53 | can_lp = np.log1p(-np.exp(r_lps).sum())
54 | for mod_b, r_lp in zip(mod_bs, r_lps):
55 | mod_out_text += rec_tmplt.format(
56 | uuid, chrm, str_strand, pos, r_lp, can_lp, mod_b
57 | )
58 | mods_txt_fp.write(mod_out_text)
59 | mods_txt_fp.close()
60 |
61 |
62 | if __name__ == "__main__":
63 | _main(get_parser_per_read_text_modified_bases().parse_args())
64 |
--------------------------------------------------------------------------------
/megalodon_extras/merge_variants.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 |
3 | from megalodon import logging, megalodon_helper as mh, variants
4 | from ._extras_parsers import get_parser_merge_variants
5 |
6 |
7 | def _main(args):
8 | mh.mkdir(args.output_megalodon_results_dir, args.overwrite)
9 | logging.init_logger(args.output_megalodon_results_dir)
10 | logger = logging.get_logger()
11 |
12 | logger.info("Opening new sequence variant statistics database")
13 | out_vars_db = variants.VarsDb(
14 | mh.get_megalodon_fn(args.output_megalodon_results_dir, mh.PR_VAR_NAME),
15 | read_only=False,
16 | loc_index_in_memory=not args.var_locations_on_disk,
17 | uuid_index_in_memory=True,
18 | )
19 |
20 | for mega_dir in args.megalodon_results_dirs:
21 | logger.info(
22 | "Adding sequence variant statistics from {}".format(mega_dir)
23 | )
24 | # full read only mode with no indices read into memory
25 | vars_db = variants.VarsDb(
26 | mh.get_megalodon_fn(mega_dir, mh.PR_VAR_NAME),
27 | read_only=True,
28 | chrm_index_in_memory=False,
29 | alt_index_in_memory=False,
30 | uuid_index_in_memory=False,
31 | )
32 | bar = tqdm(
33 | desc=mega_dir,
34 | total=vars_db.get_num_uniq_stats(),
35 | smoothing=0,
36 | dynamic_ncols=True,
37 | )
38 | for (
39 | score,
40 | uuid,
41 | strand,
42 | alt_seq,
43 | ref_seq,
44 | pos,
45 | var_name,
46 | test_end,
47 | test_start,
48 | chrm,
49 | chrm_len,
50 | ) in vars_db.iter_data():
51 | chrm_id = out_vars_db.get_chrm_id_or_insert(chrm, chrm_len)
52 | loc_id = out_vars_db.get_loc_id_or_insert(
53 | chrm_id, test_start, test_end, pos, ref_seq, var_name
54 | )
55 | alt_id = out_vars_db.get_alt_id_or_insert(alt_seq)
56 | read_id = out_vars_db.get_read_id_or_insert(uuid)
57 | out_vars_db.insert_data(score, loc_id, alt_id, read_id)
58 | bar.update()
59 | bar.close()
60 |
61 | logger.info("Creating indices and closing database")
62 | if out_vars_db.chrm_idx_in_mem:
63 | out_vars_db.create_chrm_index()
64 | if out_vars_db.loc_idx_in_mem:
65 | out_vars_db.create_loc_index()
66 | if out_vars_db.alt_idx_in_mem:
67 | out_vars_db.create_alt_index()
68 | out_vars_db.create_data_covering_index()
69 | out_vars_db.close()
70 |
71 |
72 | if __name__ == "__main__":
73 | _main(get_parser_merge_variants().parse_args())
74 |
--------------------------------------------------------------------------------
/megalodon_extras/calibrate_merge_modified_bases.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from megalodon import calibration, logging, megalodon_helper as mh
4 | from ._extras_parsers import get_parser_calibrate_merge_modified_bases
5 |
6 |
7 | LOGGER = logging.get_logger()
8 |
9 |
10 | def _main(args):
11 | logging.init_logger()
12 | mh.prep_out_fn(args.out_filename, args.overwrite)
13 |
14 | LOGGER.info(
15 | "Processing {}".format(args.modified_base_calibration_files[-1])
16 | )
17 | calib_data = np.load(args.modified_base_calibration_files[-1])
18 | stratify_type = str(calib_data[calibration.MOD_STRAT_TYPE_TXT])
19 | num_calib_vals = int(calib_data[calibration.SMOOTH_NVALS_TXT])
20 | mod_calibs = {}
21 | for mod_base in calib_data[calibration.MOD_BASES_TXT]:
22 | LOGGER.info("\tUsing {} calibration".format(mod_base))
23 | mod_calibs[mod_base] = (
24 | calib_data[mod_base + calibration.LLR_RANGE_SUFFIX].copy(),
25 | calib_data[mod_base + calibration.CALIB_TABLE_SUFFIX].copy(),
26 | )
27 | for mod_calib_fn in args.modified_base_calibration_files[-2::-1]:
28 | LOGGER.info("Processing {}".format(mod_calib_fn))
29 | calib_data = np.load(mod_calib_fn)
30 | assert stratify_type == str(calib_data[calibration.MOD_STRAT_TYPE_TXT])
31 | assert num_calib_vals == int(
32 | calib_data[calibration.SMOOTH_NVALS_TXT]
33 | )
34 | for mod_base in calib_data[calibration.MOD_BASES_TXT]:
35 | # overwrite calibration data with files passed earlier
36 | if mod_base in mod_calibs:
37 | LOGGER.info("\tOverwriting {} calibration".format(mod_base))
38 | else:
39 | LOGGER.info("\tUsing {} calibration".format(mod_base))
40 | mod_calibs[mod_base] = (
41 | calib_data[mod_base + calibration.LLR_RANGE_SUFFIX].copy(),
42 | calib_data[mod_base + calibration.CALIB_TABLE_SUFFIX].copy(),
43 | )
44 |
45 | save_kwargs = {}
46 | for mod_base, (mod_llr_range, mod_calib) in mod_calibs.items():
47 | save_kwargs[mod_base + calibration.LLR_RANGE_SUFFIX] = mod_llr_range
48 | save_kwargs[mod_base + calibration.CALIB_TABLE_SUFFIX] = mod_calib
49 |
50 | # save calibration table for reading into mod calibration table
51 | LOGGER.info("Saving calibrations to file.")
52 | mod_bases = list(mod_calibs.keys())
53 | np.savez(
54 | args.out_filename,
55 | stratify_type=stratify_type,
56 | smooth_nvals=num_calib_vals,
57 | mod_bases=mod_bases,
58 | **save_kwargs,
59 | )
60 |
61 |
62 | if __name__ == "__main__":
63 | _main(get_parser_calibrate_merge_modified_bases().parse_args())
64 |
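65 | # Example invocation (command form assumed from the megalodon_extras CLI
66 | # layout; file names are illustrative):
67 | #
68 | #     megalodon_extras calibrate merge_modified_bases \
69 | #         merged_mod_calibration.npz mod_calib_1.npz mod_calib_2.npz
70 | #
71 | # When a modified base appears in multiple input files, the calibration
72 | # from the file listed first takes precedence.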
--------------------------------------------------------------------------------
/megalodon/constrained_decoding.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from megalodon import banding, decode, logging, megalodon_helper as mh
4 |
5 | LOGGER = logging.get_logger()
6 |
7 |
8 | def construct_allowed_bases(seq):
9 | """Construct numpy allowed bases array from string sequence containing
10 | ambiguous bases.
11 | """
12 | allowed_bases = np.zeros((len(seq), 4), dtype=np.short)
13 | for seq_pos, base in enumerate(seq):
14 | try:
15 | pos_allowed_bases = mh.SINGLE_LETTER_CODE[base]
16 | except KeyError:
17 | raise mh.MegaError(
18 | f"Invalid IUPAC code ({base}) found wheile performing "
19 | "constrained basecalling."
20 | )
21 | for pos_allowed_base in pos_allowed_bases:
22 | allowed_bases[seq_pos, mh.ALPHABET.find(pos_allowed_base)] = 1
23 | return allowed_bases
24 |
25 |
26 | def constrained_basecall(
27 | reference,
28 | trans_logprobs,
29 | ref_to_block,
30 | half_bandwidth=mh.DEFAULT_CONSTRAINED_HALF_BW,
31 | ):
32 | """Perform constrained basecalling from initial sequence to ambiguous
33 | reference.
34 |
35 | Args:
36 | reference (str): Reference sequence containing ambiguous bases
37 | trans_logprobs (np.array): 2D float array containing flip-flop transition
38 | log probabilities. Shape should be num_blocks by num_transitions.
39 | num_blocks is signal // stride and num_transitions is the number of
40 | flip-flop transitions (40 for 4 canonical bases).
41 | ref_to_block (np.array): Initial path coordinates mapping from
42 | reference bases to block coordinates in trans_logprobs
43 | half_bandwidth (int): Half bandwidth over which to restrict path
44 | between sequence and blocks. Band will be constructed along
45 | block/signal dimension.
46 | """
47 | # if initial mapping starts within trans_logprobs trim and shift mapping
48 | if ref_to_block[0] != 0:
49 | trans_logprobs = trans_logprobs[ref_to_block[0] :]
50 | ref_to_block = ref_to_block - ref_to_block[0]
51 | # if mapping ends before end of trans_logprobs trim
52 | if trans_logprobs.shape[0] > ref_to_block[-1]:
53 | trans_logprobs = trans_logprobs[: ref_to_block[-1]]
54 | allowed_bases = construct_allowed_bases(reference)
55 | sig_band = banding.compute_sig_band(
56 | ref_to_block, np.zeros(len(reference)), half_bandwidth
57 | )
58 | seq_band = banding.convert_to_seq_band(sig_band)
59 | int_seq = decode.flipflop_constrain_decode(
60 | trans_logprobs, allowed_bases, seq_band
61 | )
62 | constrained_seq = "".join(mh.ALPHABET[base % 4] for base in int_seq)
63 | return constrained_seq
64 |
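65 | # Usage sketch (argument values are hypothetical): given flip-flop
66 | # transition log probabilities from the basecalling backend and an
67 | # initial reference-position-to-block mapping,
68 | #
69 | #     seq = constrained_basecall("ACGTNRY", trans_logprobs, ref_to_block)
70 | #
71 | # returns the most likely basecall restricted, at each position, to the
72 | # bases allowed by the IUPAC code within the signal band.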
--------------------------------------------------------------------------------
/megalodon_extras/phase_variants_whatshap_filter.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 |
3 | from megalodon import variants
4 | from ._extras_parsers import get_parser_phase_variants_whatshap_filter
5 |
6 |
7 | def is_complex_variant(ref, alts):
8 | # single base swaps aren't complex
9 | if any(len(allele) > 1 for allele in alts + [ref]):
10 | for alt in alts:
11 | simp_ref, simp_alt, _, _ = variants.simplify_var_seq(ref, alt)
12 | # if an allele simplifies to a SNV continue
13 | if len(simp_ref) == 0 and len(simp_alt) == 0:
14 | continue
15 | # if simplified sequence does not leave either allele empty
16 | # then this is a complex variant which cannot be processed by
17 | # whatshap
18 | if len(simp_ref) > 0 and len(simp_alt) > 0:
19 | return True
20 | return False
21 |
22 |
23 | def get_qual(vcf_line):
24 | qual = vcf_line.split()[5]
25 | try:
26 | qual = int(qual)
27 | except ValueError:
28 | qual = 0
29 | return qual
30 |
31 |
32 | def get_pos_ref_alts(vcf_line):
33 | chrm, pos, _, ref, alts = vcf_line.split()[:5]
34 | return chrm, int(pos), ref, alts.split(",")
35 |
36 |
37 | def _main(args):
38 | out_fp = open(args.out_vcf, "w")
39 | filt_fp = (
40 | None
41 | if args.filtered_records is None
42 | else open(args.filtered_records, "w")
43 | )
44 |
45 | with open(args.in_vcf) as fp:
46 | prev_line = prev_chrm = prev_end = None
47 | for line in tqdm(fp, desc="Filtering VCF", unit=" lines", smoothing=0):
48 | if line.startswith("#"):
49 | out_fp.write(line)
50 | continue
51 | chrm, start, ref, alts = get_pos_ref_alts(line)
52 | # skip complex variants
53 | if is_complex_variant(ref, alts):
54 | if filt_fp is not None:
55 | filt_fp.write("COMLEX_VARIANT: " + line)
56 | continue
57 |
58 | if prev_chrm == chrm and prev_end > start:
59 | if get_qual(line) > get_qual(prev_line):
60 | if filt_fp is not None:
61 | filt_fp.write("OVERLAPPING_VARIANT: " + prev_line)
62 | prev_line = line
63 | else:
64 | if filt_fp is not None:
65 | filt_fp.write("OVERLAPPING_VARIANT: " + line)
66 | continue
67 |
68 | if prev_line is not None:
69 | out_fp.write(prev_line)
70 | prev_line = line
71 | prev_chrm = chrm
72 | prev_end = start + len(ref)
73 |
74 | if prev_line is not None:
75 | out_fp.write(prev_line)
76 | if filt_fp is not None:
77 | filt_fp.close()
78 |
79 | return
80 |
81 |
82 | if __name__ == "__main__":
83 | _main(get_parser_phase_variants_whatshap_filter().parse_args())
84 |
--------------------------------------------------------------------------------
/megalodon/logging.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import logging
4 |
5 |
6 | LOG_FN = "log.txt"
7 |
8 |
9 | class CustomFormatter(logging.Formatter):
10 | err_fmt = "*" * 100 + "\n\tERROR: %(msg)s\n" + "*" * 100
11 | warn_fmt = "*" * 20 + " WARNING: %(msg)s " + "*" * 20
12 | info_fmt = "[%(asctime)s] %(message)s"
13 | dbg_fmt = (
14 | "DBG %(asctime)s : %(msg)s --- %(processName)s-"
15 | + "%(threadName)s %(module)s.py:%(lineno)d"
16 | )
17 |
18 | def __init__(self, fmt="[%(asctime)s] %(levelname)-8s: %(message)s"):
19 | super().__init__(fmt=fmt, datefmt="%H:%M:%S", style="%")
20 |
21 | def format(self, record):
22 | format_orig = self._style._fmt
23 |
24 | # Replace the original format with one customized by logging level
25 | if record.levelno == logging.DEBUG:
26 | self._style._fmt = self.dbg_fmt
27 | elif record.levelno == logging.INFO:
28 | self._style._fmt = self.info_fmt
29 | elif record.levelno == logging.WARNING:
30 | self._style._fmt = self.warn_fmt
31 | elif record.levelno == logging.ERROR:
32 | self._style._fmt = self.err_fmt
33 | result = logging.Formatter.format(self, record)
34 |
35 | self._style._fmt = format_orig
36 |
37 | return result
38 |
39 |
40 | def init_logger(
41 | out_dir=None, out_suffix=None, log_fn=None, quiet=False, silent=False
42 | ):
43 | """Prepare logging output. Output file will be opened if out_dir or log_fn
44 | are specified. out_suffix will be added to the standard log.txt filename in
45 | out_dir (does not apply when log_fn is specified).
46 |
47 | File will include debug and above messages while stderr will include info
48 | and above. If quiet=True, stderr will include warning and above only.
49 | """
50 | log_fp = None
51 | if out_dir is not None:
52 | log_fn = os.path.join(out_dir, LOG_FN)
53 | if out_suffix is not None:
54 | base_fn, fn_ext = os.path.splitext(log_fn)
55 | log_fn = base_fn + "." + out_suffix + fn_ext
56 | if log_fn is not None:
57 | log_fp = logging.FileHandler(log_fn, "w")
58 | log_fp.setLevel(logging.DEBUG)
59 | log_fp.setFormatter(CustomFormatter())
60 |
61 | console = logging.StreamHandler()
62 | if silent:
63 | console.setLevel(logging.CRITICAL)
64 | elif quiet:
65 | console.setLevel(logging.WARNING)
66 | else:
67 | console.setLevel(logging.INFO)
68 | console.setFormatter(CustomFormatter())
69 |
70 | root_logger = logging.getLogger("")
71 | root_logger.setLevel(logging.DEBUG)
72 | if log_fp is not None:
73 | root_logger.addHandler(log_fp)
74 | root_logger.addHandler(console)
75 |
76 |
77 | def get_logger(module_name=""):
78 | return logging.getLogger(module_name)
79 |
80 |
81 | if __name__ == "__main__":
82 | sys.stderr.write("This is a module. See commands with `megalodon -h`")
83 | sys.exit(1)
84 |
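85 | # Typical usage, matching the pattern used across megalodon modules:
86 | #
87 | #     from megalodon import logging
88 | #     logging.init_logger(out_dir)   # once, from the main entry point
89 | #     LOGGER = logging.get_logger()  # module-level logger
90 | #     LOGGER.info("message")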
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = megalodon
3 | version = attr: megalodon.__init__.__version__
4 | description = Nanopore base calling augmentation.
5 | license = ont_public_licence
6 | long-description = file: README.rst
7 | long-description-content-type = text/x-rst; charset=UTF-8; variant=GFM
8 | url = https://github.com/nanoporetech/megalodon
9 | author = Marcus Stoiber
10 | maintainer = Marcus Stoiber
11 | maintainer_email = marcus.stoiber@nanoporetech.com
12 | platforms = any
13 | classifiers =
14 | Development Status :: 4 - Beta
15 | Environment :: Console
16 | Environment :: GPU
17 | Intended Audience :: Developers
18 | Intended Audience :: Science/Research
19 | License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
20 | Natural Language :: English
21 | Operating System :: Unix
22 | Programming Language :: Python :: 3 :: Only
23 | Topic :: Scientific/Engineering :: Artificial Intelligence
24 | Topic :: Scientific/Engineering :: Bio-Informatics
25 |
26 | [options]
27 | zip_safe = False
28 | packages =
29 | megalodon
30 | megalodon_extras
31 | package_dir =
32 | =.
33 | include_package_data = True
34 | python_requires =
35 | >=3.6
36 | # DON'T CHANGE THE FOLLOWING LINE! IT WILL BE UPDATED BY PYSCAFFOLD!
37 | setup_requires = pyscaffold>=3.2a0,<3.3a0
38 |
39 | # project dependencies (testing deps listed in next section)
40 | install_requires =
41 | h5py >= 2.2.1
42 | numpy >= 1.9.0
43 | scipy >= 1.1.0
44 | Cython >= 0.25.2
45 | mappy >= 2.16
46 | pysam >= 0.15
47 | ont_fast5_api >= 3.2
48 | tqdm >= 2.17
49 | ont-pyguppy-client-lib
50 | scikit-learn
51 | seaborn
52 | ont-remora >= 1.0
53 |
54 | [options.package_data]
55 | * = model_data/*/*.npz
56 |
57 | [options.extras_require]
58 | testing =
59 | pytest
60 | black
61 | sphinx
62 | sphinx-rtd-theme
63 | sphinx-argparse
64 |
65 | [options.packages.find]
66 | where =
67 | megalodon/
68 | exclude =
69 | docs
70 | tests
71 | .eggs
72 |
73 | [options.entry_points]
74 | console_scripts =
75 | megalodon = megalodon.__main__:_main
76 | megalodon_extras = megalodon_extras.__main__:_main
77 |
78 | [test]
79 | extras = True
80 |
81 | [tool:pytest]
82 | addopts =
83 | --cov megalodon --verbose --ignore *.egg*
84 | norecursedirs =
85 | dist
86 | build
87 | .tox
88 | .eggs
89 | testpaths = tests
90 |
91 | [aliases]
92 | dists = bdist_wheel
93 |
94 | [bdist_wheel]
95 | # Use this option if your package is pure-python
96 | universal = 1
97 |
98 | [build_sphinx]
99 | source_dir = docs
100 | build_dir = build/sphinx
101 |
102 | [devpi:upload]
103 | # Options for the devpi: PyPI server and packaging tool
104 | # VCS export must be deactivated since we are using setuptools-scm
105 | no-vcs = 1
106 | formats = bdist_wheel
107 |
108 | [pyscaffold]
109 | # PyScaffold's parameters when the project was created.
110 | # This will be used when updating. Do not change!
111 | version = 3.2.3
112 | package = megalodon
113 | extensions =
114 | markdown
115 | gitlab
116 |
117 | [flake8]
118 | # ignore:
119 | # E203 whitespace before ':'
120 | # W503 line break before binary operator
121 | ignore = E203,W503
122 | max-line-length = 80
123 |
--------------------------------------------------------------------------------
/docs/computing_considerations.rst:
--------------------------------------------------------------------------------
1 | ************************
2 | Computing Considerations
3 | ************************
4 |
5 | This page aims to describe the Megalodon processing workflow, highlighting relevant computing considerations.
6 |
7 | ------------------
8 | Raw Signal Loading
9 | ------------------
10 |
11 | Raw signal is loaded from either single- or multi-FAST5 format via the ``ont_fast5_api``.
12 | Raw signal is loaded within a single process and distributed out to the worker processes.
13 | The input queue status bar indicates how many read signals have been loaded and are awaiting processing.
14 | If this status bar is often empty, raw signal extraction from FAST5 files is likely a processing bottleneck.
15 |
16 | The reads queue is filled from a separate worker process.
17 | This process will enumerate FAST5 files and all read_ids stored within these files, but per-read processing will begin as soon as the first read_id/file is found.
18 | Users may notice a period of time where the progress bar does not have a known total number of reads.
19 | Once read enumeration is complete the progress bar will update to include the total number of reads found and ETA for run completion.
20 |
21 | ------------
22 | Base Calling
23 | ------------
24 |
25 | Basecalling is performed by the pyguppy backend.
26 | Basecalling consists of running the neural network and then decoding this output.
27 | See the guppy documentation on the community page (login required) for more details.
28 | Parameters can be passed directly to the Guppy server initialization call via the ``--guppy-params`` argument.
29 |
30 | -----------------
31 | Reference Mapping
32 | -----------------
33 |
34 | Read mapping is completed using the ``minimap2`` python interface (``mappy``).
35 | The reference index is loaded into shared memory.
36 | A separate thread is linked to each per-read processing worker in order to access the shared memory index.
37 | Thus users may notice threads opened for this processing.
38 | These threads will generally consume less compute than the worker processes.
39 |
40 | ---------------------------------
41 | Variant and Modified Base Calling
42 | ---------------------------------
43 |
44 | Sequence variant and modified base calling is computed within the per-read processing workers using CPU resources.
45 | Generally, this portion of processing will consume a minority of the compute resources.
46 | Proposing many variants (e.g. all possible 2+ base indels) or modified bases in all contexts may create a bottleneck at this stage of processing.
47 | Internal testing shows that proposing all possible single-base substitutions adds minimal processing time at this stage.
48 |
49 | ---------------
50 | Writing to Disk
51 | ---------------
52 |
53 | As of version 2.0, the status of output queues is displayed by default.
54 | As of version 2.2, the status of the input signal extraction queue is also displayed.
55 | If any of the output status bars indicate a full queue, Megalodon will stall waiting on that process to write data to disk.
56 | If the input signal extraction queue is often empty, raw signal extraction from FAST5 files is likely a processing bottleneck.
57 | Moving the input data or the ``--output-directory``, respectively, to a location with faster disk I/O should improve performance.
58 |
--------------------------------------------------------------------------------
/megalodon_extras/modified_bases_create_ground_truth.py:
--------------------------------------------------------------------------------
1 | from megalodon import megalodon_helper as mh
2 | from ._extras_parsers import get_parser_modified_bases_create_ground_truth
3 |
4 |
5 | def _main(args):
6 | samp_cov, samp_mod_cov = mh.parse_bed_methyls(
7 | args.bed_methyl_files, strand_offset=args.strand_offset
8 | )
9 | with open(args.out_csv, "w") as gt_fp:
10 | for (chrom, strand), ctg_cov in samp_cov.items():
11 | for pos, cov in ctg_cov.items():
12 | if cov < args.coverage_threshold:
13 | continue
14 | pct_mod = 100 * samp_mod_cov[(chrom, strand)][pos] / cov
15 | if pct_mod <= args.pct_mod_thresholds[0]:
16 | gt_fp.write(
17 | ",".join(
18 | map(
19 | str,
20 | (
21 | chrom,
22 | mh.int_strand_to_str(strand),
23 | pos,
24 | "False",
25 | ),
26 | )
27 | )
28 | + "\n"
29 | )
30 | if args.strand_offset is not None:
31 | gt_fp.write(
32 | ",".join(
33 | map(
34 | str,
35 | (
36 | chrom,
37 | mh.int_strand_to_str(strand),
38 | pos + args.strand_offset,
39 | "False",
40 | ),
41 | )
42 | )
43 | + "\n"
44 | )
45 | elif pct_mod >= args.pct_mod_thresholds[1]:
46 | gt_fp.write(
47 | ",".join(
48 | map(
49 | str,
50 | (
51 | chrom,
52 | mh.int_strand_to_str(strand),
53 | pos,
54 | "True",
55 | ),
56 | )
57 | )
58 | + "\n"
59 | )
60 | if args.strand_offset is not None:
61 | gt_fp.write(
62 | ",".join(
63 | map(
64 | str,
65 | (
66 | chrom,
67 | mh.int_strand_to_str(strand),
68 | pos + args.strand_offset,
69 | "True",
70 | ),
71 | )
72 | )
73 | + "\n"
74 | )
75 |
76 |
77 | if __name__ == "__main__":
78 | _main(get_parser_modified_bases_create_ground_truth().parse_args())
79 |
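80 | # Labeling sketch (threshold values are hypothetical; flag spellings
81 | # assumed from the argument names): with --coverage-threshold 10 and
82 | # --pct-mod-thresholds 20 80, a site covered by at least 10 reads is
83 | # written as canonical ("False") when <= 20% of reads call it modified,
84 | # as modified ("True") when >= 80% do, and is omitted otherwise.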
--------------------------------------------------------------------------------
/megalodon_extras/variants_atomize.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import mappy
4 |
5 | from megalodon import logging, mapping, megalodon_helper as mh, variants
6 | from ._extras_parsers import get_parser_variants_atomize
7 |
8 |
9 | LOGGER = logging.get_logger()
10 |
11 | HEADER = ["##fileformat=VCFv4.1", "##source=megalodon_atomized"]
12 | CONTIG_HEADER_LINE = "##contig=<ID={},length={}>"
13 | COMMAND_HEADER_LINE = '##command="{}"'
14 | FIELDS_LINE = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER" + "\tINFO\tFORMAT\tSAMPLE"
15 | RECORD_LINE = "{chrm}\t{pos}\t{rid}\t{ref}\t{alts}\t.\t.\t{info}\t.\t.\n"
16 |
17 |
18 | def _main(args):
19 | logging.init_logger()
20 | LOGGER.info("Loading reference")
21 | aligner = mappy.Aligner(
22 | str(args.reference), preset=str("map-ont"), best_n=1
23 | )
24 | LOGGER.info("Loading variants")
25 | var_data = variants.VarInfo(
26 | args.in_vcf, aligner, args.max_indel_size, keep_var_fp_open=True
27 | )
28 | contigs = var_data.variants_idx.header.contigs.values()
29 | LOGGER.info("Atomizing variants")
30 | with open(args.out_vcf, "w") as out_vars:
31 | # preprocess contigs to set contig lengths for VCF header
32 | ctg_lens = {}
33 | for ctg in contigs:
34 | chrm_seq = aligner.seq(ctg.name)
35 | if len(chrm_seq) != ctg.length:
36 | LOGGER.warning(
37 | (
38 | "Mismatched contig lengths ({}) between "
39 | + "reference ({}) and input VCF ({}) using length from "
40 | "reference"
41 | ).format(ctg.name, len(chrm_seq), ctg.length)
42 | )
43 | ctg_lens[ctg.name] = len(chrm_seq)
44 |
45 | out_vars.write(
46 | "\n".join(
47 | HEADER
48 | + [
49 | CONTIG_HEADER_LINE.format(ctg, ctg_len)
50 | for ctg, ctg_len in ctg_lens.items()
51 | ]
52 | + [
53 | variants.CONTEXT_BASE_MI_LINE,
54 | COMMAND_HEADER_LINE.format(" ".join(sys.argv)),
55 | FIELDS_LINE,
56 | ]
57 | )
58 | + "\n"
59 | )
60 | for ctg in contigs:
61 | chrm_seq = aligner.seq(ctg.name)
62 | map_pos = mapping.MAP_POS(
63 | chrm=ctg.name,
64 | strand=None,
65 | start=0,
66 | end=len(chrm_seq),
67 | q_trim_start=None,
68 | q_trim_end=None,
69 | )
70 | for var in var_data.fetch_read_variants(
71 | map_pos, mh.seq_to_int(chrm_seq)
72 | ):
73 | out_vars.write(
74 | RECORD_LINE.format(
75 | chrm=ctg.name,
76 | pos=var.ref_start + 1,
77 | rid=var.id,
78 | ref=var.ref,
79 | alts=",".join(var.alts),
80 | info=variants.HAS_CONTEXT_BASE_TAG
81 | if var.has_context_base
82 | else ".",
83 | )
84 | )
85 |
86 | LOGGER.info("Indexing output variant file")
87 | variants.index_variants(args.out_vcf)
88 |
89 |
90 | if __name__ == "__main__":
91 | _main(get_parser_variants_atomize().parse_args())
92 |
--------------------------------------------------------------------------------
/megalodon_extras/modified_bases_update_database.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | from time import time
3 |
4 | from tqdm import tqdm
5 |
6 | from megalodon import logging, mods
7 | from ._extras_parsers import get_parser_modified_bases_update_database
8 |
9 |
10 | DEBUG = False
11 | N_DEBUG = 50000000
12 |
13 | INSERT_BATCH_SIZE = 10000
14 |
15 | LOGGER = logging.get_logger()
16 |
17 |
18 | def get_read_id(uuid, read_ids, new_db):
19 | try:
20 | read_id = read_ids[uuid]
21 | except KeyError:
22 | new_db.cur.execute("INSERT INTO read (uuid) VALUES (?)", (uuid,))
23 | read_id = new_db.cur.lastrowid
24 | read_ids[uuid] = read_id
25 | return read_id, read_ids
26 |
27 |
28 | def insert_data(new_db, insert_batch):
29 | new_db.cur.executemany("INSERT INTO data VALUES (?,?,?,?)", insert_batch)
30 |
31 |
32 | def fill_mods(old_cur, new_db):
33 | read_ids = {}
34 | n_recs = old_cur.execute("SELECT MAX(rowid) FROM mods").fetchone()[0]
35 | old_cur.execute("SELECT * FROM mods")
36 | insert_batch = []
37 | for i, (
38 | uuid,
39 | chrm,
40 | strand,
41 | pos,
42 | score,
43 | mod_base,
44 | motif,
45 | motif_pos,
46 | raw_motif,
47 | ) in tqdm(
48 | enumerate(old_cur), total=n_recs, smoothing=0, dynamic_ncols=True
49 | ):
50 | if DEBUG and i > N_DEBUG:
51 | break
52 | read_id, read_ids = get_read_id(uuid, read_ids, new_db)
53 | pos_id = new_db.get_pos_id_or_insert(chrm, strand, pos)
54 | mod_base_id = new_db.get_mod_base_id_or_insert(
55 | mod_base, motif, motif_pos, raw_motif
56 | )
57 | insert_batch.append((score, pos_id, mod_base_id, read_id))
58 | if len(insert_batch) >= INSERT_BATCH_SIZE:
59 | insert_data(new_db, insert_batch)
60 | insert_batch = []
61 |
62 | if len(insert_batch) > 0:
63 | insert_data(new_db, insert_batch)
64 |
65 |
66 | def fill_refs(old_cur, new_db):
67 | old_cur.execute("SELECT DISTINCT chrm FROM mods")
68 | for (ref_name,) in old_cur:
69 | new_db.insert_chrm(ref_name)
70 | new_db.create_chrm_index()
71 |
72 |
73 | def _main(args):
74 | raise NotImplementedError(
75 | "The previous version of this script updated version 0 to "
76 | + "version 1. Updgreade to version 2 not yet implemented."
77 | )
78 | logging.init_logger()
79 | old_db = sqlite3.connect(args.old_db)
80 | old_cur = old_db.cursor()
81 | new_db = mods.ModsDb(args.new_db, read_only=False)
82 |
83 | LOGGER.info("Reading/loading reference record names.")
84 | fill_refs(old_cur, new_db)
85 |
86 | LOGGER.info("Reading/loading modified base scores.")
87 | fill_mods(old_cur, new_db)
88 |
89 | if not DEBUG:
90 | new_db.create_mod_index()
91 | t0 = time()
92 | LOGGER.info("Creating positions index.")
93 | new_db.create_pos_index()
94 | t1 = time()
95 | LOGGER.info("Took {} seconds.".format(t1 - t0))
96 | LOGGER.info("Creating scores position index.")
97 | new_db.create_data_covering_index()
98 | LOGGER.info("Took {} seconds.".format(time() - t1))
99 | new_db.close()
100 |
101 |
102 | if __name__ == "__main__":
103 | _main(get_parser_modified_bases_update_database().parse_args())
104 |
--------------------------------------------------------------------------------
/megalodon_extras/merge_aggregated_modified_bases.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from tqdm import tqdm
3 |
4 | from megalodon import logging, megalodon_helper as mh, mods
5 | from ._extras_parsers import get_parser_merge_aggregated_modified_bases
6 |
7 |
8 | def write_unsorted_merge(in_fns, out_fp, bar):
9 | cov, mod_cov = mh.parse_bed_methyls(in_fns)
10 | for chrm in sorted(
11 | mh.RefName(chrm) for chrm in set(chrm for chrm, _ in cov)
12 | ):
13 | # convert back to string after sorting
14 | chrm = str(chrm)
15 | s_poss = []
16 | if (chrm, 1) in cov:
17 | s_poss.extend([(pos, 1) for pos in cov[(chrm, 1)]])
18 | if (chrm, -1) in cov:
19 | s_poss.extend([(pos, -1) for pos in cov[(chrm, -1)]])
20 | for pos, strand in sorted(s_poss):
21 | pcov = cov[(chrm, strand)][pos]
22 | out_fp.write(
23 | mods.BEDMETHYL_TMPLT.format(
24 | chrom=chrm,
25 | pos=pos,
26 | end=pos + 1,
27 | strand=mh.int_strand_to_str(strand),
28 | cov=pcov,
29 | score=min(int(pcov), 1000),
30 | perc=np.around(
31 | mod_cov[(chrm, strand)][pos] / pcov * 100, 1
32 | ),
33 | )
34 | + "\n"
35 | )
36 | bar.update()
37 |
38 |
39 | def write_batch(out_fp, chrms, poss, strands, mod_covs, covs):
40 | covs = np.array(covs, dtype=int)
41 | mod_covs = np.array(mod_covs, dtype=int)
42 | pct_mods = np.zeros(covs.shape)
43 | valid_covs = covs > 0
44 | pct_mods[valid_covs] = np.around(
45 | np.array(mod_covs[valid_covs], dtype=int) * 100 / covs[valid_covs], 1
46 | )
47 | out_fp.write(
48 | "\n".join(
49 | mods.BEDMETHYL_TMPLT.format(
50 | chrom=chrm,
51 | pos=pos,
52 | end=pos + 1,
53 | strand=strand,
54 | cov=cov,
55 | score=score,
56 | perc=pct_mod,
57 | )
58 | for chrm, pos, strand, cov, score, pct_mod in zip(
59 | chrms,
60 | poss,
61 | strands,
62 | covs,
63 | np.minimum(covs, 1000),
64 | pct_mods,
65 | )
66 | )
67 | + "\n"
68 | )
69 |
70 |
71 | def write_sorted_merge(in_fns, out_fp, bar, batch_size=50000):
72 | chrms, poss, strands, mod_covs, covs = [], [], [], [], []
73 | for chrm, pos, strand, mod_cov, cov in mh.iter_merged_bedmethyl(
74 | [mh.iter_bed_methyl_recs(in_fn) for in_fn in in_fns]
75 | ):
76 | chrms.append(chrm)
77 | poss.append(pos)
78 | strands.append(strand)
79 | mod_covs.append(mod_cov)
80 | covs.append(cov)
81 | bar.update()
82 | if len(chrms) >= batch_size:
83 | write_batch(out_fp, chrms, poss, strands, mod_covs, covs)
84 | chrms, poss, strands, mod_covs, covs = [], [], [], [], []
85 | if len(chrms) > 0:
86 | write_batch(out_fp, chrms, poss, strands, mod_covs, covs)
87 |
88 |
89 | def _main(args):
90 | logging.init_logger()
91 | with open(args.output_bed_methyl_file, "w") as out_fp, tqdm(
92 | desc="Records Written", smoothing=0
93 | ) as bar:
94 | if args.sorted_inputs:
95 | write_sorted_merge(args.bed_methyl_files, out_fp, bar)
96 | else:
97 | write_unsorted_merge(args.bed_methyl_files, out_fp, bar)
98 |
99 |
100 | if __name__ == "__main__":
101 | _main(get_parser_merge_aggregated_modified_bases().parse_args())
102 |
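103 | # Note: in both merge paths above, the bedMethyl score column is the
104 | # site coverage capped at 1000 and the percent modified is rounded to
105 | # one decimal place.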
--------------------------------------------------------------------------------
/docs/variant_phasing.rst:
--------------------------------------------------------------------------------
1 | ***************
2 | Variant Phasing
3 | ***************
4 |
5 | This page walks through the steps to use Megalodon in conjunction with `whatshap <https://whatshap.readthedocs.io>`_ to produce the highest quality phased variant calls.
6 |
7 | This pipeline produces the ``variants.haploid_merged.vcf`` file containing high quality phased variant calls.
8 | The intermediate ``variant_mappings.haplotagged.bam`` file can be of particular interest for investigating variant calls at the per-read level.
9 | This file contains the reference sequence for each read annotated only with the proposed variant calls, including quality scores for SNVs.
10 | Thus random read errors are masked, allowing more accurate analysis of proposed variants.
11 | See an example of this per-read variant genome browser visualization below.
12 |
13 | ----
14 |
15 | .. figure:: _images/whatshap_haplotagged_variant_viz.png
16 | :align: center
17 | :width: 600
18 |
19 | Genome browser visualization. Megalodon variant_mappings haplotagged with whatshap (upper panel) and raw read mappings (lower panel).
20 |
21 | ----
22 |
23 | --------
24 | Workflow
25 | --------
26 |
27 | ::
28 |
29 | reads_dir="fast5s"
30 | ref="reference.fasta"
31 | variants_vcf="variants.vcf.gz"
32 | out_dir="megalodon_results"
33 | nproc=16
34 | gpu_devices="0 1"
35 |
36 | # run megalodon to produce variant_mappings
37 | megalodon \
38 | $reads_dir --outputs mappings variants variant_mappings \
39 | --reference $ref --variant-filename $variants_vcf \
40 | --output-directory $out_dir \
41 | --processes $nproc --devices $gpu_devices \
42 | --verbose-read-progress 3
43 |
44 | # filter whatshap incompatible variants and create indices
45 | megalodon_extras \
46 | phase_variants whatshap_filter \
47 | $out_dir/variants.sorted.vcf \
48 | $out_dir/variants.sorted.whatshap_filt.vcf \
49 | --filtered-records $out_dir/whatshap_filt.txt
50 | bgzip $out_dir/variants.sorted.whatshap_filt.vcf
51 | tabix $out_dir/variants.sorted.whatshap_filt.vcf.gz
52 | samtools index $out_dir/variant_mappings.sorted.bam
53 |
54 | # run whatshap with produced mappings and variants
55 | whatshap \
56 | phase --distrust-genotypes \
57 | -o $out_dir/variants.phased.vcf \
58 | $out_dir/variants.sorted.whatshap_filt.vcf.gz \
59 | $out_dir/variant_mappings.sorted.bam
60 |
61 | # assign haplotypes to reads
62 | bgzip $out_dir/variants.phased.vcf
63 | tabix $out_dir/variants.phased.vcf.gz
64 | whatshap \
65 | haplotag $out_dir/variants.phased.vcf.gz \
66 | $out_dir/variant_mappings.sorted.bam \
67 | -o $out_dir/variant_mappings.haplotagged.bam
68 |
69 | # extract haplotype reads and call haploid variants
70 | megalodon_extras \
71 | phase_variants extract_haplotype_reads \
72 | $out_dir/variant_mappings.haplotagged.bam \
73 | $out_dir/variant_mappings
74 | megalodon_extras \
75 | aggregate run \
76 | --megalodon-directory $out_dir --output-suffix haplotype_1 \
77 | --read-ids-filename $out_dir/variant_mappings.haplotype_1_read_ids.txt \
78 | --outputs variants --haploid --processes $nproc
79 | megalodon_extras \
80 | aggregate run \
81 | --megalodon-directory $out_dir --output-suffix haplotype_2 \
82 | --read-ids-filename $out_dir/variant_mappings.haplotype_2_read_ids.txt \
83 | --outputs variants --haploid --processes $nproc
84 |
85 | # merge haploid variants to produce diploid variants
86 | megalodon_extras \
87 | phase_variants merge_haploid_variants \
88 | $out_dir/variants.sorted.vcf.gz \
89 | $out_dir/variants.haplotype_1.sorted.vcf.gz \
90 | $out_dir/variants.haplotype_2.sorted.vcf.gz \
91 | --out-vcf $out_dir/variants.haploid_merged.vcf
92 |
--------------------------------------------------------------------------------
/docs/model_training.rst:
--------------------------------------------------------------------------------
1 | ************************
2 | Megalodon Model Training
3 | ************************
4 |
5 | This page describes how to use Megalodon to prepare training data and train a new basecalling model using Taiyaki.
6 | For modified base data preparation and model training, see the modified base training documentation page.
7 |
8 | .. note::
9 |
10 | Preparation of training data via Megalodon requires a basecalling model that can produce valid reference mappings.
11 | If valid reference mappings using ``minimap2`` cannot be produced for a set of reads, model training will not proceed successfully.
12 |
13 | ----------------
14 | Data Preparation
15 | ----------------
16 |
17 | To produce a training data ("mapped signal") file the ``--outputs signal_mappings`` argument should be added to a Megalodon call.
18 | This will produce a ``signal_mappings.hdf5`` file in the specified Megalodon output directory.
19 | For each read producing a valid reference mapping, this file contains a mapping between the raw signal and the mapped reference bases.
20 | This file can then be directly passed to the Taiyaki ``train_flipflop.py`` command for model training.
21 |
22 | ::
23 |
24 | # run megalodon; output signal mappings
25 | megalodon raw_fast5s/ \
26 | --outputs signal_mappings \
27 | --reference reference.fa \
28 | --devices 0 --processes 40
29 |
30 | # run taiyaki training
31 | train_flipflop.py ./taiyaki/models/mLstm_flipflop.py \
32 | megalodon_results/signal_mappings.hdf5 --device 0
33 |
34 | Once training completes, the ``training/model_final.checkpoint`` file contains the trained model.
35 | This can be converted to a guppy-compatible model with the ``taiyaki/bin/dump_json.py`` script.
36 | A guppy config with appropriate settings should also be produced for new models.
37 |
38 | .. note::
39 |
40 | For optimal performance, it is recommended that the ``OMP_NUM_THREADS`` unix environment variable be set to ``1`` for the above Megalodon command and a larger value for the Taiyaki training command.
41 |
42 |
43 | ----------------------
44 | Signal Mapping Options
45 | ----------------------
46 |
47 | Several options are available to control the behavior of the ``signal_mappings`` output.
48 |
49 | - ``--ref-length-range``
50 |
51 | - Only allow reads with a reference mapping length within this range into the output.
52 | - ``--ref-percent-identity-threshold``
53 |
54 | - Only include reads with higher mapping percent identity in signal_mappings output.
55 | - ``--ref-percent-coverage-threshold``
56 |
57 | - Only include reads with higher read alignment coverage in signal_mappings output.
58 | - ``--ref-include-variants``
59 |
60 | - This option replaces the reference sequence with more likely proposed alternative sequences as called in the ``per_read_variants`` output.
61 | - Cannot specify both this option and ``--ref-include-mods``.
62 |
63 |
64 | ---------------------
65 | Megalodon Calibration
66 | ---------------------
67 |
68 | When a new model is trained, the produced scores must be calibrated to achieve optimal aggregated results (over reads).
69 | Once produced, calibration files can be passed to Megalodon via the ``--variant-calibration-filename`` and ``--mod-calibration-filename`` arguments.
70 |
71 | Sequence variant calibration requires a ground truth against which to compute scores.
72 | For sequence variants, a high quality reference for a set of reads will suffice for this requirement.
73 | Random sequence variants are proposed and scored in order to create distributions over which to calibrate the produced scores.
74 | In order to create a sequence variant calibration file, run ``megalodon/scripts/generate_ground_truth_variant_llr_scores.py`` followed by ``megalodon/scripts/calibrate_variant_llr_scores.py``.
75 | The optional ``--out-pdf`` argument provides a visualization of the likelihood ratio score correction.
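76 |
77 | For example, newly produced calibration files can be passed to a Megalodon run as follows (file names are illustrative)::
78 |
79 |     megalodon raw_fast5s/ \
80 |         --outputs variants mods \
81 |         --reference reference.fa \
82 |         --variant-calibration-filename new_variant_calibration.npz \
83 |         --mod-calibration-filename new_mod_calibration.npz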
76 |
--------------------------------------------------------------------------------
/test/test_api.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from megalodon import backends, logging, megalodon_helper as mh
4 |
5 |
6 | LOGGER = logging.get_logger()
7 |
8 |
9 | def open_pyguppy_backend(args):
10 | args.do_not_use_guppy_server = False
11 | try:
12 | mh.mkdir(args.output_directory, False)
13 | except mh.MegaError:
14 | LOGGER.warning(
15 | "Guppy logs output directory exists. Potentially overwriting guppy "
16 | "logs."
17 | )
18 | backend_params = backends.parse_backend_params(args)
19 | model_info = None
20 | try:
21 | model_info = backends.ModelInfo(backend_params, args.processes)
22 | # if spawning multiple workers run this inside newly spawned processes
23 | model_info.prep_model_worker()
24 | LOGGER.info(model_info.get_alphabet_str())
25 | LOGGER.info(
26 | "Model structure:\n\tStride: {}\n\tState size: {}".format(
27 | model_info.stride, model_info.output_size
28 | )
29 | )
30 | # use model_info.iter_basecalled_reads to basecall reads and return
31 | # relevant signal anchored information.
32 | model_info.client.disconnect()
33 | finally:
34 | # ensure guppy server is closed in finally block
35 | if model_info is not None:
36 | model_info.close()
37 |
38 |
39 | def get_parser():
40 | parser = argparse.ArgumentParser()
41 |
42 | parser.add_argument(
43 | "--log-directory",
44 | default=".",
45 | help="Directory to output megalodon log. Default: current "
46 | + "working directory",
47 | )
48 |
49 | pyg_grp = parser.add_argument_group("Guppy Backend Arguments")
50 | pyg_grp.add_argument(
51 | "--guppy-config",
52 | default=mh.DEFAULT_GUPPY_CFG,
53 | help="Guppy config. Default: %(default)s",
54 | )
55 | pyg_grp.add_argument(
56 | "--guppy-server-path",
57 | default=mh.DEFAULT_GUPPY_SERVER_PATH,
58 | help="Path to guppy server executable. Default: %(default)s",
59 | )
60 | pyg_grp.add_argument(
61 | "--guppy-server-port",
62 | type=int,
63 | help="Guppy server port. Default: Guppy auto",
64 | )
65 | pyg_grp.add_argument(
66 | "--guppy-params",
67 | help="Extra guppy server parameters. Main purpose for optimal "
68 | + "performance based on compute environment. Quote parameters to "
69 | + "avoid them being parsed by megalodon.",
70 | )
71 | pyg_grp.add_argument(
72 | "--guppy-concurrent-reads",
73 | type=int,
74 | default=mh.DEFAULT_GUPPY_CONCURRENT_READS,
75 | help="Number of reads to process concurrently within each worker "
76 | "processes. Default: %(default)d",
77 | )
78 | pyg_grp.add_argument(
79 | "--guppy-timeout",
80 | type=float,
81 | default=mh.DEFAULT_GUPPY_TIMEOUT,
82 | help="Timeout to wait for guppy server to call a single read in "
83 | + "seconds. Default: %(default)f",
84 | )
85 | pyg_grp.add_argument(
86 | "--output-directory",
87 | default="guppy_logs",
88 | help="Directory to output guppy logs. Default: %(default)s",
89 | )
90 | pyg_grp.add_argument(
91 | "--devices",
92 | nargs="+",
93 | help="GPU devices for guppy basecalling backend.",
94 | )
95 | pyg_grp.add_argument(
96 | "--processes",
97 | type=int,
98 | default=1,
99 | help="Number of parallel processes. Default: %(default)d",
100 | )
101 |
102 | return parser
103 |
104 |
105 | def main(args):
106 | logging.init_logger(args.log_directory)
107 | open_pyguppy_backend(args)
108 |
109 |
110 | if __name__ == "__main__":
111 | main(get_parser().parse_args())
112 |
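113 | # Example invocation (assumes a local guppy installation; adjust paths
114 | # via --guppy-server-path and --guppy-config as needed):
115 | #
116 | #     python test/test_api.py --devices 0 --processes 4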
--------------------------------------------------------------------------------
/megalodon_extras/calibrate_generate_modified_base_stats.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import numpy as np
3 |
4 | from megalodon import logging, megalodon_helper as mh, mods
5 | from ._extras_parsers import get_parser_calibrate_generate_modified_base_stats
6 |
7 |
8 | LOGGER = logging.get_logger()
9 |
10 |
11 | def output_mods_data(
12 | all_mod_llrs, all_can_llrs, mod_base_set, exclude_mod_bases, out_fn
13 | ):
14 | LOGGER.info("Merging modified base data")
15 | all_mod_bases = list(
16 | set(all_mod_llrs.keys()).intersection(all_can_llrs.keys())
17 | )
18 | if len(set(all_mod_llrs.keys()).difference(all_mod_bases)) > 0:
19 | LOGGER.warning(
20 | "Modified base(s) found in modified dataset which were not "
21 | + "found in canonical dataset: {}".format(
22 | ",".join(set(all_mod_llrs.keys()).difference(all_mod_bases))
23 | )
24 | )
25 | if len(set(all_can_llrs.keys()).difference(all_mod_bases)) > 0:
26 | LOGGER.warning(
27 | "Modified base(s) found in modified dataset which were "
28 | + "not found in canonical dataset: {}".format(
29 | ",".join(set(all_mod_llrs.keys()).difference(all_mod_bases))
30 | )
31 | )
32 | if mod_base_set is not None:
33 | all_mod_bases = list(set(all_mod_bases).intersection(mod_base_set))
34 | if len(all_mod_bases) == 0:
35 | LOGGER.error(
36 | (
37 | "No modified bases to process.\n\tModified bases from "
38 | + "results: {}\n\tModified base set: {}"
39 | ).format(",".join(all_mod_bases), ",".join(mod_base_set))
40 | )
41 | if exclude_mod_bases is not None:
42 | all_mod_bases = list(set(all_mod_bases).difference(exclude_mod_bases))
43 | if len(all_mod_bases) == 0:
44 | LOGGER.error(
45 | (
46 | "No modified bases to process.\n\tModified bases from "
47 | + "results: {}\n\tExcluded modified bases: {}"
48 | ).format(",".join(all_mod_bases), ",".join(exclude_mod_bases))
49 | )
50 | mod_base_stats = {mods.GT_ALL_MOD_BASE_STR: all_mod_bases}
51 | for mod_base in all_mod_bases:
52 | mod_base_stats[mods.GT_MOD_LLR_STR.format(mod_base)] = all_mod_llrs[
53 | mod_base
54 | ]
55 | mod_base_stats[mods.GT_CAN_LLR_STR.format(mod_base)] = all_can_llrs[
56 | mod_base
57 | ]
58 | np.savez(out_fn, **mod_base_stats)
59 |
60 |
61 | def _main(args):
62 | logging.init_logger(quiet=args.quiet)
63 |
64 | if (
65 | args.ground_truth_data is None
66 | and args.control_megalodon_results_dir is None
67 | ):
68 | LOGGER.error(
69 | "Must provide either --control-megalodon-results-dir or "
70 | + "--ground-truth-data"
71 | )
72 | sys.exit()
73 |
74 | db_fn = mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME)
75 | if args.ground_truth_data is not None:
76 | LOGGER.info("Parsing ground truth data")
77 | gt_mod_pos, gt_can_pos = mh.parse_ground_truth_file(
78 | args.ground_truth_data, include_strand=args.strand_specific_sites
79 | )
80 | LOGGER.info(
81 | (
82 | "Loaded ground truth data with {} modified sites and {} "
83 | + "canonical sites."
84 | ).format(len(gt_mod_pos), len(gt_can_pos))
85 | )
86 | LOGGER.info(
87 | "Reading ground truth modified base statistics from " + "database."
88 | )
89 | all_mod_llrs, all_can_llrs = mods.extract_stats_at_valid_sites(
90 | db_fn,
91 | [gt_mod_pos, gt_can_pos],
92 | include_strand=args.strand_specific_sites,
93 | )
94 | else:
95 | LOGGER.info(
96 | "Reading ground truth modified base statistics from " + "database"
97 | )
98 | all_mod_llrs = mods.extract_all_stats(db_fn)
99 | LOGGER.info(
100 | "Reading ground truth modified base statistics from "
101 | + "canonical sample database"
102 | )
103 | all_can_llrs = mods.extract_all_stats(
104 | mh.get_megalodon_fn(
105 | args.control_megalodon_results_dir, mh.PR_MOD_NAME
106 | )
107 | )
108 |
109 | mod_summary = [
110 | (
111 | mod,
112 | len(all_mod_llrs[mod]) if mod in all_mod_llrs else 0,
113 | len(all_can_llrs[mod]) if mod in all_can_llrs else 0,
114 | )
115 | for mod in set(all_mod_llrs).union(all_can_llrs)
116 | ]
117 | LOGGER.info(
118 | "Data summary:\n\tmod\tmod_N\tcan_N\n"
119 | + "\n".join("\t" + "\t".join(map(str, x)) for x in mod_summary)
120 | )
121 | output_mods_data(
122 | all_mod_llrs,
123 | all_can_llrs,
124 | args.modified_bases_set,
125 | args.exclude_modified_bases,
126 | args.out_filename,
127 | )
128 |
129 |
130 | if __name__ == "__main__":
131 | _main(get_parser_calibrate_generate_modified_base_stats().parse_args())
132 |
--------------------------------------------------------------------------------
/megalodon_extras/calibrate_modified_bases.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib
3 |
4 | if True:
5 | # Agg appears to be the most robust backend when only saving plots.
6 | matplotlib.use("Agg")
7 | import matplotlib.pyplot as plt
8 | from matplotlib.backends.backend_pdf import PdfPages
9 |
10 | from megalodon import calibration, logging, megalodon_helper as mh, mods
11 | from ._extras_parsers import get_parser_calibrate_modified_bases
12 |
13 |
14 | LOGGER = logging.get_logger()
15 | PROB_COLORS = ("#bcbddc", "#807dba", "#6a51a3")
16 |
17 |
18 | def plot_calib(
19 | pdf_fp,
20 | mod_base,
21 | smooth_ls,
22 | s_ref,
23 | sm_ref,
24 | s_alt,
25 | sm_alt,
26 | mono_prob,
27 | prob_alt,
28 | prob_threshs,
29 | add_prob_thresh,
30 | ):
31 | f, axarr = plt.subplots(3, sharex=True, figsize=(11, 7))
32 | axarr[0].plot(smooth_ls, s_ref, color="orange")
33 | axarr[0].plot(smooth_ls, sm_ref, color="red")
34 | axarr[0].plot(smooth_ls, s_alt, color="grey")
35 | axarr[0].plot(smooth_ls, sm_alt, color="blue")
36 | axarr[0].set_ylabel(
37 | "Probability Density\nred/orange=canonical\nblue/grey=modified"
38 | )
39 | axarr[0].set_title(mod_base + " Calibration")
40 | axarr[1].plot(smooth_ls, mono_prob, color="orange")
41 | axarr[1].plot(smooth_ls, 1 / (np.exp(smooth_ls) + 1), color="purple")
42 | axarr[1].set_ylabel(
43 | "Emperical Modified\nProbability\norange=calibrated\npurple=raw"
44 | )
45 | axarr[2].plot(smooth_ls, np.log((1 - prob_alt) / prob_alt), color="red")
46 | axarr[2].plot(
47 | smooth_ls, np.log((1 - mono_prob) / mono_prob), color="orange"
48 | )
49 | axarr[2].set_ylabel("Calibrated LLR\norage=monotonic")
50 | axarr[2].set_xlabel("Theoretical LLR (NN Score)")
51 | if add_prob_thresh:
52 | # indicate the cutoff points for several common cutoff locations
53 | thresh_f = np.log((1 - mono_prob) / mono_prob)
54 | for p, col in zip(prob_threshs, PROB_COLORS):
55 | llr_x = np.log(p / (1 - p))
56 | thresh_val = np.argmin(np.abs(thresh_f - llr_x))
57 | nthresh_val = np.argmin(np.abs(thresh_f + llr_x))
58 | prop_filt = (
59 | sum(sm_ref[nthresh_val:thresh_val])
60 | + sum(sm_alt[nthresh_val:thresh_val])
61 | ) / (sum(sm_ref) + sum(sm_alt))
62 | for i in range(2):
63 | axarr[i].axvline(x=smooth_ls[thresh_val], color=col)
64 | axarr[i].axvline(x=smooth_ls[nthresh_val], color=col)
65 | axarr[2].axvline(x=smooth_ls[thresh_val], color=col)
66 | axarr[2].axvline(
67 | x=smooth_ls[nthresh_val],
68 | color=col,
69 | label=("--mod-binary-threshold={} (filters {:.0f}%)").format(
70 | p, 100 * prop_filt
71 | ),
72 | )
73 | axarr[2].legend(fontsize="small")
74 |
75 | pdf_fp.savefig(bbox_inches="tight")
76 | plt.close()
77 |
78 |
79 | def extract_llrs(llr_fn):
80 | llrs_data = np.load(llr_fn)
81 | mod_bases = llrs_data[mods.GT_ALL_MOD_BASE_STR]
82 | mod_base_llrs = {}
83 | for mod_base in mod_bases:
84 | mod_base_llrs[mod_base] = (
85 | llrs_data[mods.GT_MOD_LLR_STR.format(mod_base)],
86 | llrs_data[mods.GT_CAN_LLR_STR.format(mod_base)],
87 | )
88 |
89 | return mod_base_llrs
90 |
91 |
92 | def _main(args):
93 | logging.init_logger()
94 | mh.prep_out_fn(args.out_filename, args.overwrite)
95 |
96 | LOGGER.info("Parsing log-likelihood ratios")
97 | mod_base_llrs = extract_llrs(args.ground_truth_llrs)
98 |
99 | pdf_fp = None if args.out_pdf is None else PdfPages(args.out_pdf)
100 | save_kwargs = {}
101 | for mod_base, (mod_llrs, can_llrs) in mod_base_llrs.items():
102 | LOGGER.info("Computing {} modified base calibration.".format(mod_base))
103 | mod_calib, mod_llr_range, plot_data = calibration.compute_calibration(
104 | can_llrs,
105 | mod_llrs,
106 | args.max_input_llr,
107 | args.num_calibration_values,
108 | args.smooth_bandwidth,
109 | args.min_density,
110 | args.diff_epsilon,
111 | args.llr_clip_buffer,
112 | pdf_fp is not None,
113 | num_proc=args.processes,
114 | )
115 | save_kwargs[mod_base + calibration.LLR_RANGE_SUFFIX] = mod_llr_range
116 | save_kwargs[mod_base + calibration.CALIB_TABLE_SUFFIX] = mod_calib
117 | if pdf_fp is not None:
118 | plot_calib(
119 | pdf_fp,
120 | mod_base,
121 | *plot_data,
122 | args.pdf_prob_thresholds,
123 | not args.plot_without_prob_thresholds,
124 | )
125 | if pdf_fp is not None:
126 | pdf_fp.close()
127 |
128 |     # save calibration tables for later loading into modified base calibration
129 | LOGGER.info("Saving calibrations to file.")
130 | mod_bases = list(mod_base_llrs.keys())
131 | np.savez(
132 | args.out_filename,
133 | stratify_type=calibration.MOD_BASE_STRAT_TYPE,
134 | smooth_nvals=args.num_calibration_values,
135 | mod_bases=mod_bases,
136 | **save_kwargs,
137 | )
138 |
139 |
140 | if __name__ == "__main__":
141 | _main(get_parser_calibrate_modified_bases().parse_args())
142 |
--------------------------------------------------------------------------------
/megalodon_extras/variants_heterozygous_factor.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 | import pysam
4 | import numpy as np
5 |
6 | from megalodon import logging
7 | from ._extras_parsers import get_parser_variants_heterozygous_factor
8 |
9 |
10 | LOGGER = logging.get_logger()
11 |
12 | HOM_REF_TXT = "hom_ref"
13 | HET_TXT = "het"
14 | HOM_ALT_TXT = "hom_alt"
15 |
16 | SNP_TXT = "SNP"
17 | DEL_TXT = "DEL"
18 | INS_TXT = "INS"
19 |
20 | STAT_WIDTH = 12
21 | STATS_FMT_STR = "{:<" + str(STAT_WIDTH) + "}"
22 | FLOAT_FMT_STR = "{:<" + str(STAT_WIDTH) + ".4f}"
23 | STAT_NAMES = ("HomRef", "Het", "HomAlt", "F1", "Precision", "Recall")
24 | N_STATS = len(STAT_NAMES)
25 | N_INT_STATS = 3
26 | N_FLOAT_STATS = N_STATS - N_INT_STATS
27 | HEADER_TMPLT = STATS_FMT_STR * (N_STATS + 1)
28 | STATS_TMPLT = STATS_FMT_STR * (N_INT_STATS + 1) + FLOAT_FMT_STR * N_FLOAT_STATS
29 |
30 |
31 | def _main(args):
32 | def conv_call_str(gt_vals):
33 | gt_set = set(gt_vals)
34 | if gt_set == set([0]):
35 | return HOM_REF_TXT
36 | elif gt_set == set([0, 1]):
37 | return HET_TXT
38 | return HOM_ALT_TXT
39 |
40 | logging.init_logger()
41 | gt_calls = defaultdict(dict)
42 | for variant in pysam.VariantFile(args.ground_truth_variants).fetch():
43 |         # skip multi-allelic sites
44 | if variant.alts is None or len(variant.alts) > 1:
45 | continue
46 | if len(variant.ref) == len(variant.alts[0]):
47 | gt_calls[SNP_TXT][
48 | (variant.contig, variant.pos, variant.ref, variant.alts[0])
49 | ] = conv_call_str(variant.samples.values()[0]["GT"])
50 | elif len(variant.ref) > len(variant.alts[0]):
51 | gt_calls[DEL_TXT][
52 | (variant.contig, variant.pos, variant.ref, variant.alts[0])
53 | ] = conv_call_str(variant.samples.values()[0]["GT"])
54 | else:
55 | gt_calls[INS_TXT][
56 | (variant.contig, variant.pos, variant.ref, variant.alts[0])
57 | ] = conv_call_str(variant.samples.values()[0]["GT"])
58 | mega_calls = defaultdict(dict)
59 | for variant in pysam.VariantFile(args.megalodon_variants).fetch():
60 |         # skip multi-allelic sites
61 |         if variant.alts is None or len(variant.alts) > 1:
62 | continue
63 | if len(variant.ref) == len(variant.alts[0]):
64 | mega_calls[SNP_TXT][
65 | (variant.contig, variant.pos, variant.ref, variant.alts[0])
66 | ] = conv_call_str(variant.samples.values()[0]["GT"])
67 | elif len(variant.ref) > len(variant.alts[0]):
68 | mega_calls[DEL_TXT][
69 | (variant.contig, variant.pos, variant.ref, variant.alts[0])
70 | ] = conv_call_str(variant.samples.values()[0]["GT"])
71 | else:
72 | mega_calls[INS_TXT][
73 | (variant.contig, variant.pos, variant.ref, variant.alts[0])
74 | ] = conv_call_str(variant.samples.values()[0]["GT"])
75 |
76 | for var_type in (SNP_TXT, DEL_TXT, INS_TXT):
77 | counts = defaultdict(int)
78 | for chrm_pos_ref_alt in set(gt_calls[var_type]).intersection(
79 | mega_calls[var_type]
80 | ):
81 | counts[
82 | (
83 | gt_calls[var_type][chrm_pos_ref_alt],
84 | mega_calls[var_type][chrm_pos_ref_alt],
85 | )
86 | ] += 1
87 |
88 | # compute F1 stat
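        # counts is keyed by (ground truth call, megalodon call); summing
        # over megalodon calls gives the recall denominator and summing
        # over ground truth calls gives the precision denominator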
89 | vt_stats = []
90 | for truth_type in (HOM_REF_TXT, HET_TXT, HOM_ALT_TXT):
91 | gt_count = sum(
92 | counts[(truth_type, mega_call)]
93 | for mega_call in (HOM_REF_TXT, HET_TXT, HOM_ALT_TXT)
94 | )
95 | mega_count = sum(
96 | counts[(gt_call, truth_type)]
97 | for gt_call in (HOM_REF_TXT, HET_TXT, HOM_ALT_TXT)
98 | )
99 | if gt_count == 0 or mega_count == 0:
100 | vt_stats.append((np.NAN, np.NAN, np.NAN))
101 | else:
102 | prec = counts[(truth_type, truth_type)] / mega_count
103 | recall = counts[(truth_type, truth_type)] / gt_count
104 | vt_stats.append(
105 | (2 * (prec * recall) / (prec + recall), prec, recall)
106 | )
107 |
108 | # print output
109 | LOGGER.info(var_type)
110 | LOGGER.info(HEADER_TMPLT.format("Truth\tCalls", *STAT_NAMES))
111 | for truth, (f1, prec, recall) in zip(
112 | (HOM_REF_TXT, HET_TXT, HOM_ALT_TXT), vt_stats
113 | ):
114 | LOGGER.info(
115 | STATS_TMPLT.format(
116 | truth,
117 | counts[(truth, HOM_REF_TXT)],
118 | counts[(truth, HET_TXT)],
119 | counts[(truth, HOM_ALT_TXT)],
120 | f1,
121 | prec,
122 | recall,
123 | )
124 | )
125 | mean_f1_fmt = (
126 | "{:>"
127 | + str(STAT_WIDTH * (N_STATS - 2))
128 | + "}"
129 | + FLOAT_FMT_STR * N_FLOAT_STATS
130 | + "\n"
131 | )
132 | mean_stats = map(np.nanmean, zip(*vt_stats))
133 | LOGGER.info(mean_f1_fmt.format("Mean Stats: ", *mean_stats))
134 |
135 |
136 | if __name__ == "__main__":
137 | _main(get_parser_variants_heterozygous_factor().parse_args())
138 |
--------------------------------------------------------------------------------
/megalodon_extras/modified_bases_split_by_motif.py:
--------------------------------------------------------------------------------
1 | from collections import namedtuple
2 |
3 | import pysam
4 | from tqdm import tqdm
5 |
6 | from megalodon import backends, logging, mods, megalodon_helper as mh
7 | from ._extras_parsers import get_parser_modified_bases_split_calls_by_motif
8 |
9 |
10 | LOGGER = logging.get_logger()
11 |
12 | MOTIF_INFO = namedtuple(
13 | "MOTIF_INFO",
14 | ("bases_before", "bases_after", "raw_motif", "motif", "rc_motif"),
15 | )
16 |
17 |
18 | ########################
19 | # data table functions #
20 | ########################
21 |
22 |
23 | def split_data(in_mods_db, out_mods_dbs, ref):
24 | LOGGER.info("Inserting modified base data")
25 | bar = tqdm(
26 | desc="Inserting Data",
27 | unit="per-read calls",
28 | total=in_mods_db.get_num_uniq_stats(),
29 | smoothing=0,
30 | dynamic_ncols=True,
31 | )
32 | curr_ref_seq = curr_chrm = None
33 | # TODO multiprocess over contigs (need to implement iteration over range
34 | # of pos_dbids via chrm string)
35 | for pos_dbid, pos_mod_data in in_mods_db.iter_pos_scores():
36 | bar.update(len(pos_mod_data))
37 | chrm, strand, pos = in_mods_db.get_pos(pos_dbid)
38 | if chrm != curr_chrm:
39 | curr_chrm = chrm
40 | curr_ref_seq = ref.fetch(chrm)
41 | for out_mods_db, motif_info in out_mods_dbs:
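                # on the forward strand, match the motif directly around
                # pos; on the reverse strand, match the reverse-complement
                # motif with the before/after offsets swapped so that pos
                # remains the modified base position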
42 | motif_match = (
43 | motif_info.motif.match(
44 | curr_ref_seq[
45 | pos
46 | - motif_info.bases_before : pos
47 | + motif_info.bases_after
48 | + 1
49 | ]
50 | )
51 | if strand == 1
52 | else motif_info.rc_motif.match(
53 | curr_ref_seq[
54 | pos
55 | - motif_info.bases_after : pos
56 | + motif_info.bases_before
57 | + 1
58 | ]
59 | )
60 | )
61 | if motif_match is not None:
62 | pos_insert_data = [
63 | (lp, pos_dbid, mod_dbid, read_dbid)
64 | for read_dbid, mod_dbid, lp in pos_mod_data
65 | ]
66 | out_mods_db.insert_batch_data(pos_insert_data)
67 | break
68 | bar.close()
69 |
70 |
71 | ##########
72 | # motifs #
73 | ##########
74 |
75 |
76 | def parse_motifs(raw_motifs):
77 | motifs = []
78 | for raw_motif, bases_before in raw_motifs:
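        # each raw motif entry is (sequence motif, 0-based offset of the
        # modified base within the motif); bases_after is the number of
        # motif bases following the modified position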
79 | bases_before = int(bases_before)
80 | bases_after = len(raw_motif) - bases_before - 1
81 | motif = mh.compile_motif_pat(raw_motif)
82 | rc_motif = mh.compile_rev_comp_motif_pat(raw_motif)
83 | motifs.append(
84 | MOTIF_INFO(
85 | bases_before=bases_before,
86 | bases_after=bases_after,
87 | raw_motif=raw_motif,
88 | motif=motif,
89 | rc_motif=rc_motif,
90 | )
91 | )
92 |
93 | return motifs
94 |
95 |
96 | ########
97 | # main #
98 | ########
99 |
100 |
101 | def _main(args):
102 | logging.init_logger(args.megalodon_directory, out_suffix=args.output_suffix)
103 |
104 | # parse motifs
105 | motifs = parse_motifs(args.motif)
106 | # open indexed FASTA reference
107 | ref = pysam.FastaFile(args.reference)
108 |
109 | LOGGER.info("Extracting mods and chrms from input database")
110 | in_mods_db = mods.ModsDb(
111 | mh.get_megalodon_fn(args.megalodon_directory, mh.PR_MOD_NAME)
112 | )
113 | alphabet, _, mod_long_names = in_mods_db.get_alphabet_info()
114 | ref_names_and_lens = list(zip(*in_mods_db.iter_chrms()))[1:]
115 | LOGGER.info("Extracting read uuid table")
116 | in_uuids = [uuid for _, uuid in in_mods_db.iter_uuids()]
117 |
118 | LOGGER.info("Opening new per-read modified base statistics databases")
119 | model_info = backends.DetachedModelInfo(
120 | alphabet=alphabet, mod_long_names=mod_long_names
121 | )
122 | out_mods_dbs = []
123 | for motif_info in motifs:
124 | out_dir = "{}.{}_{}".format(
125 | args.output_prefix, motif_info.raw_motif, motif_info.bases_before
126 | )
127 | mh.mkdir(out_dir, overwrite=False)
128 | mods_info = mods.ModInfo(model_info, out_dir=out_dir)
129 | mods.init_mods_db(mods_info, ref_names_and_lens)
130 | out_mods_dbs.append(
131 | (mods.ModsDb(mods_info.mods_db_fn, read_only=False), motif_info)
132 | )
133 | out_mods_dbs[-1][0].insert_uuids(in_uuids)
134 | out_mods_dbs[-1][0].commit()
135 |
136 | # commit so read uuids are available to worker processes
137 | LOGGER.info("Inserting per-read calls from input databases")
138 | split_data(in_mods_db, out_mods_dbs, ref)
139 |
140 |     # TODO do this in separate processes
141 | LOGGER.info(
142 | "Creating data covering indices for efficient iteration by position"
143 | )
144 | for out_mods_db, _ in out_mods_dbs:
145 | out_mods_db.create_data_covering_index()
146 | out_mods_db.commit()
147 | out_mods_db.close()
148 | LOGGER.info("Finished indexing {}".format(out_mods_db.fn))
149 |
150 |
151 | if __name__ == "__main__":
152 | _main(get_parser_modified_bases_split_calls_by_motif().parse_args())
153 |
--------------------------------------------------------------------------------
/docs/extras_variants.rst:
--------------------------------------------------------------------------------
1 | *****************************
2 | ``megalodon_extras variants``
3 | *****************************
4 |
5 | The ``megalodon_extras variants`` command group contains various commands related to sequence variant processing within Megalodon.
6 |
7 | -------------------------------------
8 | ``megalodon_extras variants atomize``
9 | -------------------------------------
10 |
11 | This command processes each variant in a VCF file to convert the variants to their atomic form.
12 | When the variant file produced by this command is used in the ``megalodon`` command with the ``--variants-are-atomized`` flag, processing speed can be drastically increased.
13 | This is especially true for high-density variant files.
14 |
15 | For each variant processed in Megalodon the "atomic" variant is required for the highest quality results.
16 | For single nucleotide variants (SNVs) the atomic form simply reduces any multi-nucleotide substitution into its component single nucleotide swaps.
17 | For insertions and deletions (indels) the atomic form removes context bases and expands the indel to include all unambiguous positions.
18 | For example, consider the reference sequence ``AGCGCA``.
19 | An insertion of ``GC`` after the first base could be validly annotated in a VCF file with reference allele ``A`` and alternative allele ``AGC``.
20 | But the atomic form of this variant would be reference allele ``GCGC`` and alternative allele ``GCGCGC``.
21 | In this way atomic variants capture all reference bases impacted by a variant and no more.
22 | Processing variants in this way ensures that the correct bit of sequence and raw signal is considered when determining the correct allele within Megalodon.
23 |
24 | The conversion of each variant to its atomic form must be repeated each time a variant overlaps a read, since VCF files can often be too large to store fully in RAM.
25 | Thus the computational cost of this step can be quite high for high coverage samples or dense variant files.
26 | This command allows this atomizing step to be performed once before starting a ``megalodon`` run.
27 | When passing a variant file processed with this command, the ``--variants-are-atomized`` flag should be specified.
28 | Note that this flag should not be used with VCF files originating from other sources.
29 | Specifically, this command adds a non-standard ``CB`` (context base) VCF flag in order to indicate that a context base added to a variant is not part of the atomic form of that variant.
30 |
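To make the expansion concrete, the following minimal Python sketch (illustrative only, not Megalodon's implementation, which also handles deletions and the ``CB`` bookkeeping described above) expands an insertion to its atomic form by extending right while the reference continues to repeat the inserted sequence.

::

    def atomic_insertion(ref_seq, pos, insert):
        # expand an insertion placed after 0-based reference position
        # ``pos`` to cover every base over which its placement is ambiguous
        k = 0
        while (pos + 1 + k < len(ref_seq)
                and ref_seq[pos + 1 + k] == insert[k % len(insert)]):
            k += 1
        span = ref_seq[pos + 1:pos + 1 + k]
        return span, span + insert  # atomic (ref allele, alt allele)

    # the ``AGCGCA`` example above yields ("GCGC", "GCGCGC")
    print(atomic_insertion("AGCGCA", 0, "GC"))
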
31 | -------------------------------------
32 | ``megalodon_extras variants resolve``
33 | -------------------------------------
34 |
35 | The ``megalodon_extras variants resolve`` command resolves conflicting variants and removes variants determined to be the reference allele.
36 |
37 | Megalodon processes sequence variants to make a call for each variant individually (taking nearby variants into account).
38 | Nearby variants are often overlapping and thus incompatible.
39 | The primary use for this command is to resolve situations where multiple overlapping variants are called.
40 | At each site with overlapping variants, the one with the highest probability is selected for the output file.
41 |
42 | In addition, this command has options to filter variants which were called as the reference allele (``--max-likelihood-ratio``), filter for coverage (``--min-depth``), and revert atomic variant notation (``--trim-variants``).
43 |
44 | There are also a number of options to inspect potential systematic bias in identified sequence variants.
45 | Providing a VCF file called only from reverse strand mapping reads via the ``--reverse-strand-variants`` argument activates this output.
46 | The main VCF provided is then assumed to be derived from forward strand mapping reads only.
47 | In this mode, variants are output when they are identified only on one strand and not the other to allow analysis of potential bias in basecalling models.
48 | This feature is experimental and does not have defined pipelines for downstream use.
49 |
50 | -------------------------------------------------
51 | ``megalodon_extras variants heterozygous_factor``
52 | -------------------------------------------------
53 |
54 | Evaluate the effect of the heterozygous factor on calling the correct balance of homozygous and heterozygous variants.
55 |
56 | This command can assist in setting an optimal value for the ``--heterozygous-factors`` argument.
57 | The default value is intended to minimize the number of false homozygous calls.
58 | The recommended phased variant pipeline for Megalodon processes variants such that if a variant is not initially called heterozygous, a heterozygous call cannot be made, but heterozygous calls can be converted to homozygous calls.
59 | Since this phased variant pipeline is recommended in order to obtain the highest quality variants, false homozygous calls are minimized.
60 |
61 | If the aim is to achieve a balance of homozygous and heterozygous calls, this command can be used to evaluate this balance for a particular ``--heterozygous-factors`` setting.
62 | This command outputs the numbers of homozygous and heterozygous calls compared to their ground truth values from a provided set of variants.
63 | As a guide, previous Megalodon versions used a default of ``--heterozygous-factors 2.1 1.6``, which achieves a better balance than the current default of ``--heterozygous-factors 1.0 1.0``, which minimizes false homozygous calls.
64 |
65 | --------------------------------------------
66 | ``megalodon_extras variants index_database``
67 | --------------------------------------------
68 |
69 | This command is not currently implemented, but will be in a future release.
70 |
--------------------------------------------------------------------------------
/docs/common_arguments.rst:
--------------------------------------------------------------------------------
1 | ****************
2 | Common Arguments
3 | ****************
4 |
5 | -----------------
6 | Required Argument
7 | -----------------
8 |
9 | - ``fast5s_dir``
10 |
11 | - Path to directory containing raw FAST5-format nanopore reads.
12 | - Both single and multi FAST5 formats are supported.
13 | - Default searches recursively for FAST5 read files. To search only one level, specify ``--not-recursive``.
14 |
15 | ----------------------
16 | Guppy Backend Argument
17 | ----------------------
18 |
19 | - ``--guppy-config``
20 |
21 | - Guppy config.
22 | - Default: ``dna_r9.4.1_450bps_modbases_5mc_hac.cfg``
23 |
24 | - ``--guppy-server-path``
25 |
26 | - Path to guppy server executable.
27 | - Default: ``./ont-guppy/bin/guppy_basecall_server``
28 |
29 | ----------------
30 | Output Arguments
31 | ----------------
32 |
33 | - ``--live-processing``
34 |
35 | - As of version 2.2, Megalodon now supports live run processing.
36 | - Activate live processing mode by simply adding the ``--live-processing`` argument and specifying the MinKNOW output directory as the input FAST5 directory.
37 | - Megalodon will continue to search for FAST5s until the ``final_summary*`` file is created by MinKNOW, indicating data production has completed.
38 | - ``--outputs``
39 |
40 | - Specify desired outputs.
41 | - Options are ``basecalls``, ``mod_basecalls``, ``mappings``, ``variant_mappings``, ``mod_mappings``, ``per_read_variants``, ``per_read_mods``, ``variants``, and ``mods``.
42 |
43 | - ``mod_basecalls`` are output in a BAM file via the ``Mm`` and ``Ml`` tags `described by hts-specs here `_.
44 | - ``variant_mappings`` are intended for obtaining highly accurate phased variant genotypes, but also provide a nice genome browser visualization of per-read variant calls.
45 |
46 | - These mappings contain reference sequence at all positions except for per-read called variants. The base quality scores encode the likelihood for that reference anchored variant for use in the ``whatshap`` phasing algorithm.
47 | - ``mod_mappings`` provide reference-anchored per-read modified base calls.
48 |
49 | - As of version 2.2, the default output uses the ``Mm`` and ``Ml`` hts-specs tags (see above) with all modified bases in one output file.
50 | - Specify the ``--mod-map-emulate-bisulfite`` option to output one BAM per modified base, with modified bases converted using ``--mod-map-base-conv``.
51 |
52 | - This file is useful for visualizing per-read modified base calls (e.g. IGV bisulfite mode for CpG calls).
53 | - This file may also allow a port to standard bisulfite pipelines that are capable of processing long reads.
54 | - Default output is ``basecalls`` only.
55 | - ``--output-directory``
56 |
57 | - Specify the directory to output results.
58 | Default: ``megalodon_results``
59 | - ``--overwrite``
60 |
61 | - Overwrite the ``--output-directory`` if it exists.
62 | - Note that this is a recursive file deletion and should be used with caution.
63 |
64 | -----------------
65 | Mapping Arguments
66 | -----------------
67 |
68 | - ``--mappings-format``
69 |
70 | - Format for ``mappings`` output.
71 | - Options include ``bam`` (default), ``cram``, and ``sam``.
72 | - As of version 2.2, mappings are no longer sorted by default.
73 |
74 | - Set ``--sort-mappings`` to sort mappings. If ``samtools`` is not in ``$PATH`` provide path to executable via the ``--samtools-executable`` argument.
75 | - ``--reference``
76 |
77 | - Reference genome or transcriptome in FASTA or minimap2 index format.
78 |
79 | - If ``--reference`` is a minimap2 index and ``--mappings-format`` is ``cram``, provide a FASTA reference via ``--cram-reference``.
80 |
81 | --------------------------
82 | Sequence Variant Arguments
83 | --------------------------
84 |
85 | - ``--haploid``
86 |
87 | - Compute sequence variants assuming a haploid reference. Default: diploid
88 | - ``--variant-filename``
89 |
90 | - File containing putative variants in VCF/BCF format.
91 |
92 | - Variants file must be sorted.
93 | - If the variants file is not compressed and indexed, this will be performed before further processing.
94 | - Variants must be matched to the ``--reference`` provided.
95 |
96 | -----------------------
97 | Modified Base Arguments
98 | -----------------------
99 |
100 | - ``--mod-motif``
101 |
102 | - Restrict modified base results to the specified motifs.
103 | - This argument takes 3 values representing:
104 |
105 | 1. Modified base single letter codes (see ``megalodon_extras modified_bases describe_alphabet`` command)
106 | 2. Canonical sequence motif (may contain `ambiguity codes `_)
107 | 3. Relative position (0-based) of the modified base within the canonical sequence motif
108 | - Multiple ``--mod-motif`` arguments can be provided to a single ``megalodon`` command.
109 | - If not provided (and ``per_read_mods`` or ``mods`` outputs requested) all relevant sites are tested (e.g. all ``C`` bases for ``5mC``).
110 |
111 | - Note that restricting results to motifs of interest skips computationally expensive steps, making this option more than a simple post-processing filter.
112 |
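For example, ``--mod-motif m CG 0`` restricts 5mC calls to CpG sites.
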
113 | --------------------------
114 | Compute Resource Arguments
115 | --------------------------
116 |
117 | - ``--processes``
118 |
119 | - Number of CPU read-processing workers to spawn.
120 | - ``--devices``
121 |
122 | - GPU devices to use for basecalling acceleration.
123 | - If not provided CPU basecalling will be performed.
124 | - Device names can be provided in the following formats: ``0``, ``cuda0`` or ``cuda:0``.
125 | - Multiple devices can be specified separated by a space.
126 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | *************************************
2 | Welcome to Megalodon's documentation!
3 | *************************************
4 |
5 | Megalodon is a research command line tool to extract high accuracy modified base and sequence variant calls from raw nanopore reads by anchoring the information rich basecalling neural network output to a reference genome/transcriptome.
6 |
7 | Raw nanopore reads are processed by a single command to produce basecalls (FASTA/Q), reference mappings (SAM/BAM/CRAM), modified base calls (per-read and aggregated per-reference site), sequence variant calls (per-read and aggregated per-reference site) and more.
8 |
9 | -------------
10 | Prerequisites
11 | -------------
12 |
13 | The primary Megalodon run mode requires the Guppy basecaller (version >= 4.0).
14 | See the `community page for download/installation instructions [login required] `_.
15 |
16 | Megalodon is a python-based command line software package.
17 | Given a python (version >= 3.5) installation, all other requirements are handled by ``pip`` or ``conda``.
18 |
19 | ..
20 |
21 | `Taiyaki `_ is no longer required to run Megalodon, but installation is required for two specific run modes:
22 |
23 | 1) output mapped signal files (for basecall model training)
24 |
25 | 2) running the Taiyaki basecalling backend (for neural network designs including experimental layers)
26 |
27 | ------------
28 | Installation
29 | ------------
30 |
31 | ``pip`` is recommended for Megalodon installation.
32 |
33 | ::
34 |
35 | pip install megalodon
36 |
37 | ``conda`` installation is available, but not fully supported.
38 | ``ont_pyguppy_client_lib`` is not available on conda and thus must be installed with ``pip``.
39 |
40 | ::
41 |
42 | conda install megalodon
43 | pip install ont_pyguppy_client_lib
44 |
45 | To install from github source for development, the following commands can be run.
46 |
47 | ::
48 |
49 | git clone https://github.com/nanoporetech/megalodon
50 | pip install -e megalodon/
51 |
52 | It is recommended that Megalodon be installed in a controlled compute environment.
53 | See `the python documentation for preparing virtual environments `_.
54 |
55 | ===========
56 | Quick Start
57 | ===========
58 |
59 | Megalodon must obtain the intermediate output from the basecall neural network.
60 | Guppy (production nanopore basecalling software) is the recommended backend to obtain this output from raw nanopore signal (from FAST5 files).
61 | Nanopore basecalling is compute intensive and thus it is highly recommended that GPU resources are specified (``--devices``) for optimal Megalodon performance.
62 |
63 | Megalodon is accessed via the command line interface ``megalodon`` command.
64 |
65 | ::
66 |
67 | # megalodon help (common args)
68 | megalodon -h
69 | # megalodon help (advanced args)
70 | megalodon --help-long
71 |
72 | # Example command to output basecalls, mappings, and 5mC CpG methylation in both per-read (``mod_mappings``) and aggregated (``mods``) formats
73 | # Compute settings: GPU devices 0 and 1 with 40 CPU cores
74 | megalodon \
75 | raw_fast5s/ \
76 | --outputs basecalls mappings mod_mappings mods \
77 | --reference reference.fa --mod-motif m CG 0 \
78 | --devices 0 1 --processes 40
79 |
80 | This command produces the ``megalodon_results`` output directory containing all requested output files and logs.
81 | The format for common outputs is described briefly below and in more detail in the `full documentation `_
82 |
83 | The above command uses the modified base model included in Guppy.
84 | As of the ``2.3.0`` megalodon release (March 2021), the models included with Guppy (``4.5.2``) are the most accurate modified base models available.
85 | As more accurate basecalling models are trained, they are first released into the `Rerio repository for research models `_.
86 | Once training pipelines are more thoroughly standardized and tested, models will be transferred into Guppy.
87 | The code below shows how to obtain and run the R9.4.1, MinION/GridION, 5mC CpG model from Rerio.
88 | Note that this is the same model now included in Guppy ``4.5.2``.
89 |
90 | ::
91 |
92 | # Obtain and run R9.4.1, MinION, 5mC CpG model from Rerio
93 | git clone https://github.com/nanoporetech/rerio
94 | rerio/download_model.py rerio/basecall_models/res_dna_r941_min_modbases_5mC_CpG_v001
95 | megalodon \
96 | raw_fast5s/ \
97 | --guppy-params "-d ./rerio/basecall_models/" \
98 | --guppy-config res_dna_r941_min_modbases_5mC_CpG_v001.cfg \
99 | --outputs basecalls mappings mod_mappings mods \
100 | --reference reference.fa --mod-motif m CG 0 \
101 | --devices 0 1 --processes 40
102 |
103 | ..
104 |
105 | The path to the ``guppy_basecall_server`` executable is required to run Megalodon.
106 | By default, Megalodon assumes Guppy (Linux GPU) is installed in the current working directory (i.e. ``./ont-guppy/bin/guppy_basecall_server``).
107 | Use the ``--guppy-server-path`` argument to specify a different path.
108 |
109 | --------
110 | Contents
111 | --------
112 |
113 | .. toctree::
114 | :maxdepth: 2
115 |
116 | algorithm_details
117 | common_arguments
118 | advanced_arguments
119 | computing_considerations
120 | variant_phasing
121 | file_formats
122 | model_training
123 | modbase_training
124 | extras_aggregate
125 | extras_calibrate
126 | extras_merge
127 | extras_modified_bases
128 | extras_phase_variants
129 | extras_per_read_text
130 | extras_validate
131 | extras_variants
132 |
--------------------------------------------------------------------------------
/docs/extras_modified_bases.rst:
--------------------------------------------------------------------------------
1 | ***********************************
2 | ``megalodon_extras modified_bases``
3 | ***********************************
4 |
5 | The ``megalodon_extras modified_bases`` command group contains various commands related to modified base processing within Megalodon.
6 |
7 | -----------------------------------------------------
8 | ``megalodon_extras modified_bases describe_alphabet``
9 | -----------------------------------------------------
10 |
11 | Describe the alphabet, including modified bases, found in a given model.
12 |
13 | This command is useful to determine the syntax for specifying arguments related to modified base detection.
14 |
15 | Note that originally modified bases were specified by arbitrary values, but recent and future models will attempt to follow single letter codes specified by `samtools hts-spec (currently an open issue) `_.
16 |
17 | A minimal subset of the model specifications from the main ``megalodon`` command is available, allowing the model to be specified exactly as in the main command.
18 | The model will be loaded as normal and the alphabet used will be printed.
19 |
20 | ------------------------------------------------------
21 | ``megalodon_extras modified_bases estimate_threshold``
22 | ------------------------------------------------------
23 |
24 | Estimate the optimal global modified base score threshold such that the estimated proportion of bases modified is achieved when all sites are called.
25 | This command is useful when producing the ``signal_mappings`` or ``per_read_refs`` outputs with modified bases annotated for basecaller training with Taiyaki.
26 |
27 | This command works by estimating the proportion of bases modified from a sample using the most extreme calls (default most extreme 8%; set with ``--mod-percentile`` option) as a truth set.
28 | This method assumes that the distribution of modified base and canonical scores is approximately balanced.
29 | This may not be true for all models.
30 | The plots produced by the ``megalodon_extras calibrate modified_bases`` command can assist in making this determination.
31 |
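As a rough illustration of this estimation (a sketch with assumed score sign conventions and function names, not the packaged implementation):

::

    import numpy as np

    def estimate_global_threshold(llrs, mod_percentile=8.0):
        # treat the most extreme per-read scores (by absolute value) as a
        # provisional truth set
        ext_cutoff = np.percentile(np.abs(llrs), 100 - mod_percentile)
        ext_llrs = llrs[np.abs(llrs) >= ext_cutoff]
        # estimate the global fraction modified, assuming negative LLRs
        # indicate the modified base
        frac_mod = np.mean(ext_llrs < 0)
        # return the score threshold that reproduces this fraction when
        # all sites are called
        return np.percentile(llrs, frac_mod * 100)
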
32 | ----------------------------------------------------------------
33 | [DEPRECATED] ``megalodon_extras modified_bases update_database``
34 | ----------------------------------------------------------------
35 |
36 | This method has not been updated for the most recent modified base schema and thus is currently deprecated.
37 |
38 | -------------------------------------------------------
39 | ``megalodon_extras modified_bases create_ground_truth``
40 | -------------------------------------------------------
41 |
42 | Create a set of ground truth sites from a bedmethyl file.
43 | This is a convenience command to apply a threshold to observed fractions of modified bases from bedmethyl files produced by Megalodon or other software.
44 |
45 | The ``--strand-offset`` option is provided to allow calls on opposite strands to be combined.
46 | For example, forward and reverse strand CpG calls can be merged by setting ``--strand-offset 1``, since the reverse strand position one base downstream of the forward strand position corresponds to the same biological methylation event.
47 |
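The merge can be pictured with the following sketch (an assumed in-memory representation for illustration, not the command's implementation):

::

    def merge_strand_counts(site_counts, strand_offset=1):
        """site_counts maps (chrm, pos, strand) to (n_mod, n_total)."""
        merged = {}
        for (chrm, pos, strand), (n_mod, n_tot) in site_counts.items():
            if strand == "-":
                # shift reverse strand calls onto the forward strand site
                pos -= strand_offset
            prev_mod, prev_tot = merged.get((chrm, pos), (0, 0))
            merged[(chrm, pos)] = (prev_mod + n_mod, prev_tot + n_tot)
        return merged
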
48 | ----------------------------------------------------
49 | ``megalodon_extras modified_bases create_motif_bed``
50 | ----------------------------------------------------
51 |
52 | This is a helper command to take a reference FASTA file and produce a BED file with all the locations of a motif of interest.
53 | This can be useful for a number of modified base pipelines.
54 |
55 | -------------------------------------------------------
56 | ``megalodon_extras modified_bases per_site_thresholds``
57 | -------------------------------------------------------
58 |
59 | See the `ground truth aided bootstrap modified base annotation `_ tutorial for more complete instructions on using this command.
60 |
61 | This command is targeted at creating higher accuracy modified base training data sets (mapped signal files) from a preliminary modified base model and a fractionally (at each reference site) modified sample.
62 | This command takes as input a Megalodon run with ``--outputs per_read_mods mappings`` and a ground truth bedmethyl file and produces a custom modified base threshold at each reference site.
63 | These modified base thresholds will match the Megalodon modified base statistics with the ground truth fraction of modified bases at each reference site.
64 | The derived thresholds should then be supplied to Megalodon via the ``--mod-per-site-threshold`` argument along with the ``--outputs signal_mappings``.
65 |
66 | This command will also output a low coverage BED file containing reference sites covered by either too few ground truth or nanopore reads.
67 | This should be used to filter reads for the final training mapped signal data set.
68 | A read covering any low coverage sites will not be annotated as accurately and thus should not be included in model training.
69 |
70 | --------------------------------------------------
71 | ``megalodon_extras modified_bases index_database``
72 | --------------------------------------------------
73 |
74 | In certain instances a megalodon run may end unexpectedly (e.g. out of memory error).
75 | In most cases the modified bases database is not corrupted by such an unexpected run termination.
76 | Such a termination, however, leaves the modified base database without the completed indexing step, which is required for most downstream uses.
77 | This command will produce the index as a separate step in such instances.
78 |
79 | --------------------------------------------------
80 | ``megalodon_extras modified_bases split_by_motif``
81 | --------------------------------------------------
82 |
83 | Split an input modified base database into smaller databases based on reference sequence motifs.
84 | This command enables the computationally expensive ``megalodon`` command to be run only once, while still allowing motif specific analyses.
85 |
--------------------------------------------------------------------------------
/megalodon/megalodon_multiprocessing.py:
--------------------------------------------------------------------------------
1 | from time import sleep
2 | import multiprocessing as mp
3 | from collections import namedtuple
4 | from multiprocessing.connection import wait
5 | from multiprocessing.queues import Queue as mpQueue
6 |
7 | from megalodon import logging, megalodon_helper as mh
8 |
9 |
10 | # fix error `TypeError: cannot pickle '_thread.lock' object` on Mac + python3.8
11 | try:
12 | mp.set_start_method("fork")
13 | except RuntimeError:
14 | pass
15 |
16 | _FULL_SLEEP_TIME = 1
17 |
18 | GETTER_QPC = namedtuple("getter_qpc", ("queue", "proc", "conn"))
19 |
20 | LOGGER = logging.get_logger()
21 |
22 |
23 | ###########################
24 | # Multi-processing Helper #
25 | ###########################
26 |
27 |
28 | class CountingMPQueue(mpQueue):
29 | """Minimal version of multiprocessing queue maintaining a queue size
30 | counter
31 | """
32 |
33 | def __init__(self, **kwargs):
34 | self.name = None
35 | if "name" in kwargs:
36 | self.name = kwargs["name"]
37 | del kwargs["name"]
38 | super().__init__(ctx=mp.get_context(), **kwargs)
39 | self._size = mp.Value("i", 0)
40 | self.maxsize = None
41 | if "maxsize" in kwargs:
42 | self.maxsize = kwargs["maxsize"]
43 |
44 | def put(self, *args, **kwargs):
45 | super().put(*args, **kwargs)
46 | with self._size.get_lock():
47 | self._size.value += 1
48 |
49 | def get(self, *args, **kwargs):
50 | rval = super().get(*args, **kwargs)
51 | with self._size.get_lock():
52 | self._size.value -= 1
53 | return rval
54 |
55 | def qsize(self):
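        # the counter is updated after the underlying queue operation, so
        # it can transiently disagree with the true queue state; clamp it
        # into the valid range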
56 | qsize = max(0, self._size.value)
57 | if self.maxsize is not None:
58 | return min(self.maxsize, qsize)
59 | return qsize
60 |
61 | def empty(self):
62 | return self.qsize() <= 0
63 |
64 |
65 | def create_getter_qpc(
66 | getter_func, args, max_size=mh._MAX_QUEUE_SIZE, name=None
67 | ):
68 | """Spawn a new "getter" process. This process will use target=getter_func.
69 | A new queue and pipe connection will be passed to this function as the
70 | first two arguments, followed by *args. A mega_mp.GETTER_QPC will be
71 | returned containing the created mp.Queue, the mp.Process object and the
72 | other end of the mp.Pipe connection.
73 |
74 | Note the connection object is intended to communicate to the getter process
75 |     that worker processes have concluded. Send True (or any value) to the
76 |     connection to trigger the getter process to exit after exhausting the queue.
77 | """
78 | if max_size is None:
79 | q = CountingMPQueue(name=name)
80 | else:
81 | q = CountingMPQueue(maxsize=max_size, name=name)
82 | main_conn, conn = mp.Pipe()
83 | p = mp.Process(
84 | target=getter_func, daemon=True, args=(q, conn, *args), name=name
85 | )
86 | p.start()
87 | return GETTER_QPC(q, p, main_conn)
88 |
89 |
90 | class ConnWithSize:
91 | def __init__(
92 | self,
93 | conn,
94 | size,
95 | max_size=mh._MAX_QUEUE_SIZE,
96 | name="ConnWithSize",
97 | full_sleep_time=_FULL_SLEEP_TIME,
98 | ):
99 | if not isinstance(conn, mp.connection.Connection):
100 | raise mh.MegaError(
101 | (
102 | "ConnWithSize initialized with non-connection object. "
103 | + "Object type: {}"
104 | ).format(type(conn))
105 | )
106 |         if not isinstance(
107 |             size, mp.sharedctypes.Synchronized
108 |         ) or not isinstance(size.value, int):
109 | raise mh.MegaError(
110 | (
111 | "ConnWithSize initialized with non-synchronized size "
112 | + "object. Object type: {}"
113 | ).format(type(size))
114 | )
115 | self._conn = conn
116 | self._size = size
117 | self.max_size = max_size
118 | self.full_sleep_time = full_sleep_time
119 | self.name = name
120 |
121 | def qsize(self):
122 | return max(min(self._size.value, mh._MAX_QUEUE_SIZE), 0)
123 |
124 | def full(self):
125 | if self.max_size is None:
126 | return False
127 | return self.qsize() >= self.max_size
128 |
129 | def put(self, value):
130 |         # enforce an artificial queue max size over the simplex pipe
131 | if self.full():
132 | LOGGER.debug("ThrottlingSimplexQueue")
133 | sleep(self.full_sleep_time)
134 | with self._size.get_lock():
135 | self._size.value += 1
136 | self._conn.send(value)
137 |
138 | def close(self):
139 | self._conn.close()
140 | del self._conn
141 |
142 |
143 | class SimplexManyToOneQueue:
144 | """This object is a more efficient version of a multiprocessing.Queue for
145 | use when many connections will send information in one direction to a
146 | single connection.
147 |
148 | The get_conn class function will return a ConnWithSize object which can
149 |     send information to be received by the wait_recv function of this class.
150 | """
151 |
152 | def __init__(
153 | self,
154 | return_conns=True,
155 | max_size=mh._MAX_QUEUE_SIZE,
156 | name="SimplexQueue",
157 | ):
158 | self.return_conns = return_conns
159 | self._conns = []
160 | self._size = mp.Value("i", 0)
161 | self.max_size = max_size
162 | self.name = name
163 |
164 | def get_conn(self):
165 | if not self.return_conns:
166 | return
167 | _my_conn, r_conn = mp.Pipe(duplex=False)
168 | self._conns.append(_my_conn)
169 | return ConnWithSize(r_conn, self._size, self.max_size, self.name)
170 |
171 | def qsize(self):
172 | return max(min(self._size.value, mh._MAX_QUEUE_SIZE), 0)
173 |
174 | def empty(self):
175 | return self.qsize() <= 0
176 |
177 | @property
178 | def has_valid_conns(self):
179 | return len(self._conns) > 0
180 |
181 | def wait_recv(self):
182 | for conn in wait(self._conns):
183 | try:
184 | r_val = conn.recv()
185 | with self._size.get_lock():
186 | self._size.value -= 1
187 | except EOFError:
188 | # when connection is closed in worker process EOFError is
189 | # triggered, so remove that connection
190 | self._conns.remove(conn)
191 | else:
192 | yield r_val
193 |
--------------------------------------------------------------------------------
/docs/extras_validate.rst:
--------------------------------------------------------------------------------
1 | *****************************
2 | ``megalodon_extras validate``
3 | *****************************
4 |
5 | The ``megalodon_extras validate`` command group contains commands to validate mapping and modified base outputs from Megalodon.
6 | Note that scripts to validate sequence variants are not provided here.
7 | Other tools including `vcfeval `_ and `hap.py `_ are recommended for validation of sequence variant results.
8 |
9 | -------------------------------------
10 | ``megalodon_extras validate results``
11 | -------------------------------------
12 |
13 | Validate per-read mapping and modified base results.
14 |
15 | This command produces text and graphical summaries of mapping and modified base performance.
16 |
17 | Mapping results include distributional statistics for each sample provided (output determined by ``--out-filename``; default ``stdout``), as well as a plot showing the distribution of mapping accuracy for each sample (see ``--out-pdf``).
18 |
19 | ----
20 |
21 | .. figure:: _images/mapping_validate_results.png
22 | :align: center
23 | :width: 600
24 |
25 | Example ``validate results`` per-read mapping plot.
26 |
27 | ----
28 |
29 | Per-read modified base results require a per-read ground truth for modified and canonical bases.
30 | This can be provided by either 1) supplying a control sample via the ``--control-megalodon-results-dirs`` argument (assumes all modified base calls at ``--valid-sites`` in main Megalodon results are modified) or 2) providing a ground truth set of sites containing modified and canonical bases via the ``--ground-truth-data`` argument.
31 | See the ``megalodon_extras modified_bases create_ground_truth`` command for help generating a ground truth file.
32 |
33 | Per-read modified base results are analyzed to produce several metrics including the optimal `F1-score `_, `mean average precision `_ and `ROC AUC `_ among others.
34 | By default, modified and canonical ground truth sites are filtered so that each class contributes the same number of per-read statistics to these computations.
35 | It is highly recommended that this not be changed (via ``--allow-unbalance-classes``), as class imbalance can have a large effect on the statistics, thus affecting their comparison between runs and/or models.
36 | Below are example graphical representations produced for per-read modified base validation.
37 |
38 | ----
39 |
40 | .. figure:: _images/mod_pr_validate_results.png
41 | :align: center
42 | :width: 600
43 |
44 | Example ``validate results`` per-read modified base precision-recall curve plot.
45 |
46 | .. figure:: _images/mod_roc_validate_results.png
47 | :align: center
48 | :width: 600
49 |
50 | Example ``validate results`` per-read modified base ROC curve plot.
51 |
52 | .. figure:: _images/mod_dist_validate_results.png
53 | :align: center
54 | :width: 600
55 |
56 | Example ``validate results`` per-read modified base score distribution plot.
57 |
58 | ----
59 |
60 | -------------------------------------------------------
61 | ``megalodon_extras validate aggregated_modified_bases``
62 | -------------------------------------------------------
63 |
64 | Compute validation metrics and visualizations from aggregated modified base calls.
65 |
66 | Similar to the ``megalodon_extras validate results`` command, modified base results are compared to a ground truth provided either by 1) a control sample or 2) a ground truth positions CSV file.
67 | A set of metrics is also reported and stored as described by the ``--out-filename`` argument (default ``stdout``).
68 | These metrics include the optimal F1-score, mean average precision and ROC AUC.
69 | This command outputs several visualizations similar to the per-read modified base validation including modified base percent distributions as well as precision-recall and ROC curves.
70 |
71 | ----
72 |
73 | .. figure:: _images/mod_agg_dist_results.png
74 | :align: center
75 | :width: 600
76 |
77 | Example ``validate aggregated_modified_bases`` modified base percentage distribution plot.
78 |
79 | ----
80 |
81 | ----------------------------------------------------
82 | ``megalodon_extras validate compare_modified_bases``
83 | ----------------------------------------------------
84 |
85 | Compare two sets of bedmethyl files and report a standard set of metrics and visualizations.
86 |
87 | The two sets or individual bedmethyl files provided will be compared at all overlapping sites with sufficient coverage (defined by ``--coverage-threshold``; default all sites).
88 | To aggregate forward and reverse strand methylation calls set the ``--strand-offset`` argument.
89 | For example to aggregate CpG calls add the ``--strand-offset 1`` argument to the command.
90 |
91 | The first metrics reported concern the coverage over the two samples before and after the overlap and coverage filters have been applied.
92 | Overlapping percent modified values are then compared to produce the correlation coefficient, R^2 and RMSE (for the model y=x).
93 | The correlation coefficient has previously been reported as the standard metric for modified base detection performance, but the RMSE is recommended for purposes of model selection or general modified base detection performance.
94 | This is due to potential modified base model issues resulting in low accuracy, but high precision, which can result in high correlation.
95 | Specifically, some models have a tendency to call some low portion of ground truth modified sites as canonical, likely due to training set imbalance.
96 |
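For reference, these fit statistics can be computed as in the following sketch (a hypothetical helper assuming arrays of overlapping per-site percent modified values, not the command's code):

::

    import numpy as np

    def fit_stats(pct_a, pct_b):
        corr = np.corrcoef(pct_a, pct_b)[0, 1]
        # residuals are taken from the model y=x rather than a fitted line
        resid = pct_b - pct_a
        rmse = np.sqrt(np.mean(resid ** 2))
        r2 = 1 - np.sum(resid ** 2) / np.sum((pct_b - np.mean(pct_b)) ** 2)
        return corr, r2, rmse
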
97 | This command also produces a standard set of visualizations for the comparison of these aggregated results.
98 | Shown below are plots comparing the percent modified bases between nanopore and ENCODE bisulfite runs (with log and linear scale shading) as well as a comparison of the coverage for the two samples.
99 |
100 | ----
101 |
102 | .. figure:: _images/mod_agg_comp_log.png
103 | :align: center
104 | :width: 600
105 |
106 | Example ``validate compare_modified_bases`` percent modified comparison with log-10 scaled shading.
107 |
108 | .. figure:: _images/mod_agg_comp_linear.png
109 | :align: center
110 | :width: 600
111 |
112 | Example ``validate compare_modified_bases`` percent modified comparison with clipped linear scaled shading.
113 |
114 | .. figure:: _images/mod_agg_comp_cov.png
115 | :align: center
116 | :width: 600
117 |
118 | Example ``validate compare_modified_bases`` modified base sample read coverage comparison.
119 |
120 | ----
121 |
--------------------------------------------------------------------------------
/megalodon/banding.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from megalodon import megalodon_helper as mh, logging
4 |
5 | LOGGER = logging.get_logger()
6 |
7 |
8 | def compute_sig_band(bps, levels, bhw=mh.DEFAULT_CONSTRAINED_HALF_BW):
9 | """Compute band over which to explore possible paths. Band is represented
10 | in sequence/level coordinates at each signal position.
11 |
12 | Args:
13 | bps (np.ndarray): Integer array containing breakpoints
14 | levels (np.ndarray): float array containing expected signal levels. May
15 | contain np.NAN values. Band will be constructed to maintain path
16 | through NAN regions.
17 | bhw (int): Band half width. If None, full matrix is used.
18 |
19 | Returns:
20 | int32 np.ndarray with shape (2, sig_len = bps[-1] - bps[0]). The first
21 | row contains the lower band boundaries in sequence coordinates and the
22 | second row contains the upper boundaries in sequence coordinates.
23 | """
24 | seq_len = levels.shape[0]
25 | if bps.shape[0] - 1 != seq_len:
26 | raise mh.MegaError("Breakpoints must be one longer than levels.")
27 | sig_len = bps[-1] - bps[0]
28 | seq_indices = np.repeat(np.arange(seq_len), np.diff(bps))
29 |
30 | # Calculate bands
31 | # The 1st row consists of the start indices (inc) and the 2nd row
32 | # consists of the end indices (exc) of the valid rows for each col.
33 | band = np.empty((2, sig_len), dtype=np.int32)
34 | if bhw is None:
35 | # specify entire input matrix
36 | band[0, :] = 0
37 | band[1, :] = seq_len
38 | else:
39 | # use specific band defined by bhw
40 | band[0, :] = np.maximum(seq_indices - bhw, 0)
41 | band[1, :] = np.minimum(seq_indices + bhw + 1, seq_len)
42 |
43 | # Modify bands based on invalid levels
44 | nan_mask = np.isin(seq_indices, np.nonzero(np.isnan(levels)))
45 | nan_sig_indices = np.where(nan_mask)[0]
46 | nan_seq_indices = seq_indices[nan_mask]
47 | band[0, nan_sig_indices] = nan_seq_indices
48 | band[1, nan_sig_indices] = nan_seq_indices + 1
49 | # Modify bands close to invalid levels so monotonically increasing
50 | band[0, :] = np.maximum.accumulate(band[0, :])
51 | band[1, :] = np.minimum.accumulate(band[1, ::-1])[::-1]
52 |
53 | # expand band around large deletions to ensure valid paths
54 | invalid_indices = np.where(band[0, 1:] >= band[1, :-1])[0]
55 | while invalid_indices.shape[0] > 0:
56 | band[0, invalid_indices + 1] = np.maximum(
57 | band[0, invalid_indices + 1] - 1, 0
58 | )
59 | band[1, invalid_indices] = np.minimum(
60 | band[1, invalid_indices] + 1, seq_len
61 | )
62 | invalid_indices = np.where(band[0, 1:] >= band[1, :-1])[0]
63 |
64 | return band
65 |
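# Usage sketch (assumed toy values for illustration): two levels with
# breakpoints at signal positions 0, 3 and 6 and a band half-width of 1.
#
#     bps = np.array([0, 3, 6])
#     levels = np.array([0.5, -0.5])
#     band = compute_sig_band(bps, levels, bhw=1)
#     # band has shape (2, 6); signal position i may align to sequence
#     # positions in the half-open range [band[0, i], band[1, i])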
66 |
67 | def convert_to_seq_band(sig_band):
68 |     """Convert a band with sig_len entries containing lower and upper band
69 |     boundaries in sequence coordinates to a band with seq_len entries
70 |     containing lower and upper band boundaries in signal coordinates.
71 |
72 | Args:
73 | sig_band (np.array): int32 array with shape (2, sig_len). The first row
74 | contains the lower band boundaries in sequence coordinates and the
75 | second row contains the upper boundaries in sequence coordinates.
76 |
77 | Returns:
78 | int32 np.ndarray with shape (2, seq_len = sig_band[1, -1]). The first
79 | row contains the lower band boundaries in signal coordinates and the
80 | second row contains the upper boundaries in signal coordinates.
81 | """
82 | sig_len = sig_band.shape[1]
83 | seq_len = sig_band[1, -1]
84 | seq_band = np.zeros((2, seq_len), dtype=np.int32)
85 | seq_band[1, :] = sig_len
86 |
87 | # upper signal coordinates define lower sequence boundaries
88 | lower_sig_pos = np.nonzero(np.ediff1d(sig_band[1, :], to_begin=0))[0]
89 | lower_base_pos = sig_band[1, lower_sig_pos - 1]
90 | seq_band[0, lower_base_pos] = lower_sig_pos
91 | seq_band[0, :] = np.maximum.accumulate(seq_band[0, :])
92 |
93 | upper_sig_pos = np.nonzero(np.ediff1d(sig_band[0, :], to_begin=0))[0]
94 | upper_base_pos = sig_band[0, upper_sig_pos]
95 | seq_band[1, upper_base_pos - 1] = upper_sig_pos
96 | seq_band[1, :] = np.minimum.accumulate(seq_band[1, ::-1])[::-1]
97 |
98 | return seq_band
99 |
100 |
101 | def validate_band(band, sig_len=None, seq_len=None, is_sig_band=True):
102 | """Validate that band is valid and agrees with input data.
103 |
104 | Args:
105 | band (np.array): int32 array with shape (2, sig_len or seq_len). The
106 | first row contains the lower band boundaries and the second row
107 | contains the upper boundaries.
108 | sig_len (int): Length of signal associated with band
109 | seq_len (int): Length of sequence/levels associated with band
110 | is_sig_band (bool): Does the provided band specify sequence/level
111 | positions for each signal position? If not it is assumed that the
112 | band contains signal positions for each sequence/level position.
113 |
114 | Raises:
115 | MegaError if any portion of the band is determined to be invalid.
116 | """
117 | # first coordinate 0, last coordinate signal length
118 | if band[0, 0] != 0:
119 | raise mh.MegaError("Band does not start with 0 coordinate.")
120 |
121 | # ends all greater than starts
122 | if np.diff(band, axis=0)[0].min() <= 0:
123 | raise mh.MegaError("Band contains 0-length region")
124 |     # monotonic start and end positions
125 | if np.diff(band[0]).min() < 0:
126 | raise mh.MegaError(
127 | "Band start positions are not monotonically increasing"
128 | )
129 | if np.diff(band[1]).min() < 0:
130 | raise mh.MegaError(
131 | "Band end positions are not monotonically increasing"
132 | )
133 |
134 | # if provided check that start and end coordinates agree with signal and
135 | # levels.
136 | if is_sig_band:
137 | if sig_len is not None and band.shape[1] != sig_len:
138 | LOGGER.debug(f"Invalid sig_band length: {band.shape[1]} {sig_len}")
139 | raise mh.MegaError("Invalid sig_band length")
140 | if seq_len is not None and band[1, -1] != seq_len:
141 | LOGGER.debug(
142 | f"Invalid sig_band end coordinate: {band[1, -1]} {seq_len}"
143 | )
144 | raise mh.MegaError("Invalid sig_band end coordinate")
145 | else:
146 | if sig_len is not None and band[1, -1] != sig_len:
147 | LOGGER.debug(
148 | f"Invalid seq_band end coordinate: {band[1, -1]} {sig_len}"
149 | )
150 | raise mh.MegaError("Invalid seq_band end coordinate")
151 | if seq_len is not None and band.shape[1] != seq_len:
152 |             LOGGER.debug(f"Invalid seq_band length: {band.shape[1]} {seq_len}")
153 |             raise mh.MegaError("Invalid seq_band length")
154 |
--------------------------------------------------------------------------------
/megalodon/signal_mapping.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import queue
3 | import traceback
4 | from collections import namedtuple
5 |
6 | import numpy as np
7 |
8 | from megalodon import fast5_io, megalodon_helper as mh, logging
9 |
10 | try:
11 | import taiyaki
12 |
13 | # appease flake8
14 | taiyaki
15 | except ImportError:
16 | raise mh.MegaError(
17 | "Taiyaki installation required for signal mapping not found."
18 | )
19 | try:
20 | from taiyaki import (
21 | alphabet,
22 | fast5utils,
23 | signal_mapping as tai_mapping,
24 | prepare_mapping_funcs,
25 | signal as tai_signal,
26 | )
27 | except ImportError:
28 | raise mh.MegaError(
29 | "Taiyaki modules could not be loaded. Signal mappings require "
30 | + 'Taiyaki version >= 5.2. Full error:\n"""\n{}\n"""'.format(
31 | traceback.format_exc()
32 | )
33 | )
34 |
35 |
36 | LOGGER = logging.get_logger()
37 | SIG_MAP_RESULT = namedtuple(
38 | "SIG_MAP_RESULT",
39 | (
40 | "pass_filts",
41 | "fast5_fn",
42 | "dacs",
43 | "scale_params",
44 | "ref_seq",
45 | "stride",
46 | "read_id",
47 | "r_to_q_poss",
48 | "rl_cumsum",
49 | "ref_pos",
50 | "ref_out_info",
51 | ),
52 | )
53 |
54 |
55 | def set_all_motif_mods(int_ref, ref_mods_all_motifs):
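    # for each motif, locate every occurrence in the integer-encoded
    # reference via a rolling window comparison, then record the modified
    # base code at the motif's relative modified position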
56 | ref_mod_pos, ref_mods = [], []
57 | for mod_base, int_mod_base, mln, int_motif, rel_pos in ref_mods_all_motifs:
58 | for pos in np.where(
59 | np.all(
60 | mh.rolling_window(int_ref, int_motif.shape[0]) == int_motif,
61 | axis=1,
62 | )
63 | )[0]:
64 | ref_mod_pos.append(pos + rel_pos)
65 | ref_mods.append(int_mod_base)
66 | if len(ref_mod_pos) > 0:
67 | int_ref[ref_mod_pos] = ref_mods
68 | return int_ref
69 |
70 |
71 | def get_remapping(
72 | sig_fn,
73 | dacs,
74 | scale_params,
75 | ref_seq,
76 | stride,
77 | read_id,
78 | r_to_q_poss,
79 | rl_cumsum,
80 | r_ref_pos,
81 | ref_out_info,
82 | ):
83 | read = fast5_io.get_read(sig_fn, read_id)
84 | channel_info = dict(fast5utils.get_channel_info(read).items())
85 | read_params = {
86 | "trim_start": 0,
87 | "trim_end": 0,
88 | "shift": scale_params[0],
89 | "scale": scale_params[1],
90 | }
91 | sig = tai_signal.Signal(
92 | dacs=dacs,
93 | channel_info=channel_info,
94 | read_id=read_id,
95 | read_params=read_params,
96 | )
97 |
98 | ref_to_sig = np.empty(len(ref_seq) + 1, dtype=np.int32)
99 | # skip last value since this is where the two seqs end
100 | for ref_pos, q_pos in enumerate(r_to_q_poss):
101 | ref_to_sig[ref_pos] = rl_cumsum[q_pos + r_ref_pos.q_trim_start] * stride
102 | try:
103 | int_ref = tai_mapping.SignalMapping.get_integer_reference(
104 | ref_seq, ref_out_info.alphabet_info.alphabet
105 | )
106 | except Exception:
107 | raise mh.MegaError("Invalid reference sequence encountered")
108 | sig_mapping = tai_mapping.SignalMapping(ref_to_sig, int_ref, signalObj=sig)
109 |
110 | # annotate mod motifs
111 | if ref_out_info.ref_mods_all_motifs is not None:
112 | # annotate all mod base motif positions with alts
113 | int_ref = set_all_motif_mods(int_ref, ref_out_info.ref_mods_all_motifs)
114 | # set new Reference with mods annotated
115 | sig_mapping.Reference = int_ref
116 |
117 | return (
118 | sig_mapping.get_read_dictionary(),
119 | prepare_mapping_funcs.RemapResult.SUCCESS,
120 | )
121 |
122 |
123 | def get_alphabet_info_from_model(model_info):
124 | flat_alphabet = model_info.output_alphabet[0]
125 | can_base = model_info.output_alphabet[0]
126 | for base in model_info.output_alphabet[1:]:
127 | if base in model_info.can_alphabet:
128 | can_base = base
129 | flat_alphabet += can_base
130 | mod_long_names = (
131 | []
132 | if len(model_info.mod_long_names) == 0
133 | else list(zip(*model_info.mod_long_names))[1]
134 | )
135 | return alphabet.AlphabetInfo(
136 | model_info.output_alphabet,
137 | flat_alphabet,
138 | mod_long_names,
139 | do_reorder=True,
140 | )
141 |
142 |
143 | def get_alphabet_info(output_alphabet, collapse_alphabet, mod_long_names):
144 | return alphabet.AlphabetInfo(
145 | output_alphabet, collapse_alphabet, mod_long_names, do_reorder=True
146 | )
147 |
148 |
149 | def write_signal_mappings(sig_map_q, sig_map_conn, ref_out_info, aux_failed_q):
150 | def apply_sig_map_offset(read_mapping):
151 | """Apply signal mapping shift to center coarse mappings to a registered
152 | signal based mapping.
153 | """
154 | if (
155 | ref_out_info.sig_map_offset is not None
156 | and ref_out_info.sig_map_offset != 0
157 | ):
158 | if ref_out_info.sig_map_offset > 0:
159 | # clip beginning of signal mapping and end of reference to
160 | # shift signal assignments to the left
161 | read_mapping[0]["Ref_to_signal"] = read_mapping[0][
162 | "Ref_to_signal"
163 | ][ref_out_info.sig_map_offset :]
164 | read_mapping[0]["Reference"] = read_mapping[0]["Reference"][
165 | : -ref_out_info.sig_map_offset
166 | ]
167 | else:
168 | # clip end of signal mapping and beginning of reference to
169 | # shift signal assignments to the right
170 | read_mapping[0]["Ref_to_signal"] = read_mapping[0][
171 | "Ref_to_signal"
172 | ][: ref_out_info.sig_map_offset]
173 | read_mapping[0]["Reference"] = read_mapping[0]["Reference"][
174 | -ref_out_info.sig_map_offset :
175 | ]
176 | return read_mapping
177 |
178 | def iter_mappings():
179 | workers_active = True
180 | LOGGER.debug("GetterInitComplete")
181 | while workers_active or not sig_map_q.empty():
182 | try:
183 | read_mapping = sig_map_q.get(timeout=0.1)
184 | yield apply_sig_map_offset(read_mapping)
185 | except queue.Empty:
186 | if sig_map_conn.poll():
187 | workers_active = False
188 |
189 | try:
190 | LOGGER.debug("GetterStarting")
191 | prepare_mapping_funcs.generate_output_from_results(
192 | iter_mappings(),
193 | mh.get_megalodon_fn(ref_out_info.out_dir, mh.SIG_MAP_NAME),
194 | ref_out_info.alphabet_info,
195 | verbose=False,
196 | )
197 | LOGGER.debug("GetterClosing")
198 | except Exception as e:
199 | aux_failed_q.put(
200 | ("SigMapProcessingError", str(e), traceback.format_exc())
201 | )
202 |
203 |
204 | if __name__ == "__main__":
205 | sys.stderr.write("This is a module. See commands with `megalodon -h`")
206 | sys.exit(1)
207 |
--------------------------------------------------------------------------------
/docs/algorithm_details.rst:
--------------------------------------------------------------------------------
1 | ***************************
2 | Megalodon Algorithm Details
3 | ***************************
4 |
5 | This page describes the details of how megalodon processes the raw nanopore signal to produce highly-accurate modified base and sequence variant calls.
6 |
7 | ------------
8 | Base Calling
9 | ------------
10 |
11 | Basecalling is performed exactly as in Guppy.
12 | Raw nanopore signal is normalized, chunked, processed with a recurrent neural network and decoded using Viterbi decoding.
13 | Currently Megalodon is only compatible with flip-flop basecalling networks (excluding RLE and Bonito models).
14 | See `guppy documentation on the community page (login required) `_ for more details.
15 |
16 | -------------------
17 | Reference Anchoring
18 | -------------------
19 |
20 | Megalodon's functionality centers on the anchoring of the high-information neural network basecalling output to a reference sequence.
21 | Given anchored neural network output, alternatives to the reference (either modified bases or canonical bases) are proposed and scored to produce the highest accuracy results.
22 |
23 | The neural network output is anchored to the reference via standard read mapping of produced basecalls to the reference sequence (maintaining the link to the neural network outputs).
24 | If no reference mapping is produced (using ``minimap2`` via the ``mappy`` python interface), that read is not processed further (basecalls will be output if requested).
25 | This standard read mapping is processed to produce a matching of each basecall with a reference position.
26 | Reference positions within an insertion or deletion are assigned to the previous mapped read position (left justified; this behavior may change in future versions).
27 | This constitutes the reference anchoring used for modified base and sequence variant calling steps.
28 |
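For illustration, the following minimal sketch (not Megalodon's internal implementation) derives such an anchoring from a ``mappy`` alignment, assigning basecall positions inside insertions to the previous aligned reference position (left justified). The reference file name and preset are placeholders.

::

    import mappy

    # placeholder reference; minimap2 accepts FASTA or prebuilt index files
    aligner = mappy.Aligner("reference.fa", preset="map-ont")

    def basecall_to_ref_anchoring(basecalls):
        hit = next(aligner.map(basecalls), None)
        if hit is None:
            return None  # unmapped reads are not processed further
        q_to_r = {}
        q_pos, r_pos = hit.q_st, hit.r_st
        for op_len, op in hit.cigar:
            if op == 0:  # match/mismatch: one-to-one anchoring
                for i in range(op_len):
                    q_to_r[q_pos + i] = r_pos + i
                q_pos += op_len
                r_pos += op_len
            elif op == 1:  # insertion: anchor to the previous mapped position
                for i in range(op_len):
                    q_to_r[q_pos + i] = r_pos - 1
                q_pos += op_len
            elif op in (2, 3):  # deletion/skip: advance the reference only
                r_pos += op_len
        return hit.ctg, q_to_r
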
29 | ------------------------
30 | Sequence Variant Calling
31 | ------------------------
32 |
33 | Megalodon currently filters alleles over a certain maximum size (default ``50``), as performance on larger indels has not yet been validated.
34 | Note also that variants are converted into an "atomic" form (containing the minimal unique variant sequence for indels).
35 | Thus atomic variants do not carry extra context sequence, but are expanded to include regions of ambiguity (e.g. an indel within a repetitive region).
36 |
37 | At each valid variant a region of context sequence around the variant is extracted.
38 | The context sequence allows the scoring algorithm to traverse slightly different paths through the local neural network output.
39 | The width of this sequence of interest is defined by the ``--variant-context-bases`` argument (specified individually for single base and insertion/deletion variants; defaults ``10`` and ``30`` respectively).
40 |
41 | Next, the neural network output corresponding to the reference sequence of interest is extracted.
42 | The fuzzy reference anchoring described above identifies the range of the neural network output containing the sequence of interest.
43 |
44 | The sequence scoring function performs the forward-backward algorithm and Viterbi decoding over the neural network output to produce a score for the reference and proposed alternative sequence.
45 | The difference between these two scores is the assigned score for the proposed variant.
46 | Lower (negative) scores are evidence for the alternative sequence and higher (positive) scores are evidence for the reference sequence.
47 |
48 | These raw scores are softmax values over potential states, so they share the characteristics of a probability distribution.
49 | In practice, however, these scores do not match the empirical probabilities for a variant given a truth data set.
50 | Thus a calibration step is applied to convert these scores into estimated empirical probabilities.
51 | This enables more accurate aggregation across reads.
52 |
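A minimal sketch of applying such a calibration, assuming a precomputed monotone table mapping raw scores to calibrated log-likelihood ratios (the table values below are illustrative placeholders, not from a released calibration file):

::

    import numpy as np

    # illustrative calibration table: raw score -> calibrated LLR (monotone)
    raw_grid = np.linspace(-10, 10, 2001)
    calib_llr = np.tanh(raw_grid / 4) * 8  # placeholder monotone curve

    def calibrate_scores(raw_scores):
        # piecewise-linear interpolation into the calibration table
        return np.interp(raw_scores, raw_grid, calib_llr)
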
53 | As of version 1.0.0, Megalodon performs a second round of variant detection taking nearby variants into account.
54 | Variants from the first round (considering each variant in isolation) are filtered by a minimal probability of evidence for the variant allele (default ``0.05``; set with the ``--context-min-alt-prob`` argument).
55 | In the second pass, variants within a set region are considered when estimating the probability of a particular variant (up to a set maximum number of context variants in order to reduce compute).
56 | Scores for each potential context are combined statistically (using logsumexp) and these are the final scores reported for each variant.
57 | This process reduces the number of false positives where a true variant is adjacent to another proposed variant.
58 |
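The statistical combination over contexts can be sketched with ``scipy``'s ``logsumexp``; the exact weighting used by Megalodon is omitted here:

::

    import numpy as np
    from scipy.special import logsumexp

    def combine_context_scores(context_log_probs):
        """Combine log probabilities for one variant over all tested contexts."""
        context_log_probs = np.asarray(context_log_probs)
        # log of the average probability across the proposed contexts
        return logsumexp(context_log_probs) - np.log(context_log_probs.size)
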
59 | Finally, calls across reads at each reference location are aggregated in order to make a sample-level call.
60 | These results are output in VCF format.
61 |
62 | Currently ``diploid`` (default) and ``haploid`` variant aggregation modes are available.
63 | In ``haploid`` mode the probability of the reference and alternative alleles are simply the normalized (via Bayes' theorem) product of the individual read probabilities.
64 | In ``diploid`` mode the probability of each genotype (homozygous reference, heterozygous and homozygous alternative) are computed.
65 | The probabilities for homozygous alleles are as in the ``haploid`` mode, while the heterozygous probability is given by the weighted sum of the maximal probabilities taken over the sampling distribution (binomial with ``p=0.5``) given a true diploid heterozygous allele.
66 | These probabilities are then normalized via Bayes' theorem.
67 |
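As a simplified illustration of these aggregation modes (using a flat genotype prior and the standard 50/50 heterozygous mixture rather than Megalodon's exact weighting scheme):

::

    import numpy as np

    def aggregate_diploid(p_alt_per_read):
        """Genotype probabilities from per-read alternative allele probabilities."""
        p_alt = np.asarray(p_alt_per_read)
        p_ref = 1 - p_alt
        # log-space products of per-read probabilities for each genotype
        ll_hom_ref = np.sum(np.log(p_ref))
        ll_hom_alt = np.sum(np.log(p_alt))
        # heterozygous: each read drawn from either allele with p=0.5
        ll_het = np.sum(np.log(0.5 * p_ref + 0.5 * p_alt))
        lls = np.array([ll_hom_ref, ll_het, ll_hom_alt])
        # normalize via Bayes' theorem to obtain genotype probabilities
        probs = np.exp(lls - lls.max())
        return probs / probs.sum()
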
68 | ---------------------
69 | Modified Base Calling
70 | ---------------------
71 |
72 | Modified base calling is performed largely in the same manner as variant calling above in terms of sequence and associated neural network output extraction.
73 | The main difference is that instead of proposing alternative canonical bases in the sequence, a modified base is proposed.
74 | This means that in order to identify a particular modification the model must be aware of this modification.
75 | Training models for particular modifications of interest is described in `Megalodon documentation here `_.
76 |
77 | Use the ``--mod-motif`` argument in order to restrict tested locations to certain relevant motifs (e.g. ``--mod-motif m CG 0`` to test only in CpG locations).
78 | Per-read modified base calls can be output in either a text table format or into a BAM file.
79 | There are two options to output per-read modified base calls into the BAM format.
80 | The default option when ``--outputs mod_mappings`` is specified is the `hts-spec proposed format `_.
81 | The second option emulates bisulfite sequencing (since this provides visualization options in some genome browsers).
82 | Specify the ``--mod-map-emulate-bisulfite`` option to select this output.
83 | See the ``--mod-map-base-conv`` option (``megalodon --help-long``) for further specification of this output.
84 |
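For example, a run restricted to CpG sites that produces reference-anchored modified base mappings might look like the following (input and reference file names are placeholders):

::

    megalodon raw_fast5s/ \
        --outputs mappings mod_mappings \
        --reference reference.fa \
        --mod-motif m CG 0 \
        --processes 20 --devices 0
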
85 | Modified bases can also be output anchored to the basecalls as in Guppy, but these calls are generally not as accurate as the reference-anchored calls.
86 | These ``mod_basecalls`` are output in the BAM ``Mm`` and ``Ml`` tags as specified by the hts-specs proposed format.
87 |
--------------------------------------------------------------------------------
/megalodon/validation.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import numpy as np
4 | import seaborn as sns
5 | import matplotlib
6 |
7 | if True:  # guard keeps import sorters from hoisting pyplot above use("Agg")
8 |     # Agg appears to be the most robust backend when only saving plots.
9 | matplotlib.use("Agg")
10 | import matplotlib.pyplot as plt
11 | from sklearn.metrics import (
12 | roc_curve,
13 | auc,
14 | precision_recall_curve,
15 | average_precision_score,
16 | )
17 |
18 | from megalodon import logging, megalodon_helper as mh
19 |
20 | LOGGER = logging.get_logger()
21 |
22 | # MOD_BANDWIDTH2 supports seaborn<0.11, which predates the bw_adjust argument
23 | MOD_BANDWIDTH = 0.9
24 | MOD_BANDWIDTH2 = 0.2
25 | GRIDSIZE = 1000
26 |
27 | MOD_VAL_METRICS_HEADER = (
28 | "{: <12}{: <19}{: <20}{: <9}{: <20}{: <19}{: <10}{} {}\n".format(
29 | "Optimal_F1",
30 | "Optimal_Threshold",
31 | "Mean_Avg_Precision",
32 | "ROC_AUC",
33 | "Num_Modified_Stats",
34 | "Num_Control_Stats",
35 | "Mod_Base",
36 | "Sample_Label",
37 | "Valid_Sites_Label",
38 | )
39 | )
40 | MOD_VAL_METRICS_TMPLT = (
41 | "{: <12.6f}{: <19.4f}{: <20.6f}{: <9.6f}{: <20d}{: <19d}{: <10}{} {}\n"
42 | )
43 |
44 |
45 | def plot_pr(pdf_fp, pr_data):
46 | for mod_base, mod_pr_data in pr_data.items():
47 | LOGGER.info("Plotting {} precision-recall curves".format(mod_base))
48 | plt.figure(figsize=(8, 7))
49 | for lab, prec, recall in mod_pr_data:
50 | plt.step(recall, prec, label=lab, where="post")
51 | plt.ylim([-0.05, 1.05])
52 | plt.xlim([-0.05, 1.05])
53 | plt.xlabel("Recall")
54 | plt.ylabel("Precision")
55 | plt.title(
56 | ('Modified Base "{}" Precision-Recall Curves').format(mod_base)
57 | )
58 | plt.legend()
59 | pdf_fp.savefig(bbox_inches="tight")
60 | plt.close()
61 |
62 |
63 | def plot_roc(pdf_fp, roc_data):
64 | for mod_base, mod_roc_data in roc_data.items():
65 | LOGGER.info("Plotting {} ROC curves".format(mod_base))
66 | plt.figure(figsize=(8, 7))
67 | for lab, fpr, tpr in mod_roc_data:
68 | plt.step(fpr, tpr, label=lab)
69 | plt.ylim([-0.05, 1.05])
70 | plt.xlim([-0.05, 1.05])
71 | plt.xlabel("False Positive Rate")
72 | plt.ylabel("True Positive Rate")
73 | plt.title(('Modified Base "{}" ROC Curves').format(mod_base))
74 | plt.legend()
75 | pdf_fp.savefig(bbox_inches="tight")
76 | plt.close()
77 |
78 |
79 | def plot_kde(pdf_fp, kde_data):
80 | for samp_lab, mod_stats, ctrl_stats in kde_data:
81 | LOGGER.info(
82 | "Plotting {} modified base statistics densities".format(samp_lab)
83 | )
84 | plt.figure(figsize=(8, 5))
85 | try:
86 | sns.kdeplot(
87 | mod_stats,
88 | shade=True,
89 | bw_adjust=MOD_BANDWIDTH,
90 | gridsize=GRIDSIZE,
91 | label="Yes",
92 | )
93 | sns.kdeplot(
94 | ctrl_stats,
95 | shade=True,
96 | bw_adjust=MOD_BANDWIDTH,
97 | gridsize=GRIDSIZE,
98 | label="No",
99 | )
100 | except AttributeError:
101 | sns.kdeplot(
102 | mod_stats,
103 | shade=True,
104 | bw=MOD_BANDWIDTH2,
105 | gridsize=GRIDSIZE,
106 | label="Yes",
107 | )
108 | sns.kdeplot(
109 | ctrl_stats,
110 | shade=True,
111 | bw=MOD_BANDWIDTH2,
112 | gridsize=GRIDSIZE,
113 | label="No",
114 | )
115 | plt.legend(prop={"size": 16}, title="Is Modified")
116 | plt.xlabel(
117 | "Log Likelihood Ratio\nMore Likely Modified <--> "
118 | + "More Likely Canonical"
119 | )
120 | plt.ylabel("Density")
121 | plt.title(samp_lab)
122 | pdf_fp.savefig(bbox_inches="tight")
123 | plt.close()
124 |
125 |
126 | def compute_mod_sites_stats(
127 | mod_stats, ctrl_stats, balance_classes, mod_base, samp_lab, vs_lab, out_fp
128 | ):
129 | if balance_classes:
130 | # randomly downsample sample with more observations
131 | if mod_stats.shape[0] > ctrl_stats.shape[0]:
132 | mod_stats = np.random.choice(
133 | mod_stats, ctrl_stats.shape[0], replace=False
134 | )
135 | elif mod_stats.shape[0] < ctrl_stats.shape[0]:
136 | ctrl_stats = np.random.choice(
137 | ctrl_stats, mod_stats.shape[0], replace=False
138 | )
139 |
140 | is_can = np.repeat([0, 1], [mod_stats.shape[0], ctrl_stats.shape[0]])
141 | all_stats = np.concatenate([mod_stats, ctrl_stats])
142 |     nan_stats = np.isnan(all_stats)
143 |     if np.any(nan_stats):
144 |         LOGGER.warning(
145 |             "Encountered {} NaN modified base scores.".format(nan_stats.sum())
146 |         )
147 |         # drop NaN scores and their labels together so arrays stay aligned
148 |         is_can, all_stats = is_can[~nan_stats], all_stats[~nan_stats]
149 | if all_stats.shape[0] == 0:
150 | raise mh.MegaError("All modified base scores are NaN")
151 |     posinf_idx = np.isposinf(all_stats)
152 |     if np.any(posinf_idx):
153 |         all_stats[posinf_idx] = np.max(all_stats[~posinf_idx])
154 |     neginf_idx = np.isneginf(all_stats)
155 |     if np.any(neginf_idx):
156 |         all_stats[neginf_idx] = np.min(all_stats[~neginf_idx])
157 | LOGGER.info(
158 | "Computing PR/ROC for {} from {} at {}".format(
159 | mod_base, samp_lab, vs_lab
160 | )
161 | )
162 |     # compute ROC and precision-recall curves
163 | precision, recall, thresh = precision_recall_curve(is_can, all_stats)
164 | prec_recall_sum = precision + recall
165 | valid_idx = np.where(prec_recall_sum > 0)
166 | all_f1 = (
167 | 2
168 | * precision[valid_idx]
169 | * recall[valid_idx]
170 | / prec_recall_sum[valid_idx]
171 | )
172 | optim_f1_idx = np.argmax(all_f1)
173 | optim_f1 = all_f1[optim_f1_idx]
174 | optim_thresh = thresh[optim_f1_idx]
175 | avg_prcn = average_precision_score(is_can, all_stats)
176 |
177 | fpr, tpr, _ = roc_curve(is_can, all_stats)
178 | roc_auc = auc(fpr, tpr)
179 |
180 | out_fp.write(
181 | MOD_VAL_METRICS_TMPLT.format(
182 | optim_f1,
183 | optim_thresh,
184 | avg_prcn,
185 | roc_auc,
186 | mod_stats.shape[0],
187 | ctrl_stats.shape[0],
188 | mod_base,
189 | samp_lab,
190 | vs_lab,
191 | )
192 | )
193 | pr_data = (
194 | "{} at {} mAP={:0.2f}".format(samp_lab, vs_lab, avg_prcn),
195 | precision,
196 | recall,
197 | )
198 | roc_data = (
199 | "{} at {} AUC={:0.2f}".format(samp_lab, vs_lab, roc_auc),
200 | fpr,
201 | tpr,
202 | )
203 | kde_data = (
204 | "{} from {} at {}".format(mod_base, samp_lab, vs_lab),
205 | mod_stats,
206 | ctrl_stats,
207 | )
208 |
209 | return pr_data, roc_data, kde_data
210 |
211 |
212 | if __name__ == "__main__":
213 | sys.stderr.write("This is a module. See commands with `megalodon -h`")
214 | sys.exit(1)
215 |
--------------------------------------------------------------------------------
/megalodon_extras/__main__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import argparse
3 | from importlib import import_module
4 |
5 | from megalodon import __version__
6 | from megalodon_extras import _extras_parsers as ep
7 |
8 |
9 | NESTED_COMMANDS = [
10 | (
11 | ep.GRP_AGG,
12 | "Aggregate per-read modified base and/or variant statistics",
13 | [
14 | (ep.CMD_AGG_RUN, "Run aggregation"),
15 | ],
16 | ),
17 | (
18 | ep.GRP_CALIB,
19 | "Calibrate model results with ground truth modified base or variants",
20 | [
21 | (ep.CMD_CALIB_MODS, "Calibrate modified base statistics"),
22 | (ep.CMD_CALIB_VARS, "Calibrate sequence variant statistics"),
23 | (ep.CMD_CALIB_GEN_MODS, "Generate stats for mod calibration"),
24 | (
25 | ep.CMD_CALIB_GEN_MODS_MSF,
26 | "Generate stats for mod calibration from mapped signal file",
27 | ),
28 | (ep.CMD_CALIB_GEN_VARS, "Generate stats for variant calibration"),
29 | (ep.CMD_CALIB_MERGE_MODS, "Merge modified base calibration files"),
30 | (ep.CMD_CALIB_MERGE_MODS_STATS, "Merge mod calibration stat files"),
31 | ],
32 | ),
33 | (
34 | ep.GRP_MERGE,
35 | "Merge per-read databases or aggregated files",
36 | [
37 | (ep.CMD_MERGE_MODS, "Merge per-read modified base database"),
38 | (
39 | ep.CMD_MERGE_AGG_MODS,
40 | "Merge aggregated modified base bedmethyl files",
41 | ),
42 | (ep.CMD_MERGE_VARS, "Merge per-read sequence variants database"),
43 | ],
44 | ),
45 | (
46 | ep.GRP_MODS,
47 | "Miscellaneous modified base operations",
48 | [
49 |             (ep.CMD_MODS_ALPHABET, "Print the alphabet for a chosen model"),
50 | (
51 | ep.CMD_MODS_EST_THRESH,
52 | "Estimate optimal global modified base threshold for sequence markup",
53 | ),
54 | (
55 | ep.CMD_MODS_UPDATE_DB,
56 | "Update modified base database from older versions of Megalodon",
57 | ),
58 | (
59 | ep.CMD_MODS_GT,
60 | "Create ground truth modified base file from bedmethyl files",
61 | ),
62 | (ep.CMD_MODS_MOTIF, "Create BED file of motif sites"),
63 | (
64 | ep.CMD_MODS_PER_SITE,
65 | "Extract per-site modified base thresholds for signal mapping "
66 | + "sequence markup",
67 | ),
68 | (ep.CMD_MODS_INDEX, "Create per-read modified base database index"),
69 | (ep.CMD_MODS_SPLIT, "Split modified base database by motif"),
70 | ],
71 | ),
72 | (
73 | ep.GRP_PHASE,
74 | "Phase variants",
75 | [
76 | (
77 | ep.CMD_PHASE_FILT_WHATSHAP,
78 | "Filter variants not compatible with whatshap",
79 | ),
80 | (
81 | ep.CMD_PHASE_GET_HAP_READS,
82 | "Extract read ids from haplotypes determined by whatshap",
83 | ),
84 | (ep.CMD_PHASE_MERGE_HAP, "Merge variants from haploid calls"),
85 | ],
86 | ),
87 | (
88 | ep.GRP_TXT,
89 | "Output per-read text files",
90 | [
91 | (
92 | ep.CMD_TXT_MODS,
93 | "Output per-read modified base statistics text file",
94 | ),
95 | (
96 | ep.CMD_TXT_VARS,
97 | "Output per-read sequence variant statistics text file",
98 | ),
99 | ],
100 | ),
101 | (
102 | ep.GRP_VAL,
103 | "Validate per-read mapping and modified base results",
104 | [
105 | (
106 | ep.CMD_VAL_RES,
107 | "Validate per-read mappings and modified bases (if available)",
108 | ),
109 | (ep.CMD_VAL_AGG_MODS, "Validate aggregated modified bases results"),
110 | (
111 | ep.CMD_VAL_COMP_MODS,
112 |                 "Compare aggregated modified base results (bedMethyl files)",
113 | ),
114 | (
115 | ep.CMD_VAL_MODS_CALIB,
116 | "Validate per-read modified bases from calibration file",
117 | ),
118 | ],
119 | ),
120 | (
121 | ep.GRP_VARS,
122 | "Miscellaneous sequence variant operations",
123 | [
124 | (ep.CMD_VAR_ATOM, "Atomize variants for faster processing"),
125 | (ep.CMD_VAR_RESOLVE, "Resolve potentially conflicting variants"),
126 | (
127 | ep.CMD_VAR_HET_FACTOR,
128 | "Estimate optimal heterozygous factors for diploid variant calling",
129 | ),
130 | # TODO variant database API does not allow opening for writing once
131 | # database exists.
132 | (ep.CMD_VAR_INDEX, "***** Stub for future implementation *****"),
133 | ],
134 | ),
135 | ]
136 |
137 |
138 | class SubcommandHelpFormatter(argparse.RawDescriptionHelpFormatter):
139 | def _format_action(self, action):
140 | parts = super(SubcommandHelpFormatter, self)._format_action(action)
141 | if action.nargs == argparse.PARSER:
142 | parts = "\n".join(parts.split("\n")[1:])
143 | return parts
144 |
145 |
146 | def _main():
147 | """The main routine."""
148 | desc = (
149 | "Megalodon extras command groups (additional help available "
150 | + "within each command group):\n"
151 | + "\n".join(
152 | [
153 | "\t{0: <25}{1}".format(grp_name, grp_help)
154 | for grp_name, grp_help, _ in NESTED_COMMANDS
155 | ]
156 | )
157 | )
158 | parser = argparse.ArgumentParser(
159 | prog="megalodon_extras",
160 | description="********** Megalodon Extras *********\n\n"
161 | + "Commands to perform operations related to main Megalodon command "
162 | + "including aggregation, variant phasing, validation, and more.\n\n"
163 | + desc,
164 | formatter_class=SubcommandHelpFormatter,
165 | )
166 | parser.add_argument(
167 | "-v",
168 | "--version",
169 | action="version",
170 | version="Megalodon version: {}".format(__version__),
171 | help="Show Megalodon version and exit.",
172 | )
173 |
174 | # add megalodon_extras command groups
175 |     # service_command is a grouping of other commands
176 | # action_command is an executable command with detailed argument help
177 | service_subparsers = parser.add_subparsers(dest="service_command")
178 | for grp_name, grp_help, grp_sub_cmds in NESTED_COMMANDS:
179 | grp_desc = "\n".join(
180 | [
181 | "\t{0: <30}{1}".format(cmd_name, cmd_help)
182 | for cmd_name, cmd_help in grp_sub_cmds
183 | ]
184 | )
185 | grp_parser = service_subparsers.add_parser(
186 | grp_name,
187 | formatter_class=SubcommandHelpFormatter,
188 | description=grp_desc,
189 | )
190 | grp_subparser = grp_parser.add_subparsers(
191 | title=grp_name, dest="action_command"
192 | )
193 | for cmd_name, cmd_help in grp_sub_cmds:
194 | # add each action parser to this service parser group
195 | grp_subparser.add_parser(
196 | cmd_name,
197 | add_help=False,
198 | parents=[
199 | ep.PARSERS[grp_name][cmd_name](),
200 | ],
201 | )
202 |
203 | args = parser.parse_args()
204 | # if no service parser was provided print help and return
205 | if args.service_command is None:
206 | sys.stderr.write(
207 | "********** Please provide a megalodon_extras command group "
208 | + "for further help. **********\n"
209 | )
210 | parser.print_help()
211 | sys.exit(2)
212 |
213 |     # if no action parser is provided print that command group's help
214 | if args.action_command is None:
215 | sys.stderr.write(
216 | "********** Please provide a command for further help. "
217 | + "**********\n"
218 | )
219 | parser.parse_args([args.service_command, "-h"])
220 |
221 | module = import_module(
222 | ".{}_{}".format(args.service_command, args.action_command),
223 | "megalodon_extras",
224 | )
225 | module._main(args)
226 |
227 |
228 | if __name__ == "__main__":
229 | _main()
230 |
--------------------------------------------------------------------------------
/docs/extras_calibrate.rst:
--------------------------------------------------------------------------------
1 | ******************************
2 | ``megalodon_extras calibrate``
3 | ******************************
4 |
5 | The ``megalodon_extras calibrate`` command group contains commands to produce Megalodon modified base and sequence variant calibration files for basecalling models.
6 | When a new basecalling model is trained a calibration file must be produced in order to obtain the most accurate aggregated modified base and sequence variant calls.
7 | Without a calibration file the ``--disable-mod-calibration`` or ``--disable-variant-calibration`` flags may be set, but aggregated results will likely be much less accurate.
8 |
9 | Calibration file estimation is broken down into two steps (an example pipeline is sketched after the list):
10 |
11 | 1. Ground truth statistic generation (``megalodon_extras calibrate generate_modified_base_stats`` and ``megalodon_extras calibrate generate_variant_stats`` commands)
12 |
13 | - This step processes completed Megalodon runs to extract ground truth positive and negative statistics.
14 | 2. Calibration estimation (``megalodon_extras calibrate modified_bases`` and ``megalodon_extras calibrate variants`` commands)
15 |
16 | - This step estimates the empirical probability of a modified base or sequence variant given the ground truth statistics from the first step.
17 |
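An end-to-end sketch of these two steps is shown below; the positional arguments and output file names are illustrative, so consult each command's ``-h`` output for the exact interface:

::

    # step 1: extract ground truth statistics from completed Megalodon runs
    # (writing, e.g., mod_calibration_statistics.txt)
    megalodon_extras calibrate generate_modified_base_stats \
        mod_megalodon_results/ \
        --control-megalodon-results-dir control_megalodon_results/
    # step 2: estimate the calibration file from those statistics
    megalodon_extras calibrate modified_bases \
        --ground-truth-llrs mod_calibration_statistics.txt \
        --out-pdf megalodon_mod_calibration.pdf
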
18 | Note that the plots produced by the calibration procedure (with examples shown below) are stored in the GitHub repository for each released model (``megalodon/model_data/``).
19 |
20 | -----------------------------------------------------------
21 | ``megalodon_extras calibrate generate_modified_base_stats``
22 | -----------------------------------------------------------
23 |
24 | Generate ground truth modified base statistics.
25 |
26 | The ground truth modified base composition for a run can be specified in two ways:
27 |
28 | 1. Control Megalodon results
29 |
30 | - Specify ``--control-megalodon-results-dir``
31 | - Using this option assumes that all modified base statistics in ``--control-megalodon-results-dir`` represent canonical bases and all statistics in the main Megalodon results directory represent modified bases.
32 |
33 | - This respects the ``--mod-motif`` options specified in the main Megalodon commands.
34 | 2. Ground truth reference locations
35 |
36 | - Specify ``--ground-truth-data``
37 | - See the ``megalodon_extras modified_bases create_ground_truth`` command for help producing a ground truth CSV file.
38 |
39 | ----------------------------------------------------------
40 | ``megalodon_extras calibrate generate_mod_stats_from_msf``
41 | ----------------------------------------------------------
42 |
43 | In some situations ground truth control samples or reference locations are not available for calibration.
44 | The ``generate_mod_stats_from_msf`` sub-command processes the mapped signal file (``msf``) used for Taiyaki model training to produce Megalodon calibration statistics.
45 | This command uses the ground truth sequence including modified base annotation in order to extract modified base scores as computed in Megalodon.
46 | The extracted scores can be restricted to a fixed canonical sequence motif using the ``--motif`` argument (providing the sequence motif and relative modified position; e.g. ``--motif CG 0`` for CpG methylation).
47 | Note that the final set of Megalodon modified base statistics should contain enough data from both the modified and the canonical sets of sites.
48 | See ``megalodon_extras calibrate merge_modified_bases_stats`` below for merging sets of statistics.
49 |
50 | -----------------------------------------------------
51 | ``megalodon_extras calibrate generate_variant_stats``
52 | -----------------------------------------------------
53 |
54 | Generate ground truth sequence variant statistics.
55 |
56 | This method produces ground truth sequence variant statistics by proposing alternatives to a reference sequence.
57 | It is thus assumed that the mapping location for each read contains the correct reference sequence.
58 | It is advised to select a set of reads with high quality mappings to a high quality reference for the sample.
59 |
60 | This command performs basecalling and read mapping as in the main Megalodon command.
61 | Variants are then randomly proposed and scored for a random set of sites across each read.
62 | "False" reference scores are not produced by default due to the computational overhead required to map full reads to the "incorrect" reference.
63 | This functionality is provided on an experimental basis via the ``--compute-false-reference-scores`` flag, but these scores are not currently accepted by the ``megalodon_extras calibrate variants`` command.
64 |
65 | ---------------------------------------------
66 | ``megalodon_extras calibrate modified_bases``
67 | ---------------------------------------------
68 |
69 | Estimate modified base calibration file.
70 |
71 | Given a set of ground truth modified bases and raw Megalodon called statistics, compute empirical probabilities for a modified base.
72 | The ground truth statistics are generated by the ``megalodon_extras calibrate generate_modified_base_stats`` command, described above, and supplied via the ``--ground-truth-llrs`` argument.
73 | This command computes the empirical log-likelihood ratio over windows of observed modified base scores.
74 | This process involves several steps to ensure certain characteristics of the generating distributions (e.g. monotonicity).
75 | A separate calibration will be computed and stored in the output calibration file for each modified base found in the ground truth file.
76 |
77 | These steps are visualized in the example plot below, which can be produced for any new calibration file by providing the ``--out-pdf`` argument.
78 | The top facet of this plot shows the distribution of theoretical modified base log-likelihood ratios produced by the basecalling model.
79 | These distributions are smoothed such that they are monotonic from either extreme to the peak of the densities.
80 | The middle facet shows the inferred empirical probability that a base is modified given the theoretical modified base score produced by the basecaller.
81 | The final facet shows the same probabilities, but in log-likelihood space.
82 | A constraint is enforced on this function such that the value is monotonically increasing (red - before monotonic constraint; yellow - after monotonic constraint).
83 | The three vertical lines indicate common threshold values for modified base aggregation.
84 | Note that the fraction of data ignored at each threshold level is annotated in the figure legend.
85 |
86 | ----
87 |
88 | .. figure:: _images/modified_base_calibration.png
89 | :align: center
90 | :width: 600
91 |
92 | Visualization of modified base calibration method.
93 |
94 | ----
95 |
96 | ---------------------------------------------------
97 | ``megalodon_extras calibrate merge_modified_bases``
98 | ---------------------------------------------------
99 |
100 | Merge modified base calibration files.
101 |
102 | In some cases the ground truth for one modified base may come from a different source than that for another modified base.
103 | In this case calibration files can be computed separately and combined with this command.
104 | If multiple calibration files contain calibration for the same modified base, the calibration from the file listed first will be stored.
105 |
106 | ---------------------------------------------------------
107 | ``megalodon_extras calibrate merge_modified_bases_stats``
108 | ---------------------------------------------------------
109 |
110 | Merge modified base calibration statistics files.
111 |
112 | In some cases the ground truth statistics may be extracted from several sources (unmodified and modified samples) and merged afterwards.
113 | This command enables this pipeline.
114 |
115 | ---------------------------------------
116 | ``megalodon_extras calibrate variants``
117 | ---------------------------------------
118 |
119 | Estimate sequence variant calibration file.
120 |
121 | Given a set of ground truth sequence variant statistics, supplied via the ``--ground-truth-llrs`` argument, compute empirical probabilities of a sequence variant.
122 | This command computes the empirical log-likelihood ratio over windows of observed sequence variant scores.
123 | This process involves several steps to ensure certain characteristics of the generating distributions.
124 | This procedure is largely the same as the modified base calibration step, but the variants are grouped into categories based on the type of ground truth sequence variant.
125 | Note that the vertical bars are not present in these plots as sequence variant per-read statistics are combined in a probabilistic fashion and not based on a hard threshold.
126 |
127 | ----
128 |
129 | .. figure:: _images/sequence_variant_calibration.png
130 | :align: center
131 | :width: 600
132 |
133 | Visualization of sequence variant calibration method.
134 |
135 | ----
136 |
--------------------------------------------------------------------------------
/docs/file_formats.rst:
--------------------------------------------------------------------------------
1 | ************
2 | File Formats
3 | ************
4 |
5 | This page describes the output file formats produced by ``megalodon``.
6 | Note that all outputs are unsorted (by reference position) unless specified in the output filename.
7 |
8 | ------------
9 | Base Calling
10 | ------------
11 |
12 | Basecalling produces either FASTQ or FASTA formats.
13 | Basecalls will be output into the ``basecalls.fastq`` or ``basecalls.fasta`` file within the ``--output-directory``.
14 |
15 | As of version 2.2, basecall anchored modified base calls (``mod_basecalls``) are output in an unmapped BAM file via the ``Mm`` and ``Ml`` tags `described by hts-specs here `_.
16 |
17 | -------
18 | Mapping
19 | -------
20 |
21 | Mapped reads can be output in SAM, BAM or CRAM formats.
22 | Mapped reads will be output into the ``mappings.sam``, ``mappings.bam``, or ``mappings.cram`` file within the ``--output-directory``.
23 |
24 | ~~~~~~~~~~~~~~~
25 | Mapping Summary
26 | ~~~~~~~~~~~~~~~
27 |
28 | When ``--outputs mappings`` is requested, the ``mappings.summary.txt`` file is produced.
29 | This file contains the following fields (a parsing sketch follows the list):
30 |
31 | #. read_id
32 |
33 | - Unique read identifier (from FAST5)
34 | #. pct_identity
35 |
36 | - Mapping/reference percent identity (computed as ``100 * num_match / num_align``)
37 | #. num_align
38 |
39 | - Length of full alignment (``num_match + num_mismatch + num_ins + num_del``)
40 | #. num_match
41 |
42 | - Number of basecalls aligned to a matching reference base
43 | #. num_del
44 |
45 | - Number of deleted reference bases implied by the alignment
46 | #. num_ins
47 |
48 | - Number of inserted basecall bases implied by the alignment
49 | #. read_pct_coverage
50 |
51 | - Percentage of read basecalls included in reference alignment
52 | #. chrom
53 |
54 | - Reference contig name for mapping
55 | #. strand
56 |
57 | - Strand for mapping
58 | #. start
59 |
60 | - Reference coordinate for start of mapping (0-based closed interval coordinate)
61 | #. end
62 |
63 | - Reference coordinate for end of mapping (0-based open interval coordinate)
64 | #. query_start
65 |
66 | - Basecall coordinate for start of mapping (0-based closed interval coordinate)
67 | #. query_end
68 |
69 | - Basecall coordinate for end of mapping (0-based open interval coordinate)
70 | #. map_sig_start
71 |
72 | - Raw signal coordinate for start of mapping (0-based closed interval coordinate). Note that this coordinate is as stored in the FAST5 file, so for RNA reads (5' to 3' read direction) the start coordinate will be greater than the end coordinate.
73 | #. map_sig_end
74 |
75 | - Raw signal coordinate for end of mapping (0-based open interval coordinate)
76 | #. sig_len
77 |
78 | - Length of signal for complete read
79 | #. map_num
80 |
81 | - Mapping number to distinguish multiple mappings from the same read. Should always be ``0`` when ``--allow-supplementary-alignments`` is not set.
82 |
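The summary can be parsed directly; for example, the following sketch (assuming the tab-delimited layout with a header line) selects high-identity, high-coverage reads:

::

    import pandas as pd

    summary = pd.read_csv("megalodon_results/mappings.summary.txt", sep="\t")
    high_qual = summary[
        (summary["pct_identity"] > 90) & (summary["read_pct_coverage"] > 90)
    ]
    high_qual["read_id"].to_csv(
        "high_quality_read_ids.txt", index=False, header=False
    )
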
83 | ~~~~~~~~~~~~~~~~~~~~~
84 | Modified Base Mapping
85 | ~~~~~~~~~~~~~~~~~~~~~
86 |
87 | As of version 2.2, the default output for the ``mod_mappings`` output type will be a single BAM file with modified base probabilities stored via the ``Mm`` and ``Ml`` tags, as in ``mod_basecalls`` above.
88 | This format can be output in SAM, BAM or CRAM format as specified by the ``--mappings-format`` argument (which also applies to the ``mappings`` and ``mod_basecalls`` outputs).
89 |
90 | In order to obtain ``mod_mappings`` in the format produced by Megalodon versions before 2.2, use the ``--mod-map-emulate-bisulfite`` flag.
91 | This option will output a file for each modified base represented in the basecalling model.
92 | The mapped reads in this output represent only the information about modified bases contained within each read.
93 | Each read includes the mapped reference bases with only the called modified bases annotated.
94 | The quality score for each called base (whether called as modified or canonical) represents the probability of a modified status rather than the canonical base probability specified by the SAM format.
95 | Bases without a proposed modified base will contain a quality score of ``40``.
96 |
97 | In addition, the ``--mod-map-base-conv`` is provided to modulate the bases output by this format.
98 | This option is useful since the BAM and CRAM formats do not support modified bases and will convert all alternative bases to ``N`` for storage.
99 | For example, to mimic bisulfite output use ``--mod-map-base-conv C T --mod-map-base-conv Z C``.
100 | This can then be visualized by a genome browser as with standard bisulfite data.
101 |
102 | ----
103 |
104 | .. figure:: _images/mod_mapping_viz.png
105 | :align: center
106 | :width: 600
107 |
108 | Genome browser visualization of Megalodon ``mod_mappings`` output.
109 |
110 | ----
111 |
112 | ~~~~~~~~~~~~~~~
113 | Variant Mapping
114 | ~~~~~~~~~~~~~~~
115 |
116 | In addition to standard mapping files, megalodon includes a special mapping-style output with specific relevance to the variant calling pipeline.
117 | This format can be output as a SAM, BAM or CRAM file as with standard mapping format (as specified by the ``--mappings-format`` argument).
118 | The mapped reads in this output represent only the information about proposed variants contained within each read.
119 | Each read includes the mapped reference bases with only the called variants annotated.
120 | The score for each call is encoded in the base quality scores for each read.
121 | Bases without a proposed variant will contain a quality score of ``40``.
122 | Note that storage of insertion probabilities is not supported by the SAM/BAM format, so these scores are lost in this format.
123 | This output is useful for 1) producing more accurate variant phasing and read haplotagging via whatshap and 2) visualizing per-read variant calls in a genome browser.
124 |
125 | -----------------------
126 | Per-read Modified Bases
127 | -----------------------
128 |
129 | ~~~~~~~~
130 | Database
131 | ~~~~~~~~
132 |
133 | The primary output for per-read modified base results is an `sqlite database `_.
134 | This database contains an indexed table with per-read, per-position modified base scores, as well as auxiliary tables with read, modification type, and reference chromosome/record information.
135 | The read table (``read``) contains the read UUID.
136 | The modification type table (``mod``) contains the single letter modified base code, the modified base long name and the associated canonical base.
137 |
138 | As of version 2.2, the ``pos`` table has been dropped from the modified base schema.
139 | In place of the ``pos`` table, the ``chrm`` table contains the name and length of each chromosome/record in the reference sequence.
140 | The ``score_pos`` field in the ``data`` table then contains an integer encoding of the ``(chrm, pos, strand)`` tuple (see the ``megalodon.mods.ModsDb.get_pos_dbid`` and ``megalodon.mods.ModsDb.get_pos`` functions).
141 | This allows more efficient access to position information without requiring additional interaction with the database.
142 |
143 | The ``data`` table then contains the links between these tables along with the per-read log probability for each modified base at each called reference position in the ``score`` column.
144 | This table is indexed at the end of the run by the ``score_pos`` field such that iteration over the table (via ``megalodon.mods.ModsDb.iter_pos_scores``) occurs in reference-sorted order.
145 |
146 | This database may be accessed via the ``megalodon.mods.ModsDb`` object.
147 | More documentation on the usage of the ``megalodon.mods.ModsDb`` interface will be added in a future release.
148 |
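Until then, the underlying tables can be inspected directly with Python's ``sqlite3`` module; the database file name below assumes the default output naming:

::

    import sqlite3

    # assumed default database file name within the results directory
    conn = sqlite3.connect("megalodon_results/per_read_modified_base_calls.db")
    # list the tables described above (read, mod, chrm, data)
    for (name,) in conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table'"
    ):
        print(name)
    conn.close()
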
149 | ~~~~~~~~~~~~~
150 | Tab-delimited
151 | ~~~~~~~~~~~~~
152 |
153 | Modified base results are also available via tab-delimited text output.
154 | This output can be requested via the ``--write-mods-text`` flag or obtained after a run via the ``megalodon_extras per_read_text modified_bases`` command.
155 | This output contains the following fields: ``read_id``, ``chrm``, ``strand``, ``pos``, ``mod_log_prob``, ``can_log_prob``, and ``mod_base``.
156 |
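For example, a per-read log-likelihood ratio can be computed from these fields; the text file name below assumes the default output naming:

::

    import pandas as pd

    # assumed default text output name within the results directory
    mods_txt = pd.read_csv(
        "megalodon_results/per_read_modified_base_calls.txt", sep="\t"
    )
    # positive LLR values support the canonical base call
    mods_txt["llr"] = mods_txt["can_log_prob"] - mods_txt["mod_log_prob"]
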
157 | -------------------------
158 | Aggregated Modified Bases
159 | -------------------------
160 |
161 | The default aggregated modified base output is the bedMethyl format (`description here `_).
162 | Alternative formats are `wiggle `_ (variableStep) and VCF (treating the modified base as if it were a sequence variant).
163 |
164 | --------------------------
165 | Per-read Sequence Variants
166 | --------------------------
167 |
168 | As with the modified base results, the primary output for per-read sequence variant results is an `sqlite database `_.
169 | This database contains an indexed table with per-read, per-position variant scores, as well as auxiliary tables with read, reference location, and alternative allele information.
170 |
171 | The reference location table (``loc``) contains the mapped 0-based position, strand (1=forward, -1=reverse) and chromosome (via a final ``chrm`` table which contains the chromosome text).
172 | The ``loc`` table also contains the location for the start and end of the tested positions (applicable for insertions/deletions).
173 | For example, insertions generally require a context base for downstream processing, but within megalodon only the inserted position is considered (without context).
174 | Each reference location is linked to the variant IDs associated with that location in the input variants file.
175 | Finally the reference sequence for the location is included in this table.
176 | In the related ``alt`` table, each alternative sequence is stored.
177 | Links between alternative sequences and reference locations are made via the main ``data`` table.
178 |
179 | The ``read`` table contains the read UUID as well as the mapped strand for each read.
180 |
181 | ----------------------------
182 | Aggregated Sequence Variants
183 | ----------------------------
184 |
185 | Sequence variant calls are output in standard VCF format (version 4.1).
186 | The sample format fields include the following standard VCF fields: ``GT``, ``GQ``, ``GP``, ``GL``, and ``PL``.
187 | In addition the non-standard ``log_probs`` field, containing the per-read contributions to the variant call, can be added to the VCF file by setting the ``--write-vcf-log-probs`` flag.
188 |
--------------------------------------------------------------------------------
/docs/modbase_training.rst:
--------------------------------------------------------------------------------
1 | **************************************
2 | Megalodon Modified Base Model Training
3 | **************************************
4 |
5 | This page describes the process to generate modified base training data and train a basecalling model capable of also detecting modified bases.
6 | The options provided from Megalodon for modified base training data annotation are each linked to a specific sample type.
7 | If the provided options do not perform sufficiently for a particular sample type, please open an issue on the `Megalodon issues page `_ with details of the sample type and intended training procedure.
8 |
9 | Currently two sample types are supported for modified base training data markup:
10 |
11 | 1. Modified base in known sequence context (e.g. bacterial methylation)
12 | 2. Native (fractionally modified) sample with existing modified base basecalling model
13 |
14 | Given the first sample type above, a model will never be presented with a modified and a canonical base in close proximity (within the same training chunk).
15 | Thus the second sample type is often required to produce a final model with sufficient performance across biologically relevant samples.
16 |
17 | The highest accuracy markup of the native sample is imperative to achieve the highest performance modified base basecalling model.
18 | In version 2.3, an adaptation was added to allow markup informed by a reference methylation ground truth (the fraction of modified reads at each reference site).
19 | This ground truth can come from a number of sources and technologies.
20 | Find more details below on how to use this method.
21 |
22 | ------------------------------
23 | Modified Base in Known Context
24 | ------------------------------
25 |
26 | Given a sample with all locations matching a particular sequence motif modified (e.g. bacterial methylation) the ``--ref-mods-all-motifs`` option can be specified.
27 | The ``--ref-mods-all-motifs`` argument takes ``4`` arguments.
28 | These values are:
29 |
30 | 1. Single letter code for the modified base
31 | 2. Long name for the modified base
32 | 3. Sequence motif (using `IUPAC ambiguity codes `_)
33 | 4. Relative position for the modified base within the sequence motif (0-based coordinate)
34 |
35 | The first two values are simply stored in the training data file.
36 | When used for model training, these values will be saved for annotation of output files where appropriate (e.g. Megalodon or Guppy output files).
37 |
38 | Using this mode of modified base annotation, the specified basecalling model can be either a standard (canonical bases only) model or a modified base model.
39 | Thus this method of annotation can be specified for modifications with no previous modeling (as long as the current basecalls map to the provided reference).
40 | If the basecalling model specified is a modified base model, the single letter, long name, and corresponding canonical base attributes must be in agreement with the basecalling model.
41 |
42 | As an example, a native E. coli K12 sample contains 5mC methylation at a single motif, ``CCWGG`` at the second position, and 6mA methylation at several motifs: ``GATC`` at the second position, ``AACNNNNNNGTGC`` at the second position, and ``GCACNNNNNNGTT`` at the third position (`source `_).
43 | In order to annotate all of these modifications in a training data set the following command would be run:
44 |
45 | ::
46 |
47 | megalodon ecoli_k12_fast5s/ \
48 | --outputs signal_mappings \
49 | --reference ecoli_k12_reference.fa \
50 | --devices 0 --processes 40 \
51 | --ref-mods-all-motifs m 5mC CCWGG 1 \
52 | --ref-mods-all-motifs a 6mA GATC 1 \
53 | --ref-mods-all-motifs a 6mA AACNNNNNNGTGC 1 \
54 | --ref-mods-all-motifs a 6mA GCACNNNNNNGTT 2
55 |
56 | Note that the single letter codes, ``m`` and ``a``, can be set to any value desired by the user.
57 | It is recommended that the values follow `specifications found in hts-specs `_.
58 | These values will be stored in the trained model for outputs where appropriate.
59 |
60 | ----------------------------------
61 | Bootstrap Modified Base Annotation
62 | ----------------------------------
63 |
64 | Once a modified base model is trained (see above) and calibration file computed (see below), further models can be trained by marking modified base training data with this model.
65 |
66 | .. warning::
67 |
68 | Great care should be taken when training a modified base basecalling model, especially with this method.
69 | The accuracy of reference modified base markup is strongly indicative of the final modified base detection performance for a trained model.
70 |
71 | The following example assumes a trained model to detect 5mC (``m``) in ``CG`` sequence contexts (model specified in ``model_final.cfg``).
72 | In order to annotate 5mC sites in a modified base training data set (``signal_mapping``) using a modified base model the following command would be run:
73 |
74 | ::
75 |
76 | megalodon native_human_fast5s/ \
77 | --outputs signal_mappings \
78 | --reference reference.fa.mmi \
79 | --devices 0 --processes 40 \
80 | --mod-motif m CG 0 \
81 | --ref-include-mods \
82 | --guppy-config model_final.cfg
83 |
84 | The ``--ref-mod-threshold`` argument is provided to adjust the annotation based on modeling results.
85 | By default the threshold to annotate a base as modified is a log likelihood ratio of ``0`` (i.e. the modified base is more likely than the canonical base based on empirically calibrated statistics).
86 | In some samples this value may not be optimal.
87 | The ``megalodon_extras modified_bases estimate_threshold`` command is provided for assistance in determining a reasonable value for this parameter.
88 |
89 | -----------------------------------------------------
90 | Ground Truth Aided Bootstrap Modified Base Annotation
91 | -----------------------------------------------------
92 |
93 | To further improve modified base training data markup, a reference anchored ground truth can be leveraged.
94 | This method sets the modified base markup threshold at each reference position, informed by the provided ground truth.
95 | This is similar to the ``--ref-mod-threshold`` argument, except that ``--ref-mod-threshold`` applies a single global threshold to all reference positions.
96 |
97 | The first step in this method is to call per-read modified bases on the native sample of interest.
98 | This sample should contain sufficient depth, such that the identified modified base threshold at each position will be robust.
99 | 50X coverage is a rough target for sufficient coverage with this method.
100 |
101 | Given a completed Megalodon run (``megalodon_results`` with ``--outputs mappings per_read_mods``) and a ground truth bedmethyl file (``ground_truth_methylation.CG.bed``), the following command will compute per-site modified base thresholds and identify low-coverage sites.
102 |
103 | ::
104 |
105 | # Compute per-site thresholds
106 | megalodon_extras \
107 | modified_bases per_site_thresholds \
108 | megalodon_results \
109 | ground_truth_methylation.CG.bed \
110 | --strand-offset 1 \
111 | --ground-truth-coverage-pdf gt_cov.CG.pdf \
112 | --ground-truth-cov-min 50 \
113 | --nanopore-cov-min 50 \
114 | --out-blacklist-sites low_coverage_sites.CG.bed \
115 | --out-per-site-mod-thresholds site_mod_thresholds.CG.bed
116 | # sort low coverage sites for faster bedtools filtering
117 | sort -S 25% --parallel=56 -T /tmp/ \
118 | -k1,1V -k2,2n \
119 | -o low_coverage_sites.CG.sorted.bed low_coverage_sites.CG.bed
120 | # filter and sort first round mappings (% ident>90, read coverage>90%, length between 1000 and 20000)
121 | awk '$2 > 90 && $7 > 90 && $3 - $6 > 1000 && $3 - $6 < 20000 {print $8"\t"$10"\t"$11"\t"$1"\t.\t"$9}' \
122 | megalodon_results/mappings.summary.txt | \
123 | sort -S 25% --parallel=56 -T /tmp/ -k1,1V -k2,2n > \
124 | mappings.filtered.sorted.bed
125 | intersectBed \
126 | -a mappings.filtered.sorted.bed \
127 | -b low_coverage_sites.CG.sorted.bed -s -sorted -v | \
128 | awk '{print $4}' > train_read_ids.txt
129 |
130 | Finally the ground truth aided modified base markup training data set is produced with the following command.
131 |
132 | ::
133 |
134 | megalodon \
135 | native_human_fast5s/ \
136 | --reference reference.fa.mmi \
137 | --output-directory per_site_markup_mega_res \
138 | --outputs per_read_mods signal_mappings \
139 | --mod-motif m CG 0 \
140 | --guppy-config model_final.cfg \
141 | --processes 40 --devices 0 \
142 | --ref-include-mods \
143 | --mod-per-site-threshold site_mod_thresholds.CG.bed \
144 | --read-ids-filename train_read_ids.txt
145 |
146 | ----------------------------
147 | Modified Base Model Training
148 | ----------------------------
149 |
150 | Given any of the above modified base annotated mapped signal files, a new modified base model can be trained with `Taiyaki `_.
151 | Below is an example command to train a modified base model from the data prepared above and convert the final model for use with Guppy or Megalodon.
152 |
153 | ::
154 |
155 | train_flipflop.py ./taiyaki/models/mLstm_cat_mod_flipflop.py \
156 | megalodon_results/signal_mappings.hdf5 --device 0
157 | # dump model to json format for use by guppy
158 | dump_json.py training/model_final.checkpoint \
159 | --output model_final.jsn
160 |
161 | The produced model should be referenced from a new Guppy config file.
162 | The easiest way to obtain this would be to copy and edit the closest existing Guppy config file in the ``data`` directory of Guppy.
163 |
164 | ---------------------
165 | Megalodon Calibration
166 | ---------------------
167 |
168 | In order to produce the most accurate aggregated modified base calls, Megalodon requires the computation of a calibration file.
169 | A ground truth sample containing known modified reference sites as well as known canonical base sites is required.
170 | This can be the same as the model training data.
171 | A modified base calibration file is created with the ``megalodon_extras calibrate generate_modified_base_stats`` and ``megalodon_extras calibrate modified_bases`` commands.
172 | Please see the `calibration documentation page `_ for more details about this process.
173 |
--------------------------------------------------------------------------------