├── .devcontainer ├── devcontainer.json ├── initializeCommand.sh └── postCreateCommand.sh ├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ ├── bug-report.yml │ └── feature-request.yml ├── codecov.yml ├── copy-pr-bot.yaml ├── dependabot.yml ├── pull_request_template.md └── workflows │ ├── approvals.yml │ ├── bionemo-subpackage-ci.yml │ ├── blossom-ci.yml │ ├── gh-docs-deploy.yml │ ├── internal_tools.yml │ ├── trufflehog.yml │ └── unit-tests.yml ├── .gitignore ├── .gitmodules ├── .nspect-allowlist.toml ├── .pre-commit-config.yaml ├── .secrets-nb.baseline ├── .secrets.baseline ├── .vscode └── settings.json ├── CODE-REVIEW.md ├── CODEOWNERS ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── license.txt └── third_party.txt ├── README.md ├── SECURITY.md ├── VERSION ├── ci ├── benchmarks │ ├── partial-conv │ │ ├── amplify_pretrain.yaml │ │ ├── esm2_pretrain.yaml │ │ ├── evo2_pretrain.yaml │ │ └── geneformer_pretrain.yaml │ └── perf │ │ ├── amplify_pretrain.yaml │ │ ├── esm2_pretrain.yaml │ │ ├── evo2_pretrain.yaml │ │ └── geneformer_pretrain.yaml ├── docker │ ├── Dockerfile.uv │ └── entrypoint.sh └── scripts │ ├── build_docker_image.sh │ ├── run_pytest.sh │ ├── static_checks.sh │ └── utils.sh ├── docker_build_patches └── decord_ffmpeg6_fix.patch ├── docs ├── Dockerfile ├── README.md ├── conftest.py ├── docs │ ├── SUMMARY.md │ ├── assets │ │ ├── css │ │ │ ├── chatbot.css │ │ │ ├── color-schemes.css │ │ │ ├── custom-material.css │ │ │ ├── fonts.css │ │ │ └── jupyter-themes.css │ │ ├── images │ │ │ ├── amplify │ │ │ │ ├── training_loss.png │ │ │ │ ├── validation_loss.png │ │ │ │ └── validation_ppl.png │ │ │ ├── esm2 │ │ │ │ ├── esm2_device_scaling.png │ │ │ │ ├── esm2_device_scaling.svg │ │ │ │ ├── esm2_model_scaling.png │ │ │ │ ├── esm2_model_scaling.svg │ │ │ │ ├── esm2_peft_memory_usage.png │ │ │ │ ├── esm2_peft_time.png │ │ │ │ ├── esm2_pretrain_convergence.png │ │ │ │ ├── esm2_pretrain_convergence.svg │ │ │ │ ├── esm2_single_node_training_perf.png │ │ │ │ └── 
esm2_single_node_training_perf.svg │ │ │ ├── evo2 │ │ │ │ ├── evo2_bionemo_1b_6950steps.png │ │ │ │ ├── evo2_bionemo_7bnv_28ksteps.png │ │ │ │ ├── evo2_performance_by_cluster_size.png │ │ │ │ ├── evo2_savanna_1b_6950steps.png │ │ │ │ ├── evo2_savanna_7b_28ksteps.png │ │ │ │ ├── evo2_vs_7b_40b_performance_vs_context_length.png │ │ │ │ ├── evo2_vs_llama2_performance_vs_context_length.png │ │ │ │ └── evo2_zeroshot_brca1_stripplot.png │ │ │ ├── favicon.png │ │ │ ├── geneformer │ │ │ │ ├── F1-score-models.png │ │ │ │ ├── Geneformer_steven_106m_train.png │ │ │ │ ├── Geneformer_steven_106m_val.png │ │ │ │ ├── accuracy-models-04-18-2025.png │ │ │ │ ├── average-accuracy-models.png │ │ │ │ ├── f1-score-models-04-18-2025.png │ │ │ │ ├── geneformer_106m_train_loss.png │ │ │ │ ├── geneformer_106m_val_loss.png │ │ │ │ ├── geneformer_10m_training_loss.png │ │ │ │ ├── geneformer_10m_val_loss.png │ │ │ │ ├── loss_curve_new_v_old_geneformer_64_node_10M.png │ │ │ │ └── model_tflops_per_gpu_chart_geneformer.png │ │ │ ├── logo-icon-black.svg │ │ │ ├── logo-white.svg │ │ │ ├── megatron_background │ │ │ │ ├── README.md │ │ │ │ ├── data_parallelism.png │ │ │ │ ├── execution_schedulers.png │ │ │ │ ├── fsdp_slide1.png │ │ │ │ ├── fsdp_slide2.png │ │ │ │ ├── pipeline_parallelism.png │ │ │ │ ├── sp_korthikanti_2022_fig5.png │ │ │ │ ├── tensor_and_pipeline_parallelism.png │ │ │ │ └── tensor_parallelism.png │ │ │ ├── sub_package_graphs │ │ │ │ ├── dependency_file_imports.png │ │ │ │ ├── dependency_graph_pyproject.png │ │ │ │ └── dependency_graph_tach.png │ │ │ └── wandb_tips_tricks │ │ │ │ └── trainer_global_step.png │ │ ├── javascript │ │ │ └── chatbot.js │ │ └── old_images │ │ │ ├── .gitkeep │ │ │ ├── MMB_molecule_generation_1.png │ │ │ ├── MMB_molecule_generation_2.png │ │ │ ├── MMB_molecule_generation_3.png │ │ │ ├── MMB_molecule_generation_4.png │ │ │ ├── MMB_molecule_generation_5.png │ │ │ ├── MolMIM_model.png │ │ │ ├── MolMIM_molecule_generation_1.png │ │ │ ├── 
MolMIM_molecule_generation_2.png │ │ │ ├── bcp_snapshot_.png │ │ │ ├── bcp_snapshot_1.png │ │ │ ├── bcp_snapshot_2.png │ │ │ ├── bcp_snapshot_3.png │ │ │ ├── bionemo_overview_1.png │ │ │ ├── bionemo_overview_2.png │ │ │ ├── cellxgene │ │ │ ├── num_cells_by_assay.png │ │ │ ├── num_cells_by_dataset.png │ │ │ ├── num_genes_measured_by_assay.png │ │ │ ├── pct_cells_by_age.png │ │ │ ├── pct_cells_by_ethnicity_category.png │ │ │ ├── pct_cells_by_sex.png │ │ │ ├── pct_cells_by_tissue_category.png │ │ │ └── top9_datasets_tissue_distribution.png │ │ │ ├── diffdock_1.png │ │ │ ├── diffdock_2.png │ │ │ ├── diffdock_3.png │ │ │ ├── diffdock_4.png │ │ │ ├── diffdock_fw_overview.png │ │ │ ├── equidock_1.png │ │ │ ├── equidock_2.png │ │ │ ├── equidock_3.png │ │ │ ├── equidock_4.png │ │ │ ├── esm1nv_1.png │ │ │ ├── esm1nv_2.png │ │ │ ├── esm1nv_3.png │ │ │ ├── esm1nv_4.png │ │ │ ├── mmb_1.png │ │ │ ├── mmb_2.png │ │ │ ├── mmb_3.png │ │ │ ├── mmb_4.png │ │ │ ├── mmb_5.png │ │ │ ├── molmim-embedding.png │ │ │ ├── molmim-hidden-state.png │ │ │ ├── molmim-predictive-modeling.png │ │ │ ├── sc_fm │ │ │ ├── F1-score-models.png │ │ │ ├── average-accuracy-models.png │ │ │ ├── geneformer-106m-240530-val-train-loss.png │ │ │ ├── geneformer-10m-240530-val-train-loss.png │ │ │ ├── geneformer-240530-val-comparison.png │ │ │ └── model_tflops_per_gpu_chart_tight_layout.png │ │ │ ├── wandai_charts.png │ │ │ └── wandb-dashboard.png │ ├── index.md │ ├── main │ │ ├── SUMMARY.md │ │ ├── about │ │ │ ├── SUMMARY.md │ │ │ ├── background │ │ │ │ ├── SUMMARY.md │ │ │ │ ├── megatron_datasets.md │ │ │ │ └── nemo2.md │ │ │ ├── overview.md │ │ │ └── releasenotes-fw.md │ │ ├── contributing │ │ │ ├── Writing Documentation │ │ │ │ ├── index.md │ │ │ │ ├── jupyter-notebooks.ipynb │ │ │ │ └── mkdocs.md │ │ │ ├── code-review.md │ │ │ ├── contributing.md │ │ │ └── sub-package_dependency_graph.md │ │ ├── datasets │ │ │ ├── CELLxGENE.md │ │ │ ├── index.md │ │ │ └── uniprot.md │ │ ├── developer-guide │ │ │ └── 
SUMMARY.md │ │ ├── examples │ │ │ ├── .gitignore │ │ │ ├── SUMMARY.md │ │ │ └── conftest.py │ │ ├── getting-started │ │ │ ├── SUMMARY.md │ │ │ ├── access-startup.md │ │ │ ├── development.md │ │ │ ├── index.md │ │ │ ├── initialization-guide.md │ │ │ ├── pre-reqs.md │ │ │ ├── training-models.md │ │ │ └── using-slurm.md │ │ ├── index.md │ │ └── references │ │ │ ├── API_reference │ │ │ └── index.md │ │ │ └── FAQ.md │ └── models │ │ ├── ESM-2 │ │ ├── SUMMARY.md │ │ ├── index.md │ │ └── pre-training.md │ │ ├── amplify.md │ │ ├── evo2.md │ │ ├── geneformer.md │ │ └── index.md ├── mkdocs.yml ├── overrides │ ├── .icons │ │ └── nvidia │ │ │ └── nvidia-logo.svg │ └── main.html ├── requirements.txt └── scripts │ └── gen_ref_pages.py ├── internal ├── Pypi_publish.md ├── README_justfile.md ├── infra-bionemo │ ├── LICENSE │ ├── README.md │ ├── pyproject.toml │ ├── setup.py │ ├── src │ │ └── infra_bionemo │ │ │ ├── __init__.py │ │ │ ├── license_check.py │ │ │ └── new_project │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── exe │ │ │ ├── __init__.py │ │ │ ├── bionemo_subpackage.py │ │ │ ├── namespace.py │ │ │ └── simple.py │ │ │ ├── templates.py │ │ │ └── utils.py │ └── tests │ │ ├── conftest.py │ │ └── test_infra_bionemo │ │ ├── test_license_check.py │ │ └── test_new_project │ │ ├── test_api.py │ │ ├── test_cli_tools.py │ │ └── test_utils.py └── scripts │ ├── README.md │ ├── build_dev_image.sh │ ├── run_dev.sh │ └── setup_env_file.sh ├── justfile ├── license_header ├── pyproject.toml ├── requirements-cve.txt ├── requirements-dev.txt ├── requirements-test.txt ├── scripts ├── gpt-pretrain.py └── protein │ └── esm2 │ └── esm2_dataset_perplexity.py ├── sub-packages ├── bionemo-amplify │ ├── README.md │ ├── VERSION │ ├── pyproject.toml │ ├── src │ │ └── bionemo │ │ │ └── amplify │ │ │ ├── __init__.py │ │ │ ├── convert.py │ │ │ ├── datamodule.py │ │ │ ├── dataset.py │ │ │ ├── hf_rotary.py │ │ │ ├── infer_amplify.py │ │ │ ├── model.py │ │ │ ├── tokenizer.py │ │ │ └── train_amplify.py │ 
└── tests │ │ └── bionemo │ │ └── amplify │ │ ├── __init__.py │ │ ├── test_convert.py │ │ ├── test_datamodule.py │ │ ├── test_dataset.py │ │ ├── test_hf_rotary.py │ │ ├── test_infer_amplify.py │ │ ├── test_model.py │ │ ├── test_tokenizer.py │ │ └── test_train_amplify.py ├── bionemo-core │ ├── LICENSE │ ├── README.md │ ├── VERSION │ ├── pyproject.toml │ ├── src │ │ └── bionemo │ │ │ └── core │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── data │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── load.py │ │ │ ├── multi_epoch_dataset.py │ │ │ ├── permute.py │ │ │ ├── resamplers.py │ │ │ ├── resource.py │ │ │ └── resources │ │ │ │ ├── esm2.yaml │ │ │ │ ├── evo2.yaml │ │ │ │ ├── geneformer.yaml │ │ │ │ ├── scdl.yaml │ │ │ │ └── single_cell.yaml │ │ │ ├── model │ │ │ ├── __init__.py │ │ │ └── config.py │ │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── batching_utils.py │ │ │ ├── dtypes.py │ │ │ └── random_utils.py │ └── tests │ │ └── bionemo │ │ └── core │ │ ├── data │ │ ├── test_load.py │ │ ├── test_load_notebook.ipynb │ │ ├── test_multi_epoch_dataset.py │ │ ├── test_permute.py │ │ ├── test_resamplers.py │ │ └── test_resource.py │ │ └── utils │ │ └── test_dtypes.py ├── bionemo-esm2 │ ├── LICENSE │ ├── README.md │ ├── VERSION │ ├── examples │ │ ├── finetune.ipynb │ │ ├── inference.ipynb │ │ ├── mutant-design.ipynb │ │ └── pretrain.md │ ├── pyproject.toml │ ├── src │ │ └── bionemo │ │ │ └── esm2 │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── datamodule.py │ │ │ ├── dataset.py │ │ │ └── tokenizer │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── special_tokens_map.json │ │ │ │ ├── tokenizer_config.json │ │ │ │ └── vocab.txt │ │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── convert.py │ │ │ ├── embedding.py │ │ │ ├── finetune │ │ │ │ ├── __init__.py │ │ │ │ ├── datamodule.py │ │ │ │ ├── dataset.py │ │ │ │ ├── loss.py │ │ │ │ ├── peft.py │ │ │ │ ├── sequence_model.py │ │ │ │ └── token_model.py │ │ │ └── model.py │ 
│ │ ├── run │ │ │ ├── __init__.py │ │ │ ├── config_models.py │ │ │ ├── main.py │ │ │ └── recipes.py │ │ │ ├── scripts │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── finetune_esm2.py │ │ │ ├── infer_esm2.py │ │ │ └── train_esm2.py │ │ │ └── testing │ │ │ ├── __init__.py │ │ │ └── compare.py │ └── tests │ │ └── bionemo │ │ └── esm2 │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── data │ │ ├── __init__.py │ │ ├── test_datamodule.py │ │ ├── test_dataset.py │ │ └── test_tokenizer.py │ │ ├── model │ │ ├── __init__.py │ │ ├── finetune │ │ │ ├── __init__.py │ │ │ ├── test_datamodule.py │ │ │ ├── test_dataset.py │ │ │ ├── test_sequence_model.py │ │ │ └── test_token_model.py │ │ ├── test_convert.py │ │ ├── test_embedding.py │ │ ├── test_model.py │ │ └── test_stop_and_go.py │ │ └── scripts │ │ ├── __init__.py │ │ ├── test_finetune_esm2.py │ │ ├── test_infer_esm2.py │ │ ├── test_pydantic_train.py │ │ └── test_train_esm2.py ├── bionemo-evo2 │ ├── LICENSE │ ├── README.md │ ├── VERSION │ ├── assets │ │ └── 1b_finetuning_train_curve_500_steps_256gbs.png │ ├── examples │ │ ├── .gitignore │ │ ├── configs │ │ │ ├── README.md │ │ │ ├── full_pretrain_longphase_config.yaml │ │ │ ├── full_pretrain_shortphase_config.yaml │ │ │ ├── test_preproc_config.yaml │ │ │ └── test_promotors_dataset_config.yaml │ │ ├── fine-tuning-tutorial.ipynb │ │ └── zeroshot_brca1.ipynb │ ├── pyproject.toml │ ├── src │ │ └── bionemo │ │ │ └── evo2 │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── fasta_dataset.py │ │ │ ├── preprocess.py │ │ │ ├── tokenizer.py │ │ │ └── transcript_extraction.py │ │ │ ├── run │ │ │ ├── __init__.py │ │ │ ├── infer.py │ │ │ ├── predict.py │ │ │ └── train.py │ │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── checkpoint │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── convert_checkpoint_model_parallel_evo2.py │ │ │ ├── convert_to_nemo.py │ │ │ ├── convert_zero3_to_zero1.py │ │ │ ├── params.py │ │ │ └── zero3_conversion_lib.py │ │ │ └── 
config.py │ └── tests │ │ └── bionemo │ │ └── evo2 │ │ ├── data │ │ ├── test_fasta_dataset.py │ │ ├── test_preprocess.py │ │ └── test_tokenizer.py │ │ ├── run │ │ ├── test_infer.py │ │ ├── test_inference.py │ │ ├── test_predict.py │ │ └── test_train.py │ │ ├── test_evo2.py │ │ └── test_hyena_operators.py ├── bionemo-example_model │ ├── LICENSE │ ├── README.md │ ├── VERSION │ ├── pyproject.toml │ ├── src │ │ └── bionemo │ │ │ └── example_model │ │ │ ├── lightning │ │ │ ├── __init__.py │ │ │ └── lightning_basic.py │ │ │ └── training_scripts │ │ │ ├── finetune_mnist.py │ │ │ ├── predict_mnist.py │ │ │ └── pretrain_mnist.py │ └── tests │ │ └── bionemo │ │ └── example_model │ │ └── lightning │ │ └── test_lightning_basic.py ├── bionemo-fw │ ├── LICENSE │ ├── README.md │ ├── VERSION │ ├── pyproject.toml │ ├── src │ │ └── bionemo │ │ │ └── fw │ │ │ ├── __init__.py │ │ │ └── dependency_graph.py │ └── tests │ │ ├── __init__.py │ │ └── bionemo │ │ └── fw │ │ ├── test_dependency_graph.py │ │ └── test_sub_package_imports.py ├── bionemo-geneformer │ ├── LICENSE │ ├── README.md │ ├── VERSION │ ├── examples │ │ ├── .gitignore │ │ ├── geneformer-celltype-classification.ipynb │ │ ├── geneformer-gene-embedding-GRN.ipynb │ │ └── geneformer_cellxgene_tutorial.ipynb │ ├── pyproject.toml │ ├── scripts │ │ ├── README.md │ │ └── geneformer_mlm_loss_eval.py │ ├── src │ │ └── bionemo │ │ │ └── geneformer │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── preprocess.py │ │ │ └── singlecell │ │ │ │ ├── __init__.py │ │ │ │ ├── datamodule.py │ │ │ │ ├── dataset.py │ │ │ │ ├── preprocess.py │ │ │ │ └── utils.py │ │ │ ├── model │ │ │ ├── __init__.py │ │ │ └── finetune_token_regressor.py │ │ │ ├── run │ │ │ ├── __init__.py │ │ │ ├── config_models.py │ │ │ ├── main.py │ │ │ └── recipes.py │ │ │ ├── scripts │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── celltype_classification_bench │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── bench.py │ │ │ │ └── 
download.py │ │ │ ├── infer_geneformer.py │ │ │ └── train_geneformer.py │ │ │ ├── tokenizer │ │ │ ├── __init__.py │ │ │ └── gene_tokenizer.py │ │ │ └── utils │ │ │ ├── __init__.py │ │ │ └── callbacks.py │ └── tests │ │ └── bionemo │ │ └── geneformer │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── scripts │ │ ├── __init__.py │ │ ├── test_pydantic_train.py │ │ └── test_train_geneformer.py │ │ ├── test_celltype_bench.py │ │ ├── test_dataset.py │ │ ├── test_model.py │ │ ├── test_stop_and_go.py │ │ └── test_transformer_specs.py ├── bionemo-geometric │ ├── LICENSE │ ├── README.md │ ├── VERSION │ ├── pyproject.toml │ ├── requirements.txt │ ├── src │ │ └── bionemo │ │ │ └── geometric │ │ │ ├── __init__.py │ │ │ ├── atom_featurizers.py │ │ │ ├── base_featurizer.py │ │ │ ├── bond_featurizers.py │ │ │ ├── data │ │ │ └── electronic_data.csv │ │ │ └── molecule_featurizers.py │ └── tests │ │ └── bionemo │ │ └── geometric │ │ ├── test_atom_featurizers.py │ │ ├── test_bionemo_geometric.py │ │ ├── test_bond_featurizers.py │ │ └── test_molecule_featurizers.py ├── bionemo-llm │ ├── LICENSE │ ├── README.md │ ├── VERSION │ ├── pyproject.toml │ ├── src │ │ └── bionemo │ │ │ └── llm │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── collate.py │ │ │ ├── datamodule.py │ │ │ ├── label2id_tokenizer.py │ │ │ ├── masking.py │ │ │ └── types.py │ │ │ ├── lightning.py │ │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── biobert │ │ │ │ ├── __init__.py │ │ │ │ ├── lightning.py │ │ │ │ ├── model.py │ │ │ │ ├── testing_utils.py │ │ │ │ └── transformer_specs.py │ │ │ ├── config.py │ │ │ ├── layers.py │ │ │ ├── loss.py │ │ │ └── lr_scheduler.py │ │ │ ├── run │ │ │ ├── __init__.py │ │ │ └── config_models.py │ │ │ ├── train.py │ │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── callbacks.py │ │ │ ├── datamodule_utils.py │ │ │ ├── iomixin_utils.py │ │ │ ├── logger_utils.py │ │ │ ├── megatron_utils.py │ │ │ ├── remote.py │ │ │ └── weight_utils.py │ └── tests │ │ ├── __init__.py 
│ │ └── bionemo │ │ └── llm │ │ ├── __init__.py │ │ ├── data │ │ ├── test_collate.py │ │ ├── test_datamodule.py │ │ └── test_masking.py │ │ ├── model │ │ ├── biobert │ │ │ └── test_transformer_specs.py │ │ ├── test_loss.py │ │ └── test_lr_scheduler.py │ │ ├── test_lightning.py │ │ └── utils │ │ ├── __init__.py │ │ ├── test_callbacks.py │ │ ├── test_datamodule_utils.py │ │ ├── test_iomixin_utils.py │ │ ├── test_logger_utils.py │ │ └── test_megatron_utils.py ├── bionemo-moco │ ├── LICENSE │ ├── README.md │ ├── VERSION │ ├── blog.md │ ├── documentation.md │ ├── environment │ │ ├── Instructions.md │ │ ├── clone_bionemo_moco.sh │ │ ├── moco_env.yaml │ │ └── setup.sh │ ├── examples │ │ ├── continuous_data_interpolant_tutorial_cfm.ipynb │ │ ├── continuous_data_interpolant_tutorial_ddpm.ipynb │ │ ├── continuous_data_interpolant_tutorial_vdm.ipynb │ │ ├── discrete_data_interpolant_tutorial.ipynb │ │ └── ot_sampler_tutorial.ipynb │ ├── figures │ │ └── model_figure.png │ ├── pyproject.toml │ ├── scripts │ │ ├── README.md │ │ ├── clean_documentation.py │ │ └── create_documentation.sh │ ├── src │ │ └── bionemo │ │ │ └── moco │ │ │ ├── __init__.py │ │ │ ├── distributions │ │ │ ├── __init__.py │ │ │ ├── prior │ │ │ │ ├── __init__.py │ │ │ │ ├── continuous │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── gaussian.py │ │ │ │ │ ├── harmonic.py │ │ │ │ │ └── utils.py │ │ │ │ ├── discrete │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── custom.py │ │ │ │ │ ├── mask.py │ │ │ │ │ └── uniform.py │ │ │ │ └── distribution.py │ │ │ └── time │ │ │ │ ├── __init__.py │ │ │ │ ├── beta.py │ │ │ │ ├── distribution.py │ │ │ │ ├── logit_normal.py │ │ │ │ ├── uniform.py │ │ │ │ └── utils.py │ │ │ ├── interpolants │ │ │ ├── __init__.py │ │ │ ├── base_interpolant.py │ │ │ ├── batch_augmentation.py │ │ │ ├── continuous_time │ │ │ │ ├── __init__.py │ │ │ │ ├── continuous │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── continuous_flow_matching.py │ │ │ │ │ ├── data_augmentation │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── 
augmentation_types.py │ │ │ │ │ │ ├── equivariant_ot_sampler.py │ │ │ │ │ │ ├── kabsch_augmentation.py │ │ │ │ │ │ └── ot_sampler.py │ │ │ │ │ └── vdm.py │ │ │ │ └── discrete │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── discrete_flow_matching.py │ │ │ │ │ └── mdlm.py │ │ │ └── discrete_time │ │ │ │ ├── __init__.py │ │ │ │ ├── continuous │ │ │ │ ├── __init__.py │ │ │ │ └── ddpm.py │ │ │ │ ├── discrete │ │ │ │ ├── __init__.py │ │ │ │ └── d3pm.py │ │ │ │ └── utils.py │ │ │ ├── schedules │ │ │ ├── __init__.py │ │ │ ├── inference_time_schedules.py │ │ │ ├── noise │ │ │ │ ├── __init__.py │ │ │ │ ├── continuous_noise_transforms.py │ │ │ │ ├── continuous_snr_transforms.py │ │ │ │ └── discrete_noise_schedules.py │ │ │ └── utils.py │ │ │ └── testing │ │ │ ├── __init__.py │ │ │ └── parallel_test_utils.py │ └── tests │ │ └── bionemo │ │ └── moco │ │ ├── distributions │ │ ├── prior │ │ │ ├── continuous │ │ │ │ ├── test_gaussian.py │ │ │ │ └── test_harmonic.py │ │ │ └── discrete │ │ │ │ ├── test_custom.py │ │ │ │ ├── test_mask.py │ │ │ │ └── test_uniform.py │ │ └── time │ │ │ └── test_time_distribution.py │ │ ├── interpolants │ │ ├── continuous_time │ │ │ ├── continuous │ │ │ │ ├── test_continuous_flow_matching.py │ │ │ │ ├── test_continuous_flow_matching_parallel.py │ │ │ │ ├── test_optimal_transport.py │ │ │ │ ├── test_vdm.py │ │ │ │ └── test_vdm_parallel.py │ │ │ └── discrete │ │ │ │ ├── test_discrete_flow_matching.py │ │ │ │ ├── test_discrete_flow_matching_parallel.py │ │ │ │ ├── test_mdlm.py │ │ │ │ └── test_mdlm_parallel.py │ │ └── discrete_time │ │ │ ├── continuous │ │ │ ├── test_ddpm.py │ │ │ └── test_ddpm_parallel.py │ │ │ └── discrete │ │ │ ├── test_d3pm.py │ │ │ └── test_d3pm_parallel.py │ │ ├── schedules │ │ ├── noise │ │ │ ├── test_continuous_noise_transforms.py │ │ │ ├── test_continuous_snr_transforms.py │ │ │ └── test_discrete_noise_schedule.py │ │ └── test_inference_schedules.py │ │ └── test_env.py ├── bionemo-noodles │ ├── .gitignore │ ├── Cargo.lock │ ├── Cargo.toml 
│ ├── LICENSE │ ├── README.md │ ├── VERSION │ ├── pyproject.toml │ ├── requirements.txt │ ├── rust │ │ └── src │ │ │ └── lib.rs │ ├── src │ │ └── bionemo │ │ │ └── noodles │ │ │ ├── __init__.py │ │ │ └── nvfaidx.py │ └── tests │ │ └── bionemo │ │ └── noodles │ │ ├── data │ │ ├── bad_index.fasta │ │ ├── bad_index.fasta.fai │ │ ├── dupes.fasta │ │ ├── sample.fasta │ │ └── sample.fasta.fai │ │ ├── test_nvfaidx.py │ │ └── test_sequence_ops.py ├── bionemo-scdl │ ├── LICENSE │ ├── README.md │ ├── VERSION │ ├── assets │ │ ├── disk_space.png │ │ └── throughput.png │ ├── examples │ │ └── example_notebook.ipynb │ ├── pyproject.toml │ ├── src │ │ └── bionemo │ │ │ └── scdl │ │ │ ├── __init__.py │ │ │ ├── api │ │ │ ├── __init__.py │ │ │ └── single_cell_row_dataset.py │ │ │ ├── index │ │ │ ├── __init__.py │ │ │ └── row_feature_index.py │ │ │ ├── io │ │ │ ├── __init__.py │ │ │ ├── single_cell_collection.py │ │ │ └── single_cell_memmap_dataset.py │ │ │ ├── scripts │ │ │ ├── __init__.py │ │ │ └── convert_h5ad_to_scdl.py │ │ │ └── util │ │ │ ├── __init__.py │ │ │ ├── async_worker_queue.py │ │ │ ├── filecopyutil.py │ │ │ └── torch_dataloader_utils.py │ └── tests │ │ └── bionemo │ │ └── scdl │ │ ├── conftest.py │ │ ├── index │ │ └── test_row_feature_index.py │ │ ├── io │ │ ├── test_single_cell_collection.py │ │ └── test_single_cell_memmap_dataset.py │ │ └── util │ │ ├── test_async_worker_queue.py │ │ └── test_torch_dataloader_utils.py ├── bionemo-size-aware-batching │ ├── LICENSE │ ├── README.md │ ├── VERSION │ ├── pyproject.toml │ ├── requirements.txt │ ├── src │ │ └── bionemo │ │ │ └── size_aware_batching │ │ │ ├── __init__.py │ │ │ ├── sampler.py │ │ │ └── utils.py │ └── tests │ │ └── bionemo │ │ └── size_aware_batching │ │ ├── conftest.py │ │ ├── test_sampler.py │ │ └── test_utils.py ├── bionemo-testing │ ├── LICENSE │ ├── README.md │ ├── VERSION │ ├── pyproject.toml │ ├── src │ │ └── bionemo │ │ │ └── testing │ │ │ ├── __init__.py │ │ │ ├── callbacks.py │ │ │ ├── data │ │ │ ├── 
__init__.py │ │ │ ├── esm2.py │ │ │ ├── fasta.py │ │ │ ├── load.py │ │ │ └── resource.py │ │ │ ├── harnesses │ │ │ ├── __init__.py │ │ │ ├── mode.py │ │ │ └── stop_and_go.py │ │ │ ├── lightning.py │ │ │ ├── megatron_dataset_compatibility.py │ │ │ ├── megatron_parallel_state_utils.py │ │ │ ├── subprocess_utils.py │ │ │ ├── testing_callbacks.py │ │ │ ├── torch.py │ │ │ └── utils.py │ └── tests │ │ └── bionemo │ │ └── testing │ │ ├── data │ │ └── test_fasta.py │ │ ├── test_megatron_dataset_compatibility.py │ │ └── test_megatron_parallel_state_utils.py └── bionemo-webdatamodule │ ├── LICENSE │ ├── README.md │ ├── VERSION │ ├── pyproject.toml │ ├── src │ └── bionemo │ │ └── webdatamodule │ │ ├── __init__.py │ │ ├── datamodule.py │ │ └── utils.py │ └── tests │ └── bionemo │ └── webdatamodule │ ├── __init__.py │ ├── conftest.py │ └── test_datamodule.py ├── tach.toml └── uv.lock /.devcontainer/initializeCommand.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Create the mounted config directories if they don't already exist 3 | 4 | mkdir -p ~/.aws 5 | mkdir -p ~/.ngc 6 | mkdir -p ~/.cache 7 | mkdir -p ~/.ssh 8 | [ ! -f ~/.netrc ] && touch ~/.netrc 9 | 10 | # Create the ~/.bash_history_devcontainer file if it doesn't exist 11 | [ ! 
-f ~/.bash_history_devcontainer ] && touch ~/.bash_history_devcontainer 12 | 13 | exit 0 14 | -------------------------------------------------------------------------------- /.devcontainer/postCreateCommand.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for sub in ./3rdparty/*/ ./sub-packages/bionemo-*/; do 4 | uv pip install --no-deps --no-build-isolation --editable $sub 5 | done 6 | -------------------------------------------------------------------------------- /.github/codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | require_ci_to_pass: false 3 | 4 | coverage: 5 | status: 6 | project: 7 | default: 8 | target: auto 9 | threshold: 5 10 | 11 | comment: 12 | layout: "diff, flags, files" 13 | behavior: default 14 | require_changes: false # if true: only post the comment if coverage changes 15 | -------------------------------------------------------------------------------- /.github/copy-pr-bot.yaml: -------------------------------------------------------------------------------- 1 | enabled: true 2 | auto_sync_draft: false 3 | auto_sync_ready: true 4 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | updates: 4 | - package-ecosystem: "gitsubmodule" 5 | schedule: 6 | interval: "weekly" 7 | directory: "/" 8 | target-branch: "main" 9 | open-pull-requests-limit: 2 10 | reviewers: 11 | - "pstjohn" 12 | - "jstjohn" 13 | - package-ecosystem: "docker" 14 | directory: "/" 15 | target-branch: "main" 16 | open-pull-requests-limit: 1 17 | schedule: 18 | interval: "weekly" 19 | reviewers: 20 | - "pstjohn" 21 | - "dorotat-nv" 22 | - "trvachov" 23 | -------------------------------------------------------------------------------- /.github/workflows/gh-docs-deploy.yml: 
-------------------------------------------------------------------------------- 1 | name: gh-pages-docs-deploy 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | merge_group: 9 | types: [checks_requested] 10 | 11 | jobs: 12 | build-and-deploy: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: actions/setup-python@v5 17 | with: 18 | python-version: 3.x 19 | - name: Cache dependencies 20 | uses: actions/cache@v4 21 | with: 22 | path: ~/.cache/pip 23 | key: ${{ runner.os }}-pip-${{ hashFiles('docs/requirements.txt') }} 24 | restore-keys: | 25 | ${{ runner.os }}-pip- 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install -r docs/requirements.txt 30 | - name: Build site 31 | run: mkdocs build 32 | working-directory: docs 33 | - name: Configure Git Credentials 34 | if: github.event_name == 'push' 35 | run: | 36 | git config user.name github-actions[bot] 37 | git config user.email 41898282+github-actions[bot]@users.noreply.github.com 38 | - name: Deploy 39 | if: github.event_name == 'push' 40 | run: mkdocs gh-deploy --force 41 | working-directory: docs 42 | -------------------------------------------------------------------------------- /.github/workflows/internal_tools.yml: -------------------------------------------------------------------------------- 1 | name: Install internal tools Python packages & run test suite 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | paths: 7 | - internal/infra-bionemo/** 8 | 9 | jobs: 10 | infra-bionemo: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | with: 15 | fetch-depth: 0 16 | submodules: "recursive" 17 | - uses: actions/setup-python@v5 18 | with: 19 | python-version: "3.10" 20 | cache: "pip" 21 | - run: pip install -r requirements-dev.txt -r requirements-test.txt 22 | - run: pip install internal/infra-bionemo 23 | - run: cd internal/infra-bionemo && pytest -v 
--cov=infra_bionemo --cov-report=term . 24 | -------------------------------------------------------------------------------- /.github/workflows/trufflehog.yml: -------------------------------------------------------------------------------- 1 | name: TruffleHog Scan 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | merge_group: 9 | 10 | permissions: 11 | contents: read 12 | id-token: write 13 | issues: write 14 | pull-requests: write 15 | 16 | jobs: 17 | TruffleHog: 18 | runs-on: ubuntu-latest 19 | defaults: 20 | run: 21 | shell: bash 22 | steps: 23 | - name: Checkout code 24 | uses: actions/checkout@v4 25 | with: 26 | fetch-depth: 0 27 | 28 | - name: TruffleHog OSS 29 | if: github.event_name != 'merge_group' 30 | id: trufflehog 31 | uses: trufflesecurity/trufflehog@main 32 | continue-on-error: true 33 | with: 34 | extra_args: --results=verified,unknown 35 | 36 | - name: Scan Results Status 37 | if: steps.trufflehog.outcome == 'failure' 38 | run: exit 1 39 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "3rdparty/Megatron-LM"] 2 | path = 3rdparty/Megatron-LM 3 | url = https://github.com/NVIDIA/Megatron-LM.git 4 | [submodule "3rdparty/NeMo"] 5 | path = 3rdparty/NeMo 6 | url = https://github.com/NVIDIA/NeMo.git 7 | -------------------------------------------------------------------------------- /.nspect-allowlist.toml: -------------------------------------------------------------------------------- 1 | version = "1.0.0" 2 | 3 | [oss] 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: end-of-file-fixer 6 | - id: trailing-whitespace 7 | - id: check-yaml 8 | 
exclude: "mkdocs.yml" 9 | - repo: https://github.com/astral-sh/ruff-pre-commit 10 | rev: v0.9.10 11 | hooks: 12 | - id: ruff 13 | # 1. Attempt to automatically fix any lint issues. 14 | args: ["--fix"] 15 | - id: ruff-format 16 | - repo: https://github.com/Yelp/detect-secrets 17 | rev: v1.5.0 18 | hooks: 19 | - id: detect-secrets 20 | name: detect-secrets (everything but notebooks) 21 | args: ['--baseline', '.secrets.baseline', '--exclude-files', '(.*\.ipynb|.*\.baseline)$', ] 22 | exclude: package.lock.json 23 | - id: detect-secrets 24 | name: detect-secrets (notebooks only) 25 | args: ['--baseline', '.secrets-nb.baseline', '--exclude-files', '^.(?!.*\.ipynb)', '--exclude-lines', '"(hash|id|image/\w+)":.*|<.*at 0x[0-9a-f]+>|object at 0x[0-9a-f]+', ] 26 | - repo: local 27 | hooks: 28 | - id: license-header-check 29 | name: Run license-check script 30 | entry: python internal/infra-bionemo/src/infra_bionemo/license_check.py -c scripts -c sub-packages -c docs -c internal --license-header ./license_header --modify 31 | language: python 32 | additional_dependencies: ["click==8.1.7"] 33 | pass_filenames: false 34 | always_run: true 35 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "cSpell.words": [ 3 | "adata", 4 | "allclose", 5 | "bionemo", 6 | "boto", 7 | "botocore", 8 | "docstrings", 9 | "dtype", 10 | "Knowledgebase", 11 | "NBVAL", 12 | "nemo", 13 | "ngcsdk", 14 | "pbss", 15 | "platformdirs", 16 | "pretraining", 17 | "pydantic", 18 | "rampup", 19 | "Resampler", 20 | "resamplers", 21 | "singlecell", 22 | "tqdm", 23 | "uniref", 24 | "upsampling" 25 | ], 26 | "editor.rulers": [ 27 | 120 28 | ], 29 | "autoDocstring.docstringFormat": "google-notypes" 30 | } 31 | -------------------------------------------------------------------------------- /CODE-REVIEW.md: 
-------------------------------------------------------------------------------- 1 | docs/docs/main/contributing/code-review.md -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | docs/docs/main/contributing/contributing.md -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | ## Security 2 | 3 | NVIDIA is dedicated to the security and trust of our software products and services, including all source code repositories managed through our organization. 4 | 5 | If you need to report a security issue, please use the appropriate contact points outlined below. **Please do not report security vulnerabilities through GitHub/GitLab.** 6 | 7 | ## Reporting Potential Security Vulnerability in an NVIDIA Product 8 | 9 | To report a potential security vulnerability in any NVIDIA product: 10 | - Web: [Security Vulnerability Submission Form](https://www.nvidia.com/object/submit-security-vulnerability.html) 11 | - E-Mail: psirt@nvidia.com 12 | - We encourage you to use the following PGP key for secure email communication: [NVIDIA public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key) 13 | - Please include the following information: 14 | - Product/Driver name and version/branch that contains the vulnerability 15 | - Type of vulnerability (code execution, denial of service, buffer overflow, etc.) 16 | - Instructions to reproduce the vulnerability 17 | - Proof-of-concept or exploit code 18 | - Potential impact of the vulnerability, including how an attacker could exploit the vulnerability 19 | 20 | While NVIDIA currently does not have a bug bounty program, we do offer acknowledgement when an externally reported security issue is addressed under our coordinated vulnerability disclosure policy. 
Please visit our [Product Security Incident Response Team (PSIRT)](https://www.nvidia.com/en-us/security/psirt-policies/) policies page for more information. 21 | 22 | ## NVIDIA Product Security 23 | 24 | For all security-related concerns, please visit NVIDIA's Product Security portal at https://www.nvidia.com/en-us/security 25 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 2.6 2 | -------------------------------------------------------------------------------- /ci/docker/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Get host UID/GID from environment variables. These need to be passed in explicitly when invoking the container; i.e., 5 | # docker run -e HOST_UID=$(id -u) -e HOST_GID=$(id -g) ... 6 | HOST_UID=${HOST_UID:-1000} 7 | HOST_GID=${HOST_GID:-1000} 8 | 9 | # Update the UID/GID of the container user 10 | groupmod -g "$HOST_GID" bionemo > /dev/null 11 | usermod -u "$HOST_UID" bionemo > /dev/null 12 | 13 | # Execute the main container command 14 | exec gosu bionemo "$@" 15 | -------------------------------------------------------------------------------- /ci/scripts/static_checks.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -xueo pipefail 4 | 5 | REPOSITORY_ROOT=$(git rev-parse --show-toplevel) 6 | cd "$REPOSITORY_ROOT" 7 | 8 | echo "Running ruff checks" 9 | set +e 10 | ruff check scripts/ sub-packages/ docs/ 11 | E_RUFF_CHECK="$?" 12 | set -e 13 | 14 | echo "Running tach checks" 15 | set +e 16 | tach check 17 | E_TACH_CHECK="$?" 18 | set -e 19 | 20 | echo "Running pre-commit checks" 21 | set +e 22 | pre-commit run --all-files --show-diff-on-failure --color always 23 | E_PRE_COMMIT="$?" 
24 | set -e 25 | 26 | set +e 27 | ANY_FAILURE=0 28 | if [[ "${E_PRE_COMMIT}" != "0" ]]; then 29 | ANY_FAILURE=1 30 | echo "ERROR: pre-commit hooks failed! (exit: ${E_PRE_COMMIT})" 31 | fi 32 | if [[ "${E_RUFF_CHECK}" != "0" ]]; then 33 | ANY_FAILURE=1 34 | echo "ERROR: ruff check failed! (exit: ${E_RUFF_CHECK})" 35 | fi 36 | if [[ "${E_TACH_CHECK}" != "0" ]]; then 37 | ANY_FAILURE=1 38 | echo "ERROR: tach check failed! (exit: ${E_TACH_CHECK})" 39 | fi 40 | if [[ "${ANY_FAILURE}" != "0" ]]; then 41 | exit 1 42 | else 43 | exit 0 44 | fi 45 | -------------------------------------------------------------------------------- /docs/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM squidfunk/mkdocs-material:latest 2 | 3 | # Install plugins. 4 | RUN apk add gcc python3-dev musl-dev linux-headers 5 | COPY docs/requirements.txt /tmp/ 6 | RUN pip install --disable-pip-version-check --no-cache-dir -r /tmp/requirements.txt 7 | -------------------------------------------------------------------------------- /docs/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | # FIXME: remove this notebook from ignore when this issue is fixed: https://github.com/NVIDIA/bionemo-framework/issues/778 18 | collect_ignore = ["docs/user-guide/examples/bionemo-geneformer/geneformer_cellxgene_tutorial.ipynb"] 19 | -------------------------------------------------------------------------------- /docs/docs/SUMMARY.md: -------------------------------------------------------------------------------- 1 | - [Home](index.md) 2 | - [BioNeMo Documentation](main/) 3 | - [Models](models/) 4 | -------------------------------------------------------------------------------- /docs/docs/assets/css/chatbot.css: -------------------------------------------------------------------------------- 1 | /* match styles of llm_bot Chatbot icon */ 2 | img.open-icon { 3 | max-height: 20px !important; 4 | max-width: 20px !important; 5 | } 6 | 7 | .open-icon { 8 | border: none; 9 | max-width: 22px; 10 | margin-right: 8px; 11 | } 12 | 13 | /* temporarily make invisible to test in production. 
14 | turn on in going md-container > md-main > md-content > article */ 15 | #chatbot > button { 16 | display: block; 17 | } 18 | -------------------------------------------------------------------------------- /docs/docs/assets/css/fonts.css: -------------------------------------------------------------------------------- 1 | @font-face { 2 | font-family: "NVIDIA Sans"; 3 | font-style: normal; 4 | src: url(https://brand-assets.cne.ngc.nvidia.com/assets/fonts/nvidia-sans/1.0.0/NVIDIASans_Lt.woff2); 5 | font-weight: 300; 6 | } 7 | 8 | @font-face { 9 | font-family: "NVIDIA Sans"; 10 | font-style: italic; 11 | src: url(https://brand-assets.cne.ngc.nvidia.com/assets/fonts/nvidia-sans/1.0.0/NVIDIASans_LtIt.woff2); 12 | font-weight: 300; 13 | } 14 | 15 | @font-face { 16 | font-family: "NVIDIA Sans"; 17 | font-style: normal; 18 | src: url(https://brand-assets.cne.ngc.nvidia.com/assets/fonts/nvidia-sans/1.0.0/NVIDIASans_Rg.woff2); 19 | font-weight: normal; 20 | } 21 | 22 | @font-face { 23 | font-family: "NVIDIA Sans"; 24 | font-style: italic; 25 | src: url(https://brand-assets.cne.ngc.nvidia.com/assets/fonts/nvidia-sans/1.0.0/NVIDIASans_It.woff2); 26 | font-weight: normal; 27 | } 28 | 29 | @font-face { 30 | font-family: "NVIDIA Sans"; 31 | font-style: normal; 32 | src: url(https://brand-assets.cne.ngc.nvidia.com/assets/fonts/nvidia-sans/1.0.0/NVIDIASans_Bd.woff2); 33 | font-weight: bold; 34 | } 35 | 36 | @font-face { 37 | font-family: "NVIDIA Sans"; 38 | font-style: italic; 39 | src: url(https://brand-assets.cne.ngc.nvidia.com/assets/fonts/nvidia-sans/1.0.0/NVIDIASans_BdIt.woff2); 40 | font-weight: bold; 41 | } 42 | -------------------------------------------------------------------------------- /docs/docs/assets/css/jupyter-themes.css: -------------------------------------------------------------------------------- 1 | /* theme: light */ 2 | body[data-md-color-scheme="light"] .jupyter-notebook { 3 | --jp-cell-editor-background: #f7f7f7; 4 | 
--jp-cell-editor-border-color: #cfcfcf; 5 | --jp-cell-prompt-fg-color: #303030; 6 | --jp-cell-prompt-bg-color: #f0f0f0; 7 | --jp-notebook-background: #ffffff; 8 | --jp-layout-color1: #ffffff; 9 | --jp-content-font-color1: #000000; 10 | } 11 | 12 | /* theme: dark */ 13 | body[data-md-color-scheme="dark"] .jupyter-notebook { 14 | --jp-cell-editor-background: #2b2b2b; 15 | --jp-cell-editor-border-color: #464646; 16 | --jp-cell-prompt-fg-color: #d7d7d7; 17 | --jp-cell-prompt-bg-color: #333333; 18 | --jp-notebook-background: #1e1e1e; 19 | --jp-layout-color1: #1e1e1e; 20 | --jp-content-font-color1: #d4d4d4; 21 | } 22 | -------------------------------------------------------------------------------- /docs/docs/assets/images/amplify/training_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/amplify/training_loss.png -------------------------------------------------------------------------------- /docs/docs/assets/images/amplify/validation_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/amplify/validation_loss.png -------------------------------------------------------------------------------- /docs/docs/assets/images/amplify/validation_ppl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/amplify/validation_ppl.png -------------------------------------------------------------------------------- /docs/docs/assets/images/esm2/esm2_device_scaling.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/esm2/esm2_device_scaling.png -------------------------------------------------------------------------------- /docs/docs/assets/images/esm2/esm2_model_scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/esm2/esm2_model_scaling.png -------------------------------------------------------------------------------- /docs/docs/assets/images/esm2/esm2_peft_memory_usage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/esm2/esm2_peft_memory_usage.png -------------------------------------------------------------------------------- /docs/docs/assets/images/esm2/esm2_peft_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/esm2/esm2_peft_time.png -------------------------------------------------------------------------------- /docs/docs/assets/images/esm2/esm2_pretrain_convergence.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/esm2/esm2_pretrain_convergence.png -------------------------------------------------------------------------------- /docs/docs/assets/images/esm2/esm2_single_node_training_perf.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/esm2/esm2_single_node_training_perf.png -------------------------------------------------------------------------------- /docs/docs/assets/images/evo2/evo2_bionemo_1b_6950steps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/evo2/evo2_bionemo_1b_6950steps.png -------------------------------------------------------------------------------- /docs/docs/assets/images/evo2/evo2_bionemo_7bnv_28ksteps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/evo2/evo2_bionemo_7bnv_28ksteps.png -------------------------------------------------------------------------------- /docs/docs/assets/images/evo2/evo2_performance_by_cluster_size.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/evo2/evo2_performance_by_cluster_size.png -------------------------------------------------------------------------------- /docs/docs/assets/images/evo2/evo2_savanna_1b_6950steps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/evo2/evo2_savanna_1b_6950steps.png -------------------------------------------------------------------------------- /docs/docs/assets/images/evo2/evo2_savanna_7b_28ksteps.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/evo2/evo2_savanna_7b_28ksteps.png -------------------------------------------------------------------------------- /docs/docs/assets/images/evo2/evo2_vs_7b_40b_performance_vs_context_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/evo2/evo2_vs_7b_40b_performance_vs_context_length.png -------------------------------------------------------------------------------- /docs/docs/assets/images/evo2/evo2_vs_llama2_performance_vs_context_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/evo2/evo2_vs_llama2_performance_vs_context_length.png -------------------------------------------------------------------------------- /docs/docs/assets/images/evo2/evo2_zeroshot_brca1_stripplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/evo2/evo2_zeroshot_brca1_stripplot.png -------------------------------------------------------------------------------- /docs/docs/assets/images/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/favicon.png -------------------------------------------------------------------------------- /docs/docs/assets/images/geneformer/F1-score-models.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/geneformer/F1-score-models.png -------------------------------------------------------------------------------- /docs/docs/assets/images/geneformer/Geneformer_steven_106m_train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/geneformer/Geneformer_steven_106m_train.png -------------------------------------------------------------------------------- /docs/docs/assets/images/geneformer/Geneformer_steven_106m_val.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/geneformer/Geneformer_steven_106m_val.png -------------------------------------------------------------------------------- /docs/docs/assets/images/geneformer/accuracy-models-04-18-2025.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/geneformer/accuracy-models-04-18-2025.png -------------------------------------------------------------------------------- /docs/docs/assets/images/geneformer/average-accuracy-models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/geneformer/average-accuracy-models.png -------------------------------------------------------------------------------- /docs/docs/assets/images/geneformer/f1-score-models-04-18-2025.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/geneformer/f1-score-models-04-18-2025.png -------------------------------------------------------------------------------- /docs/docs/assets/images/geneformer/geneformer_106m_train_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/geneformer/geneformer_106m_train_loss.png -------------------------------------------------------------------------------- /docs/docs/assets/images/geneformer/geneformer_106m_val_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/geneformer/geneformer_106m_val_loss.png -------------------------------------------------------------------------------- /docs/docs/assets/images/geneformer/geneformer_10m_training_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/geneformer/geneformer_10m_training_loss.png -------------------------------------------------------------------------------- /docs/docs/assets/images/geneformer/geneformer_10m_val_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/geneformer/geneformer_10m_val_loss.png -------------------------------------------------------------------------------- /docs/docs/assets/images/geneformer/loss_curve_new_v_old_geneformer_64_node_10M.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/geneformer/loss_curve_new_v_old_geneformer_64_node_10M.png -------------------------------------------------------------------------------- /docs/docs/assets/images/geneformer/model_tflops_per_gpu_chart_geneformer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/geneformer/model_tflops_per_gpu_chart_geneformer.png -------------------------------------------------------------------------------- /docs/docs/assets/images/megatron_background/README.md: -------------------------------------------------------------------------------- 1 | NOTE: these images are from https://nvidia.sharepoint.com/:p:/r/sites/PixelsManagementTeams/_layouts/15/doc2.aspx?sourcedoc=%7BD1FC26B6-A366-4D1E-8595-A9D3CD3A0D71%7D&file=Pixels_SW_Team_Meeting_2024_09_05.pptx&action=edit&mobileredirect=true&DefaultItemOpen=1 2 | -------------------------------------------------------------------------------- /docs/docs/assets/images/megatron_background/data_parallelism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/megatron_background/data_parallelism.png -------------------------------------------------------------------------------- /docs/docs/assets/images/megatron_background/execution_schedulers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/megatron_background/execution_schedulers.png -------------------------------------------------------------------------------- 
/docs/docs/assets/images/megatron_background/fsdp_slide1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/megatron_background/fsdp_slide1.png -------------------------------------------------------------------------------- /docs/docs/assets/images/megatron_background/fsdp_slide2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/megatron_background/fsdp_slide2.png -------------------------------------------------------------------------------- /docs/docs/assets/images/megatron_background/pipeline_parallelism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/megatron_background/pipeline_parallelism.png -------------------------------------------------------------------------------- /docs/docs/assets/images/megatron_background/sp_korthikanti_2022_fig5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/megatron_background/sp_korthikanti_2022_fig5.png -------------------------------------------------------------------------------- /docs/docs/assets/images/megatron_background/tensor_and_pipeline_parallelism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/megatron_background/tensor_and_pipeline_parallelism.png 
-------------------------------------------------------------------------------- /docs/docs/assets/images/megatron_background/tensor_parallelism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/megatron_background/tensor_parallelism.png -------------------------------------------------------------------------------- /docs/docs/assets/images/sub_package_graphs/dependency_file_imports.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/sub_package_graphs/dependency_file_imports.png -------------------------------------------------------------------------------- /docs/docs/assets/images/sub_package_graphs/dependency_graph_pyproject.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/sub_package_graphs/dependency_graph_pyproject.png -------------------------------------------------------------------------------- /docs/docs/assets/images/sub_package_graphs/dependency_graph_tach.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/sub_package_graphs/dependency_graph_tach.png -------------------------------------------------------------------------------- /docs/docs/assets/images/wandb_tips_tricks/trainer_global_step.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/images/wandb_tips_tricks/trainer_global_step.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/.gitkeep -------------------------------------------------------------------------------- /docs/docs/assets/old_images/MMB_molecule_generation_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/MMB_molecule_generation_1.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/MMB_molecule_generation_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/MMB_molecule_generation_2.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/MMB_molecule_generation_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/MMB_molecule_generation_3.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/MMB_molecule_generation_4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/MMB_molecule_generation_4.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/MMB_molecule_generation_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/MMB_molecule_generation_5.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/MolMIM_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/MolMIM_model.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/MolMIM_molecule_generation_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/MolMIM_molecule_generation_1.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/MolMIM_molecule_generation_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/MolMIM_molecule_generation_2.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/bcp_snapshot_.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/bcp_snapshot_.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/bcp_snapshot_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/bcp_snapshot_1.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/bcp_snapshot_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/bcp_snapshot_2.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/bcp_snapshot_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/bcp_snapshot_3.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/bionemo_overview_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/bionemo_overview_1.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/bionemo_overview_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/bionemo_overview_2.png 
-------------------------------------------------------------------------------- /docs/docs/assets/old_images/cellxgene/num_cells_by_assay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/cellxgene/num_cells_by_assay.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/cellxgene/num_cells_by_dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/cellxgene/num_cells_by_dataset.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/cellxgene/num_genes_measured_by_assay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/cellxgene/num_genes_measured_by_assay.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/cellxgene/pct_cells_by_age.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/cellxgene/pct_cells_by_age.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/cellxgene/pct_cells_by_ethnicity_category.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/cellxgene/pct_cells_by_ethnicity_category.png 
-------------------------------------------------------------------------------- /docs/docs/assets/old_images/cellxgene/pct_cells_by_sex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/cellxgene/pct_cells_by_sex.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/cellxgene/pct_cells_by_tissue_category.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/cellxgene/pct_cells_by_tissue_category.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/cellxgene/top9_datasets_tissue_distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/cellxgene/top9_datasets_tissue_distribution.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/diffdock_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/diffdock_1.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/diffdock_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/diffdock_2.png -------------------------------------------------------------------------------- 
/docs/docs/assets/old_images/diffdock_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/diffdock_3.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/diffdock_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/diffdock_4.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/diffdock_fw_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/diffdock_fw_overview.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/equidock_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/equidock_1.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/equidock_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/equidock_2.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/equidock_3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/equidock_3.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/equidock_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/equidock_4.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/esm1nv_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/esm1nv_1.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/esm1nv_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/esm1nv_2.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/esm1nv_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/esm1nv_3.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/esm1nv_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/esm1nv_4.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/mmb_1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/mmb_1.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/mmb_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/mmb_2.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/mmb_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/mmb_3.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/mmb_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/mmb_4.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/mmb_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/mmb_5.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/molmim-embedding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/molmim-embedding.png 
-------------------------------------------------------------------------------- /docs/docs/assets/old_images/molmim-hidden-state.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/molmim-hidden-state.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/molmim-predictive-modeling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/molmim-predictive-modeling.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/sc_fm/F1-score-models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/sc_fm/F1-score-models.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/sc_fm/average-accuracy-models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/sc_fm/average-accuracy-models.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/sc_fm/geneformer-106m-240530-val-train-loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/sc_fm/geneformer-106m-240530-val-train-loss.png 
-------------------------------------------------------------------------------- /docs/docs/assets/old_images/sc_fm/geneformer-10m-240530-val-train-loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/sc_fm/geneformer-10m-240530-val-train-loss.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/sc_fm/geneformer-240530-val-comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/sc_fm/geneformer-240530-val-comparison.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/sc_fm/model_tflops_per_gpu_chart_tight_layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/sc_fm/model_tflops_per_gpu_chart_tight_layout.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/wandai_charts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/wandai_charts.png -------------------------------------------------------------------------------- /docs/docs/assets/old_images/wandb-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/docs/docs/assets/old_images/wandb-dashboard.png 
-------------------------------------------------------------------------------- /docs/docs/main/SUMMARY.md: -------------------------------------------------------------------------------- 1 | - [About](about/) 2 | - [Get Started](getting-started/) 3 | - [Developer Guide](developer-guide/) 4 | - [Tutorials](examples/) 5 | - [Data Sets](datasets/) 6 | - [Contributing](contributing/) 7 | - [References](references/) 8 | -------------------------------------------------------------------------------- /docs/docs/main/about/SUMMARY.md: -------------------------------------------------------------------------------- 1 | - [Overview](overview.md) 2 | - [Background](background/) 3 | - [Release Notes](releasenotes-fw.md) 4 | -------------------------------------------------------------------------------- /docs/docs/main/about/background/SUMMARY.md: -------------------------------------------------------------------------------- 1 | - [NeMo2 Parallelism](nemo2.md) 2 | - [Megatron Dataset Considerations](megatron_datasets.md) 3 | -------------------------------------------------------------------------------- /docs/docs/main/contributing/Writing Documentation/mkdocs.md: -------------------------------------------------------------------------------- 1 | # MkDocs 2 | 3 | ## Build system 4 | 5 | BioNeMo 2 uses [Material for MkDocs](https://squidfunk.github.io/mkdocs-material/) to build its documentation. 6 | Docstrings are converted to automatically-generated API reference pages using `mkdocstrings`, and can be linked from 7 | markdown pages using [paths](https://mkdocstrings.github.io/usage/). 
8 | -------------------------------------------------------------------------------- /docs/docs/main/contributing/sub-package_dependency_graph.md: -------------------------------------------------------------------------------- 1 | ## Sub-Package Dependency Graph 2 | 3 | The script in `sub-packages/bionemo/fw/src/dependency_graph.py` generates a dependency graph for the BioNeMo sub-packages and verifies that the pyproject.toml and tach.toml files align and capture the dependencies needed for imports in the python files. Additionally, it checks dependencies between BioNeMo sub-packages and creates visual representations of the dependencies in pyproject.toml files, in tach.toml, and in the source files. 4 | 5 | These are visualizations of the dependency graph from the pyproject.toml files: 6 | 7 | Dependency Graph 8 | 9 | Similarly from the tach.toml file: 10 | 11 | Dependency Graph 12 | 13 | And these are the dependencies from the file imports: 14 | 15 | Dependency Graph 16 | -------------------------------------------------------------------------------- /docs/docs/main/datasets/index.md: -------------------------------------------------------------------------------- 1 | # BioNeMo Framework: Available Datasets 2 | 3 | The BioNeMo Framework provides access to a variety of high-quality datasets for bioinformatics and cheminformatics research. These datasets cover a range of biological and chemical modalities, supporting various research applications. 
The following table lists the currently available datasets: 4 | 5 | | **Dataset** | **Modality** | **Uses** | 6 | | -------------------------------------------------------- | -------------- | ------------------------------------------------ | 7 | | [CELLxGENE](./CELLxGENE.md) | Single Cell | Single-Cell Gene Expression | 8 | | [UniProt](./uniprot.md) | Protein | Protein Sequence and Function Analysis | 9 | 10 | For more information about the datasets included in the BioNeMo Framework, refer to the Dataset Cards linked in the table above or the original sources referenced in the respective dataset descriptions. 11 | -------------------------------------------------------------------------------- /docs/docs/main/developer-guide/SUMMARY.md: -------------------------------------------------------------------------------- 1 | - [bionemo-amplify](bionemo-amplify/bionemo-amplify-Overview.md) 2 | - [bionemo-core](bionemo-core/bionemo-core-Overview.md) 3 | - [bionemo-esm2](bionemo-esm2/bionemo-esm2-Overview.md) 4 | - [bionemo-evo2](bionemo-evo2/bionemo-evo2-Overview.md) 5 | - [bionemo-example-model](bionemo-example_model/bionemo-example_model-Overview.md) 6 | - [bionemo-fw](bionemo-fw/bionemo-fw-Overview.md) 7 | - [bionemo-geneformer](bionemo-geneformer/bionemo-geneformer-Overview.md) 8 | - [bionemo-geometric](bionemo-geometric/bionemo-geometric-Overview.md) 9 | - [bionemo-llm](bionemo-llm/bionemo-llm-Overview.md) 10 | - [bionemo-moco](bionemo-moco/bionemo-moco-Overview.md) 11 | - [bionemo-noodles](bionemo-noodles/bionemo-noodles-Overview.md) 12 | - [bionemo-scdl](bionemo-scdl/bionemo-scdl-Overview.md) 13 | - [bionemo-size-aware-batching](bionemo-size-aware-batching/bionemo-size-aware-batching-Overview.md) 14 | - [bionemo-testing](bionemo-testing/bionemo-testing-Overview.md) 15 | - [bionemo-webdatamodule](bionemo-webdatamodule/bionemo-webdatamodule-Overview.md) 16 | -------------------------------------------------------------------------------- 
/docs/docs/main/examples/.gitignore: -------------------------------------------------------------------------------- 1 | # IMPORTANT: This directory is reserved for examples that are automatically 2 | # generated by mkdocs. Please do not manually add files here. To add 3 | # examples, read the sub-packages section in root level docs README.md. 4 | 5 | # Ignore everything in this directory 6 | /* 7 | 8 | # Except the SUMMARY.md file, this .gitignore & the conftest.py file 9 | !.gitignore 10 | !conftest.py 11 | !SUMMARY.md 12 | -------------------------------------------------------------------------------- /docs/docs/main/examples/SUMMARY.md: -------------------------------------------------------------------------------- 1 | - [ESM-2](bionemo-esm2/) 2 | - [Evo2](bionemo-evo2/) 3 | - [Geneformer](bionemo-geneformer/) 4 | - [MoCo](bionemo-moco/) 5 | - [SCDL](bionemo-scdl/) 6 | -------------------------------------------------------------------------------- /docs/docs/main/examples/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | def pytest_collectstart(collector): 18 | if collector.fspath and collector.fspath.ext == ".ipynb": 19 | collector.skip_compare += ( 20 | "text/html", 21 | "application/javascript", 22 | "stderr", 23 | ) 24 | -------------------------------------------------------------------------------- /docs/docs/main/getting-started/SUMMARY.md: -------------------------------------------------------------------------------- 1 | - [Hardware and Software Prerequisites](pre-reqs.md) 2 | - [Access and Startup](access-startup.md) 3 | - [Initialization Guide](initialization-guide.md) 4 | - [Development](development.md) 5 | - [Training Models](training-models.md) 6 | -------------------------------------------------------------------------------- /docs/docs/main/references/API_reference/index.md: -------------------------------------------------------------------------------- 1 | # API reference 2 | 3 | The API reference contains detailed descriptions of all public functions and objects. It's the best place to look if you need information on a specific function. 4 | -------------------------------------------------------------------------------- /docs/docs/models/ESM-2/SUMMARY.md: -------------------------------------------------------------------------------- 1 | - [Model Overview](index.md) 2 | - [Pre-trained Checkpoints](pre-training.md) 3 | -------------------------------------------------------------------------------- /docs/docs/models/index.md: -------------------------------------------------------------------------------- 1 | # BioNeMo Framework: Available Models 2 | 3 | State-of-the-art models are continually integrated into the BioNeMo Framework. 
The BioNeMo Framework currently offers the following pre-trained models: 4 | 5 | | **Model** | **Modality** | **Uses** | 6 | | ----------------------------- | ------------ | ----------------------- | 7 | | [AMPLIFY](./amplify.md) | Protein | Representation Learning | 8 | | [ESM-2](./ESM-2/index.md) | Protein | Representation Learning | 9 | | [Evo2](./evo2.md) | DNA | Generative AI | 10 | | [Geneformer](./geneformer.md) | Single Cell | Representation Learning | 11 | 12 | For more information about the models included in BioNeMo Framework, refer to the Model Cards linked in the table above or the original publications referenced in the respective model descriptions. 13 | -------------------------------------------------------------------------------- /docs/overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | 4 | {% block outdated %} You're not viewing the latest version. 5 | 6 | Click here to go to latest. 7 | 8 | {% endblock %} {% block styles %} {{ super() }} 9 | 10 | {% endblock %} {% block scripts %} {{ super() }} 11 | 12 | {% endblock %} {% block content %} {{ super() }} 13 |
14 | {% endblock %} 15 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs-material 2 | mkdocs-macros-plugin 3 | mkdocs-minify-plugin 4 | mkdocstrings[python] 5 | mkdocs-gen-files 6 | pymdown-extensions 7 | mkdocs-jupyter 8 | mkdocs-include-dir-to-nav 9 | mkdocs-literate-nav 10 | mkdocs-site-urls 11 | mike 12 | -------------------------------------------------------------------------------- /internal/README_justfile.md: -------------------------------------------------------------------------------- 1 | To get started, first download [`just`](https://github.com/casey/just). You can use [Homebrew](https://brew.sh/) on OS X & Linux: 2 | ```bash 3 | brew install just 4 | ``` 5 | 6 | **Once you have `just`, you need to run the `just setup` command once _before_ you can run any other command.** 7 | Thus, if it's your first time, you will need to do this first: 8 | ```bash 9 | just setup 10 | just 11 | ``` 12 | 13 | You can see all of the commands for the development cycle by running `just`. These commands are executable as 14 | `just X` for each command `X` listed: 15 | ``` 16 | build-dev # Builds the development image. 17 | build-release # Builds the release image. 18 | run-dev cmd='bash' # Runs an interactive program in the development bionemo image. 19 | run-release cmd='bash' # Runs an interactive program in the release bionemo image. 20 | setup # Checks for installed programs (docker, git, etc.), their versions, and grabs the latest cache image. 21 | test # Executes pytest in the release image. 22 | ``` 23 | 24 | You can combine `just` commands together. For example, run `just build-dev build-release` to build both images. 
25 | -------------------------------------------------------------------------------- /internal/infra-bionemo/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | # For guidance, see: https://packaging.python.org/en/latest/guides/writing-pyproject-toml/ 6 | [project] 7 | name = "infra-bionemo" 8 | version = "0.1.0" 9 | authors = [{ name = "BioNeMo Team", email = "bionemofeedback@nvidia.com" }] 10 | description = "Internal library of utilities and programs for BioNeMo-related infrastructure." 11 | readme = "README.md" 12 | requires-python = ">=3.10" 13 | keywords = [] 14 | license = {file = "LICENSE"} 15 | classifiers = [ 16 | "Programming Language :: Python :: 3.10", 17 | "Private :: Do Not Upload", 18 | ] 19 | dependencies = [ 20 | "click>=8.1.7,<9.0.0", 21 | "tomli>=2.0.2", 22 | "tomli_w>=1.1.0", 23 | ] 24 | 25 | [project.scripts] 26 | license-check = "infra_bionemo.license_check:entrypoint" 27 | create-bionemo-project = "infra_bionemo.new_project.exe.bionemo_subpackage:entrypoint" 28 | create-py-project = "infra_bionemo.new_project.exe.simple:entrypoint" 29 | create-namespaced-project = "infra_bionemo.new_project.exe.namespace:entrypoint" 30 | 31 | [tool.pytest.ini_options] 32 | testpaths = ["tests"] 33 | filterwarnings = [ "ignore::DeprecationWarning",] 34 | 35 | [tool.coverage.run] 36 | source = ["infra_bionemo"] 37 | -------------------------------------------------------------------------------- /internal/infra-bionemo/setup.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | from setuptools import setup 18 | 19 | 20 | if __name__ == "__main__": 21 | setup() 22 | -------------------------------------------------------------------------------- /internal/infra-bionemo/src/infra_bionemo/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /internal/infra-bionemo/src/infra_bionemo/new_project/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /internal/infra-bionemo/src/infra_bionemo/new_project/exe/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /internal/infra-bionemo/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /internal/infra-bionemo/tests/test_infra_bionemo/test_new_project/test_utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | import io 18 | 19 | from pytest import raises 20 | 21 | from infra_bionemo.new_project.utils import ask_yes_or_no 22 | 23 | 24 | def test_ask_yes_or_no(monkeypatch): 25 | with raises(ValueError): 26 | ask_yes_or_no("") 27 | 28 | with monkeypatch.context() as ctx: 29 | ctx.setattr("sys.stdin", io.StringIO("y")) 30 | assert ask_yes_or_no("hello world?") 31 | 32 | with monkeypatch.context() as ctx: 33 | ctx.setattr("sys.stdin", io.StringIO("n")) 34 | assert not ask_yes_or_no("hello world?") 35 | 36 | with monkeypatch.context() as ctx: 37 | ctx.setattr("sys.stdin", io.StringIO("loop once\ny")) 38 | assert ask_yes_or_no("hello world?") 39 | -------------------------------------------------------------------------------- /internal/scripts/README.md: -------------------------------------------------------------------------------- 1 | # Scripts for commonly performed bionemo-framework actions. 2 | 3 | ## First Time Setup 4 | 5 | After cloning the repository, you need to run the setup script **first**: 6 | 7 | ```bash 8 | ./internal/scripts/setup_env_file.sh 9 | ``` 10 | 11 | This will return an exit code of 1 on a first time run. 
12 | 13 | ## Release Image Building 14 | 15 | To build the release image, run the following script: 16 | 17 | ```bash 18 | DOCKER_BUILDKIT=1 ./ci/scripts/build_docker_image.sh \ 19 | -regular-docker-builder \ 20 | -image-name "nvcr.io/nvidian/cvai_bnmo_trng/bionemo:bionemo2-$(git rev-parse HEAD)" 21 | ``` 22 | 23 | ## Development Image Building 24 | 25 | To build the development image, run the following script: 26 | 27 | ```bash 28 | ./internal/scripts/build_dev_image.sh 29 | ``` 30 | 31 | ## Interactive Shell in Development Image 32 | 33 | After building the development image, you can start a container from it and open a bash shell in it by executing: 34 | 35 | ```bash 36 | ./internal/scripts/run_dev.sh 37 | ``` 38 | 39 | ## Testing Locally 40 | 41 | Inside the development container, run `./ci/scripts/static_checks.sh` to validate that code changes will pass the code 42 | formatting and license checks run during CI. In addition, run the longer `./ci/scripts/run_pytest.sh` script to run unit 43 | tests for all sub-packages. 44 | -------------------------------------------------------------------------------- /internal/scripts/build_dev_image.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | 5 | COMMIT=$(git rev-parse HEAD) 6 | DATE=$(date --iso-8601=seconds -u) 7 | 8 | set -x 9 | DOCKER_BUILDKIT=1 docker buildx build \ 10 | -t "nvcr.io/nvidian/cvai_bnmo_trng/bionemo:dev-bionemo2-${COMMIT}" \ 11 | --target="development" \ 12 | --load \ 13 | --cache-from nvcr.io/nvidia/clara/bionemo-framework:nightly \ 14 | --cache-to type=inline \ 15 | --label com.nvidia.bionemo.git_sha=${COMMIT} \ 16 | --label com.nvidia.bionemo.created_at=${DATE} \ 17 | -f ./Dockerfile \ 18 | . 
19 | -------------------------------------------------------------------------------- /license_header: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /requirements-cve.txt: -------------------------------------------------------------------------------- 1 | onnx>=1.16.0 2 | setuptools>=78.1.1 # Addresses CVE https://github.com/advisories/GHSA-5rjg-fvgr-3xxf 3 | aiohttp>=3.9.4 4 | jupyterlab>=3.6.8 5 | jupyter_server>=2.14.1 # https://github.com/advisories/GHSA-hrw6-wg82-cm62 6 | Werkzeug>=3.0.3 7 | nltk>=3.9.1 8 | pillow>=10.3.0 9 | tornado>=6.5.0 # Addresses CVE https://github.com/advisories/GHSA-7cx3-6m66-7c5m 10 | wandb>=0.19.1 # Addresses CVE GHSA-v778-237x-gjrc 11 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | ruff==0.9.10 # Needs to match the version of ruff used in .pre-commit-config.yaml. 
2 | pre-commit==3.4.0 3 | virtualenv==20.26.6 4 | ipdb==0.13.11 5 | click==8.1.7 6 | tenacity==8.5.0 7 | tach>=0.9.0 8 | maturin==1.7.4 9 | -------------------------------------------------------------------------------- /requirements-test.txt: -------------------------------------------------------------------------------- 1 | pytest-cov==4.1.0 2 | pytest-timeout==2.2.0 3 | pytest-dependency==0.5.1 4 | testbook==0.4.2 5 | requests_mock==1.11.0 6 | # For SwiftStack access 7 | awscli==1.33.33 8 | nbval==0.11.0 9 | # For NvFaidx equivalence tests 10 | pyfaidx==0.8.1.3 11 | 12 | # Temporary pin for pytorch-lightning until megatron callbacks in ProgressPrinter can get fixed. 13 | # See https://nvidia.slack.com/archives/C02A7LYGHK8/p1734727482697309 14 | pytorch-lightning<2.5.0 15 | lightning<2.5.0 16 | -------------------------------------------------------------------------------- /sub-packages/bionemo-amplify/README.md: -------------------------------------------------------------------------------- 1 | # bionemo-amplify 2 | 3 | To install, execute the following: 4 | ```bash 5 | pip install -e . 6 | ``` 7 | 8 | To run unit tests, execute: 9 | ```bash 10 | pytest -v . 11 | ``` 12 | -------------------------------------------------------------------------------- /sub-packages/bionemo-amplify/VERSION: -------------------------------------------------------------------------------- 1 | 0.0.1 2 | -------------------------------------------------------------------------------- /sub-packages/bionemo-amplify/pyproject.toml: -------------------------------------------------------------------------------- 1 | 2 | [build-system] 3 | requires = ["setuptools>=64", "wheel"] 4 | build-backend = "setuptools.build_meta" 5 | 6 | [project] 7 | name = "bionemo-amplify" 8 | readme = "README.md" 9 | description = "A BioNeMo sub-package for training AMPLIFY models." 
10 | authors = [{ name = "BioNeMo Team", email = "bionemofeedback@nvidia.com" }] 11 | requires-python = ">=3.10" 12 | license = { file = "LICENSE" } 13 | dynamic = ["version"] 14 | dependencies = [ 15 | # internal 16 | 'bionemo-core', 17 | 'bionemo-llm', 18 | 'bionemo-esm2', 19 | ] 20 | 21 | [project.optional-dependencies] 22 | test = [ 23 | 'bionemo-testing' 24 | ] 25 | te = [ 26 | # TE & Apex need to be installed after PyTorch, NVCC, and CUDA. 27 | # TODO(@pstjohn, @cspades): Figure out how to do this without post-installation. 28 | 'transformer_engine[pytorch]' 29 | ] 30 | 31 | [project.scripts] 32 | train_amplify = "bionemo.amplify.train_amplify:app" 33 | infer_amplify = "bionemo.amplify.infer_amplify:app" 34 | 35 | [tool.setuptools.packages.find] 36 | where = ["src"] 37 | include = ["bionemo.*"] 38 | namespaces = true 39 | exclude = ["test*."] 40 | 41 | [tool.uv] 42 | cache-keys = [{ git = true }] 43 | 44 | [tool.setuptools.dynamic] 45 | version = { file = "VERSION" } 46 | -------------------------------------------------------------------------------- /sub-packages/bionemo-amplify/src/bionemo/amplify/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-amplify/src/bionemo/amplify/tokenizer.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import transformers 18 | from nemo.lightning.io import IOMixin 19 | 20 | 21 | class BioNeMoAMPLIFYTokenizer(transformers.PreTrainedTokenizerFast, IOMixin): # noqa D101 22 | def __init__(self): 23 | """A wrapper to make AutoTokenizer serializable for the AMPLIFY tokenizer.""" 24 | other = transformers.AutoTokenizer.from_pretrained("chandar-lab/AMPLIFY_350M", use_fast=True) 25 | self.__dict__.update(other.__dict__)  # adopt the loaded tokenizer's attributes; no super().__init__() call is made 26 | -------------------------------------------------------------------------------- /sub-packages/bionemo-amplify/tests/bionemo/amplify/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-amplify/tests/bionemo/amplify/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | import pytest 18 | import torch 19 | from nemo.lightning import io 20 | 21 | from bionemo.amplify.tokenizer import BioNeMoAMPLIFYTokenizer 22 | 23 | 24 | @pytest.fixture 25 | def tokenizer(): 26 | return BioNeMoAMPLIFYTokenizer() 27 | 28 | 29 | def test_tokenizer_serialization(tokenizer, tmp_path): 30 | tokenizer.io_dump(tmp_path / "tokenizer", yaml_attrs=[]) # BioNeMoAMPLIFYTokenizer takes no __init__ arguments 31 | deserialized_tokenizer = io.load(tmp_path / "tokenizer", tokenizer.__class__) 32 | 33 | our_tokens = deserialized_tokenizer.encode("KAISQ", add_special_tokens=False) 34 | amplify_tokens = torch.tensor([17, 7, 2, 14, 10, 18]) 35 | torch.testing.assert_close(torch.tensor(our_tokens), amplify_tokens) 36 | -------------------------------------------------------------------------------- /sub-packages/bionemo-core/LICENSE: -------------------------------------------------------------------------------- 1 | ../../LICENSE/license.txt -------------------------------------------------------------------------------- /sub-packages/bionemo-core/VERSION: -------------------------------------------------------------------------------- 1 | 2.4.4 2 | -------------------------------------------------------------------------------- /sub-packages/bionemo-core/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "bionemo-core" 7 | readme = "README.md" 8 | description = "BioNeMo core interfaces and PyTorch-related code." 9 | authors = [{ name = "BioNeMo Team", email = "bionemofeedback@nvidia.com" }] 10 | requires-python = ">=3.10" 11 | license = { file = "LICENSE" } 12 | dynamic = ["version"] 13 | dependencies = [ 14 | # bionemo sub-packages 15 | # bionemo-core **MUST NOT** depend on any other sub-packages !!!!!
16 | # external 17 | "numpy", 18 | "platformdirs", 19 | "torch>=2.2.1", 20 | 'nest_asyncio', 21 | 'ngcsdk', 22 | 'pooch', 23 | 'pydantic[email]>=2.7.0', 24 | 'pyyaml', 25 | 'tqdm', 26 | ] 27 | 28 | [project.scripts] 29 | download_bionemo_data = "bionemo.core.data.load:entrypoint" 30 | 31 | # Make sure that the resource yaml files are being packaged alongside the python files. 32 | [tool.setuptools.package-data] 33 | "bionemo.core" = ["**/*.yaml"] 34 | 35 | [tool.setuptools.packages.find] 36 | where = ["src"] 37 | include = ["bionemo.*"] 38 | namespaces = true 39 | exclude = ["test*."] 40 | 41 | [tool.setuptools.dynamic] 42 | version = { file = "VERSION" } 43 | 44 | [tool.uv] 45 | cache-keys = [{ git = true }] 46 | -------------------------------------------------------------------------------- /sub-packages/bionemo-core/src/bionemo/core/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import os 17 | from pathlib import Path 18 | from typing import Sequence 19 | 20 | import platformdirs 21 | 22 | 23 | __all__: Sequence[str] = ("BIONEMO_CACHE_DIR",) 24 | 25 | 26 | def _get_cache_dir() -> Path: 27 | """Get the cache directory for downloaded resources: BIONEMO_CACHE_DIR if set, else the platformdirs user cache dir.""" 28 | if cache_dir := os.getenv("BIONEMO_CACHE_DIR"): 29 | return Path(cache_dir)  # returned as-is; unlike the default below, this path is not created here 30 | 31 | cache_dir = Path(platformdirs.user_cache_dir(appname="bionemo", appauthor="nvidia")) 32 | 33 | try: 34 | cache_dir.mkdir(exist_ok=True, parents=True) 35 | except PermissionError as ex: 36 | raise PermissionError( 37 | f"Permission denied creating a cache directory at {cache_dir}. Please set BIONEMO_CACHE_DIR to a directory " 38 | "you have write access to." 39 | ) from ex 40 | return cache_dir 41 | 42 | 43 | BIONEMO_CACHE_DIR = _get_cache_dir()  # resolved once, when this module is imported 44 | -------------------------------------------------------------------------------- /sub-packages/bionemo-core/src/bionemo/core/api.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | 17 | from typing import Sequence 18 | 19 | from bionemo.core.model.config import BionemoModelConfig, BionemoTrainableModelConfig, Model, ModelOutput 20 | 21 | 22 | __all__: Sequence[str] = ( 23 | "BionemoModelConfig", 24 | "BionemoTrainableModelConfig", 25 | "Model", 26 | "ModelOutput", 27 | ) 28 | -------------------------------------------------------------------------------- /sub-packages/bionemo-core/src/bionemo/core/data/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-core/src/bionemo/core/data/api.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | from typing import Sequence 18 | 19 | 20 | __all__: Sequence[str] = () 21 | -------------------------------------------------------------------------------- /sub-packages/bionemo-core/src/bionemo/core/data/resources/scdl.yaml: -------------------------------------------------------------------------------- 1 | - tag: sample 2 | ngc: nvidia/clara/scdl_sample_test:1.0 3 | ngc_registry: resource 4 | pbss: "s3://bionemo-ci/test-data/scdl_sample_test.tar.gz" 5 | sha256: 7a4237537bf535dfa00301ce8cc7073e0a23d5bc8aa902ad65db9f51b57a6df9 # pragma: allowlist secret 6 | owner: Polina Binder 7 | description: Sample test data for SCDL. 8 | 9 | - tag: sample_scdl_feature_ids 10 | ngc: nvidia/clara/scdl_sample_test_feature_ids:1.0 11 | ngc_registry: resource 12 | pbss: s3://bionemo-ci/test-data/scdl_sample_test_feat_ids.tar.gz 13 | sha256: 9020ba336dbfe33bddadba26ca0cde49958cbd73c5ad44f0960a5a4837c9db26 # pragma: allowlist secret 14 | owner: Savitha Srinivasan 15 | description: Sample test data for SCDL with feature IDs appended. 16 | -------------------------------------------------------------------------------- /sub-packages/bionemo-core/src/bionemo/core/model/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-core/src/bionemo/core/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-core/tests/bionemo/core/utils/test_dtypes.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import pytest 18 | import torch 19 | 20 | from bionemo.core.utils.dtypes import get_autocast_dtype 21 | 22 | 23 | @pytest.mark.parametrize( 24 | "precision, expected_dtype", 25 | [ 26 | ("fp16", torch.float16), 27 | ("bf16", torch.bfloat16), 28 | ("fp32", torch.float32), 29 | ("bf16-mixed", torch.bfloat16),  # "-mixed" variants are expected to give the same dtype as the base precision 30 | ("fp32-mixed", torch.float32), 31 | ], 32 | ) 33 | def test_get_autocast_dtype(precision: str, expected_dtype: torch.dtype): 34 | assert get_autocast_dtype(precision) == expected_dtype 35 | 36 | 37 | def test_unsupported_autocast_dtype(): 38 | with pytest.raises(ValueError): 39 | get_autocast_dtype("unsupported")  # a string outside the table above is expected to raise ValueError 40 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/LICENSE: -------------------------------------------------------------------------------- 1 | ../../LICENSE/license.txt -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/README.md: -------------------------------------------------------------------------------- 1 | # bionemo-esm2 2 | ESM-2 is a protein language model with BERT architecture trained on millions of protein sequences from UniProt. ESM-2 learns the patterns and dependencies between amino acids that ultimately give rise to a protein’s structure.
ESM-2 is pretrained on a masked language model (MLM) objective. During pretraining, 15% of the input sequence is perturbed; of the perturbed residues, 80% are replaced with a mask token, 10% are replaced with a random token, and 10% are left unchanged. The model is then trained to predict the original amino acids at the perturbed positions using the context of the surrounding amino acids. 3 | 4 | Despite pretraining on an MLM objective, the sequence representation learned by ESM-2 is highly transferable to downstream tasks. ESM-2 can be fine-tuned on a variety of tasks, including secondary structure prediction and whole-sequence prediction of cellular localization, thermostability, solubility, and other protein properties. 5 | 6 | ### Setup 7 | To install, execute the following: 8 | ```bash 9 | pip install -e . 10 | ``` 11 | 12 | To run unit tests, execute: 13 | ```bash 14 | pytest -v . 15 | ``` 16 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/VERSION: -------------------------------------------------------------------------------- 1 | 2.4 2 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "bionemo-esm2" 7 | readme = "README.md" 8 | description = "BioNeMo ESM2 model." 9 | authors = [{ name = "BioNeMo Team", email = "bionemofeedback@nvidia.com" }] 10 | requires-python = ">=3.10" 11 | license = { file = "LICENSE" } 12 | dynamic = ["version"] 13 | dependencies = [ 14 | # internal 15 | 'bionemo-core', 16 | 'bionemo-llm', 17 | # external 18 | ] 19 | 20 | [project.optional-dependencies] 21 | test = ['bionemo-testing'] 22 | te = [ 23 | # TE & Apex need to be installed after PyTorch, NVCC, and CUDA.
24 | # TODO(@pstjohn, @cspades): Figure out how to do this without post-installation. 25 | 'transformer_engine[pytorch]', 26 | ] 27 | 28 | [project.scripts] 29 | bionemo-esm2-train = "bionemo.esm2.run.main:main" 30 | bionemo-esm2-recipe = "bionemo.esm2.run.recipes:main" 31 | infer_esm2 = "bionemo.esm2.scripts.infer_esm2:infer_esm2_entrypoint" 32 | train_esm2 = "bionemo.esm2.scripts.train_esm2:train_esm2_entrypoint" 33 | finetune_esm2 = "bionemo.esm2.scripts.finetune_esm2:finetune_esm2_entrypoint" 34 | convert_esm2 = "bionemo.esm2.model.convert:app" 35 | 36 | # Make sure that the tokenizer files are included along with the python files during installation. 37 | [tool.setuptools.package-data] 38 | "bionemo.esm2" = ["data/tokenizer/*.json", "data/tokenizer/*.txt"] 39 | 40 | [tool.setuptools.packages.find] 41 | where = ["src"] 42 | include = ["bionemo.*"] 43 | namespaces = true 44 | exclude = ["test*."] 45 | 46 | [tool.setuptools.dynamic] 47 | version = { file = "VERSION" } 48 | 49 | [tool.uv] 50 | cache-keys = [{ git = true }] 51 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/src/bionemo/esm2/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/src/bionemo/esm2/api.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | from typing import Sequence 18 | 19 | from bionemo.esm2.model.model import ESM2Config, ESM2GenericConfig, ESM2Model 20 | 21 | 22 | __all__: Sequence[str] = ( 23 | "ESM2Config", 24 | "ESM2GenericConfig", 25 | "ESM2Model", 26 | ) 27 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/src/bionemo/esm2/data/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/src/bionemo/esm2/data/tokenizer/README.md: -------------------------------------------------------------------------------- 1 | # Vendored tokenizer config for facebook/esm2_t33_650M_UR50D 2 | 3 | This directory contains the output of 4 | 5 | ```python 6 | from transformers import AutoTokenizer 7 | AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D").save_pretrained("...") 8 | ``` 9 | 10 | for reproducible results and to reduce reliance on external API calls. 11 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/src/bionemo/esm2/data/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import functools 17 | from importlib.resources import files 18 | 19 | import transformers 20 | from nemo.lightning.io import IOMixin 21 | 22 | 23 | class BioNeMoESMTokenizer(transformers.EsmTokenizer, IOMixin): # noqa D101 24 | def __init__(self): 25 | """A wrapper to make AutoTokenizer serializable for the ESM2 tokenizer.""" 26 | other = transformers.AutoTokenizer.from_pretrained(str(files("bionemo.esm2.data.tokenizer")), use_fast=True)  # load from the bionemo.esm2.data.tokenizer package directory 27 | self.__dict__.update(dict(other.__dict__))  # adopt the loaded tokenizer's attributes via a shallow copy of its __dict__ 28 | 29 | 30 | @functools.cache 31 | def get_tokenizer() -> BioNeMoESMTokenizer: 32 | """Get the tokenizer for the ESM2 model (cached, so repeated calls return the same instance).""" 33 | return BioNeMoESMTokenizer() 34 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/src/bionemo/esm2/data/tokenizer/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "cls_token": "", 3 | "eos_token": "", 4 | "mask_token": "", 5 | "pad_token": "", 6 | "unk_token": "" 7 | } 8 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/src/bionemo/esm2/data/tokenizer/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "added_tokens_decoder": { 3 | "0": { 4 | "content": "", 5 | "lstrip": false, 6 | "normalized": false, 7 | "rstrip": false, 8 | "single_word": false, 9 | "special": true 10 | }, 11 | "1": { 12 | "content": "", 13 | "lstrip": false, 14 | "normalized": false, 15 | "rstrip": false, 16 | "single_word": false, 17 | "special": true 18 | }, 19 | "2": { 20 | "content": "", 21 | "lstrip": false, 22 | "normalized": false, 23 | "rstrip": false, 24 | "single_word": false, 25 | "special": true 26 | }, 27 | "3": { 28 | "content": "", 29 | "lstrip": false, 30 | "normalized": false, 31 | "rstrip": false,
32 | "single_word": false, 33 | "special": true 34 | }, 35 | "32": { 36 | "content": "", 37 | "lstrip": false, 38 | "normalized": false, 39 | "rstrip": false, 40 | "single_word": false, 41 | "special": true 42 | } 43 | }, 44 | "clean_up_tokenization_spaces": true, 45 | "cls_token": "", 46 | "eos_token": "", 47 | "mask_token": "", 48 | "model_max_length": 1000000000000000019884624838656, 49 | "pad_token": "", 50 | "tokenizer_class": "EsmTokenizer", 51 | "unk_token": "" 52 | } 53 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/src/bionemo/esm2/data/tokenizer/vocab.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | L 6 | A 7 | G 8 | V 9 | S 10 | E 11 | R 12 | T 13 | I 14 | D 15 | P 16 | K 17 | Q 18 | N 19 | F 20 | Y 21 | M 22 | H 23 | W 24 | C 25 | X 26 | B 27 | U 28 | Z 29 | O 30 | . 31 | - 32 | 33 | 34 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/src/bionemo/esm2/model/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/src/bionemo/esm2/model/finetune/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/src/bionemo/esm2/run/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/src/bionemo/esm2/scripts/README.md: -------------------------------------------------------------------------------- 1 | ## ESM2 Scripts Directory 2 | This is a collection of one-off scripts that can be run through the command line. See the `[project.scripts]` section 3 | of the pyproject.toml file for how these are generated. 4 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/src/bionemo/esm2/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/src/bionemo/esm2/testing/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/tests/bionemo/esm2/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/tests/bionemo/esm2/data/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/tests/bionemo/esm2/model/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/tests/bionemo/esm2/model/finetune/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-evo2/VERSION: -------------------------------------------------------------------------------- 1 | 2.4 2 | -------------------------------------------------------------------------------- /sub-packages/bionemo-evo2/assets/1b_finetuning_train_curve_500_steps_256gbs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/sub-packages/bionemo-evo2/assets/1b_finetuning_train_curve_500_steps_256gbs.png -------------------------------------------------------------------------------- /sub-packages/bionemo-evo2/examples/.gitignore: -------------------------------------------------------------------------------- 1 | # ignore temp files made by this tutorial 2 | # chromosome files 3 | *.fa 4 | *.fa.gz 5 | 6 | # config files 7 | *.yaml 8 | 9 | # directories created during these notebook runs. 10 | nemo2_evo2_1b_8k/ 11 | preprocessed_data/ 12 | pretraining_demo/ 13 | brca1_fasta_files/ 14 | brca1/ 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-evo2/examples/configs/README.md: -------------------------------------------------------------------------------- 1 | ## Example configs 2 | These configs are provided as examples to the user. Note that the files referenced in these configs can be downloaded from [OpenGenome2 dataset on Hugging Face](https://huggingface.co/datasets/arcinstitute/opengenome2). 3 | * `full_pretrain_shortphase_config.yaml` was used to test full scale pre-training runs of evo2 at the 8k context length. 4 | * `full_pretrain_longphase_config.yaml` was used to test full scale context extension phase pre-training (starting from an 8k checkpoint and continuing to train at longer context lengths). 
5 | * `test_preproc_config.yaml` was used to test our preprocessing scripts to generate .bin/.idx files that are used for pre-training from fasta file inputs. 6 | * `test_promotors_dataset_config.yaml` is a smaller test file that can be used for pre-training but is one of the smaller tests. 7 | -------------------------------------------------------------------------------- /sub-packages/bionemo-evo2/examples/configs/test_preproc_config.yaml: -------------------------------------------------------------------------------- 1 | - datapaths: ["/workspace/bionemo2/data/mmseqs_results_rep_seq_distinct.fasta"] 2 | output_dir: "/workspace/bionemo2/data" 3 | output_prefix: promoters_ab_test_noodles_uint8_distinct 4 | # Datasplit 5 | train_split: 1.0 # because they do manual splits of first 1000 for validation, 2nd 1000 for test, and leftover for training 6 | valid_split: 0.0 7 | test_split: 0.0 8 | # Overwrite existing binaries. Otherwise, skip already preprocessed datasets. 9 | overwrite: True 10 | # Raw Preprocessing Transforms 11 | embed_reverse_complement: true 12 | random_reverse_complement: 0.0 13 | random_lineage_dropout: 0.1 14 | transcribe: "back_transcribe" 15 | force_uppercase: true 16 | indexed_dataset_dtype: "uint8" 17 | # Tokenizer Transforms 18 | append_eod: true 19 | enforce_sample_length: null 20 | ftfy: false 21 | # Tokenizer 22 | tokenizer_type: "Byte-Level" 23 | vocab_file: null 24 | vocab_size: null 25 | merges_file: null 26 | tokenizer_model_name: null 27 | pretrained_tokenizer_model: null 28 | special_tokens: null 29 | fast_hf_tokenizer: true 30 | # Compute 31 | workers: 1 32 | preproc_concurrency: 100000 33 | chunksize: 25 34 | # Filters 35 | drop_empty_sequences: true 36 | nnn_filter: true 37 | # RNG 38 | seed: 42 39 | # Evo2 Taxonomic Lineage Tags 40 | taxonomy_data: 41 | FP002272: 42 | kingdom: KINGDOM 43 | phylum: PHYLUM 44 | clazz: CLASS 45 | order: ORDER 46 | family: FAMILY 47 | genus: GENUS 48 | species: SPECIES 49 | FP000491: 50 | kingdom: 
king 51 | order: ord 52 | family: fam 53 | -------------------------------------------------------------------------------- /sub-packages/bionemo-evo2/examples/configs/test_promotors_dataset_config.yaml: -------------------------------------------------------------------------------- 1 | - dataset_prefix: /workspace/bionemo2/sub-packages/bionemo-evo2/tests/bionemo/evo2/data/test_datasets/test_promoters_uint8_distinct_byte-level_train 2 | dataset_split: train 3 | dataset_weight: 1.0 4 | - dataset_prefix: /workspace/bionemo2/sub-packages/bionemo-evo2/tests/bionemo/evo2/data/test_datasets/test_promoters_uint8_distinct_byte-level_val 5 | dataset_split: validation 6 | dataset_weight: 1.0 7 | - dataset_prefix: /workspace/bionemo2/sub-packages/bionemo-evo2/tests/bionemo/evo2/data/test_datasets/test_promoters_uint8_distinct_byte-level_test 8 | dataset_split: test 9 | dataset_weight: 1.0 10 | -------------------------------------------------------------------------------- /sub-packages/bionemo-evo2/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "bionemo-evo2" 7 | readme = "README.md" 8 | description = "Library containing data preprocessing, training, and inference tooling for Evo2." 
9 | authors = [{ name = "BioNeMo Team", email = "bionemofeedback@nvidia.com" }] 10 | requires-python = ">=3.10" 11 | license = { file = "LICENSE" } 12 | dynamic = ["version"] 13 | dependencies = [ 14 | # internal 15 | "bionemo-noodles", 16 | "bionemo-core", 17 | "bionemo-llm", 18 | # external 19 | ] 20 | 21 | [project.optional-dependencies] 22 | test = [ 23 | 'bionemo-testing' 24 | ] 25 | 26 | [project.scripts] 27 | infer_evo2 = "bionemo.evo2.run.infer:main" 28 | train_evo2 = "bionemo.evo2.run.train:main" 29 | predict_evo2 = "bionemo.evo2.run.predict:main" 30 | preprocess_evo2 = "bionemo.evo2.data.preprocess:main" 31 | splice_evo2 = "bionemo.evo2.data.transcript_extraction:main" 32 | evo2_convert_to_nemo2 = "bionemo.evo2.utils.checkpoint.convert_to_nemo:main" 33 | 34 | [tool.setuptools.packages.find] 35 | where = ["src"] 36 | include = ["bionemo.*"] 37 | namespaces = true 38 | exclude = ["test*."] 39 | 40 | [tool.setuptools.dynamic] 41 | version = { file = "VERSION" } 42 | 43 | [tool.uv] 44 | cache-keys = [{ git = true }] 45 | -------------------------------------------------------------------------------- /sub-packages/bionemo-evo2/src/bionemo/evo2/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-FileCopyrightText: Copyright (c) 2024 Arc Institute. All rights reserved. 3 | # SPDX-FileCopyrightText: Copyright (c) 2024 Michael Poli. All rights reserved. 4 | # SPDX-FileCopyrightText: Copyright (c) 2024 Stanford University. All rights reserved 5 | # SPDX-License-Identifier: LicenseRef-Apache2 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | -------------------------------------------------------------------------------- /sub-packages/bionemo-evo2/src/bionemo/evo2/data/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-FileCopyrightText: Copyright (c) 2024 Arc Institute. All rights reserved. 3 | # SPDX-FileCopyrightText: Copyright (c) 2024 Michael Poli. All rights reserved. 4 | # SPDX-FileCopyrightText: Copyright (c) 2024 Stanford University. All rights reserved 5 | # SPDX-License-Identifier: LicenseRef-Apache2 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | -------------------------------------------------------------------------------- /sub-packages/bionemo-evo2/src/bionemo/evo2/run/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. 2 | # SPDX-FileCopyrightText: Copyright (c) 2024 Arc Institute. All rights reserved. 3 | # SPDX-FileCopyrightText: Copyright (c) 2024 Michael Poli. All rights reserved. 4 | # SPDX-FileCopyrightText: Copyright (c) 2024 Stanford University. All rights reserved 5 | # SPDX-License-Identifier: LicenseRef-Apache2 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | -------------------------------------------------------------------------------- /sub-packages/bionemo-evo2/src/bionemo/evo2/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-FileCopyrightText: Copyright (c) 2024 Arc Institute. All rights reserved. 3 | # SPDX-FileCopyrightText: Copyright (c) 2024 Michael Poli. All rights reserved. 4 | # SPDX-FileCopyrightText: Copyright (c) 2024 Stanford University. All rights reserved 5 | # SPDX-License-Identifier: LicenseRef-Apache2 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | -------------------------------------------------------------------------------- /sub-packages/bionemo-evo2/src/bionemo/evo2/utils/checkpoint/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-FileCopyrightText: Copyright (c) 2024 Arc Institute. All rights reserved. 3 | # SPDX-FileCopyrightText: Copyright (c) 2024 Michael Poli. All rights reserved. 4 | # SPDX-FileCopyrightText: Copyright (c) 2024 Stanford University. All rights reserved 5 | # SPDX-License-Identifier: LicenseRef-Apache2 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | -------------------------------------------------------------------------------- /sub-packages/bionemo-example_model/LICENSE: -------------------------------------------------------------------------------- 1 | ../../LICENSE/license.txt -------------------------------------------------------------------------------- /sub-packages/bionemo-example_model/VERSION: -------------------------------------------------------------------------------- 1 | ../../VERSION -------------------------------------------------------------------------------- /sub-packages/bionemo-example_model/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | # UV doesn't seem to pick up on changes to requirements.txt files as a signal that it needs to re-lock a project's 7 | # dependencies. We should probably just move to listing requirements in these pyproject.toml files directly, and also 8 | # now include bionemo-* sub-packages explicitly. 9 | name = "bionemo-example_model" 10 | readme = "README.md" 11 | description = "BioNeMo example_model: Example model for documentation and tutorials. Do Not Distriburte on PyPI !!" 
12 | authors = [{ name = "BioNeMo Team", email = "bionemofeedback@nvidia.com" }] 13 | requires-python = ">=3.10" 14 | classifiers = ["Private :: Do Not Upload", "Programming Language :: Python :: 3.10"] 15 | license = { file = "LICENSE" } 16 | dynamic = ["version"] 17 | dependencies = [ 18 | 'bionemo-core', 19 | 'bionemo-llm', 20 | 'megatron-core', 21 | 'nemo_toolkit', 22 | 'torchvision >= 0.15.1', 23 | ] 24 | 25 | [project.optional-dependencies] 26 | test = [ 27 | "bionemo-testing" 28 | ] 29 | 30 | [tool.setuptools.packages.find] 31 | where = ["src"] 32 | include = ["bionemo.*"] 33 | namespaces = true 34 | exclude = ["test*."] 35 | 36 | [tool.setuptools.dynamic] 37 | version = { file = "VERSION" } 38 | 39 | [tool.uv] 40 | cache-keys = [{ git = true }] 41 | -------------------------------------------------------------------------------- /sub-packages/bionemo-example_model/src/bionemo/example_model/lightning/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-fw/LICENSE: -------------------------------------------------------------------------------- 1 | ../../LICENSE/license.txt -------------------------------------------------------------------------------- /sub-packages/bionemo-fw/README.md: -------------------------------------------------------------------------------- 1 | # bionemo-fw 2 | 3 | The BioNeMo Framework (FW): a production grade framework for AI-enabled Drug Discovery. 4 | 5 | The `bionemo-fw` Python package contains framework-spanning code under the `bionemo.fw` namespace. 6 | All other namespaces of the BioNeMo Framework (`bionemo.*`) are dependencies of this package. 7 | 8 | ## Developer Setup 9 | After following the setup specified in the [README](https://github.com/NVIDIA/bionemo-framework/blob/main/README.md), 10 | you may install this project's code in your environment via executing: 11 | ```bash 12 | pip install -e . 13 | ``` 14 | 15 | To run unit tests with code coverage, execute: 16 | ```bash 17 | pytest -v --cov=bionemo --cov-report=term . 18 | ``` 19 | -------------------------------------------------------------------------------- /sub-packages/bionemo-fw/VERSION: -------------------------------------------------------------------------------- 1 | ../../VERSION -------------------------------------------------------------------------------- /sub-packages/bionemo-fw/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "bionemo-fw" 7 | readme = "README.md" 8 | description = "BioNeMo Framework (FW): Production grade framework for AI-enabled Drug Discovery. Consists of all independently installable bionemo feature packages too." 
9 | authors = [{ name = "BioNeMo Team", email = "bionemofeedback@nvidia.com" }] 10 | requires-python = ">=3.10" 11 | license = { file = "LICENSE" } 12 | dynamic = ["version"] 13 | dependencies = [ 14 | 'bionemo-core', 15 | 'bionemo-esm2', 16 | 'bionemo-geneformer', 17 | 'bionemo-geometric', 18 | 'bionemo-llm', 19 | 'bionemo-noodles', 20 | 'bionemo-scdl', 21 | 'bionemo-size-aware-batching', 22 | 'bionemo-webdatamodule', 23 | 'bionemo-amplify', 24 | # 25 | # NOTE: DO **NOT** INCLUDE: 26 | # bionemo-testing (test-time only dependency) 27 | # bionemo-example_model (documentation) 28 | # bionemo-fw (itself!) 29 | # external 30 | 'nltk', 31 | 'numba>=0.57.1', 32 | 'toml', 33 | 'zarr', 34 | ] 35 | 36 | [tool.setuptools.packages.find] 37 | where = ["src"] 38 | include = ["bionemo.*"] 39 | namespaces = true 40 | exclude = ["test*."] 41 | 42 | [tool.setuptools.dynamic] 43 | version = { file = "VERSION" } 44 | 45 | [tool.uv] 46 | cache-keys = [{ git = true }] 47 | -------------------------------------------------------------------------------- /sub-packages/bionemo-fw/src/bionemo/fw/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-fw/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-fw/tests/bionemo/fw/test_sub_package_imports.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | def test_import_bionemo_core(): 18 | from bionemo import core as subpackage 19 | 20 | assert subpackage is not None 21 | del subpackage 22 | 23 | 24 | def test_import_bionemo_llm(): 25 | from bionemo import core as subpackage 26 | 27 | assert subpackage is not None 28 | del subpackage 29 | 30 | 31 | def test_import_bionemo_geneformer(): 32 | from bionemo import geneformer as subpackage 33 | 34 | assert subpackage is not None 35 | del subpackage 36 | 37 | 38 | def test_import_bionemo_esm2(): 39 | from bionemo import esm2 as subpackage 40 | 41 | assert subpackage is not None 42 | del subpackage 43 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geneformer/LICENSE: -------------------------------------------------------------------------------- 1 | ../../LICENSE/license.txt -------------------------------------------------------------------------------- /sub-packages/bionemo-geneformer/README.md: -------------------------------------------------------------------------------- 1 | # bionemo-geneformer 2 | 3 | Geneformer is a foundational single-cell RNA (scRNA) language model using a BERT architecture trained on millions of single-cell RNA sequences. It captures gene co-expression patterns to learn cellular representations, enabling predictive tasks across biology and medicine. Geneformer is trained on a masked language model (MLM) objective, where expression rank-ordered "gene tokens" in single-cell RNA sequences are masked, replaced, or left unchanged, and the model learns to predict these masked genes based on context. This module provides Dataset classes, collators for expression rank ordering, and Config objects for constructing Geneformer-style models. 4 | 5 | ## Setup 6 | To install, execute the following from this directory (or point the install to this directory): 7 | 8 | ```bash 9 | pip install -e . 10 | ``` 11 | 12 | To run unit tests, execute: 13 | ```bash 14 | pytest -v . 
15 | ``` 16 | 17 | 18 | ## Acquiring Data 19 | Datasets are expected to be in the form of AnnData (.h5ad) objects such as those downloaded from [Cell x Gene | CZI](https://chanzuckerberg.github.io/cellxgene-census/). They are then pre-processed with `sub-packages/bionemo-scdl/src/bionemo/scdl/scripts/convert_h5ad_to_scdl.py`. 20 | 21 | ## Geneformer-nv 10M and 106M 22 | Refer to the Dataset cards and Model cards to learn more about the pre-trained checkpoints provided for both 10M and 106M of Geneformer-nv. 23 | 24 | ## See Also 25 | - [sc-DL pypi](https://pypi.org/project/bionemo-scdl/) 26 | - [sc-DL github](https://github.com/NVIDIA/bionemo-framework/tree/main/sub-packages/bionemo-scdl) 27 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geneformer/VERSION: -------------------------------------------------------------------------------- 1 | 2.4 2 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geneformer/examples/.gitignore: -------------------------------------------------------------------------------- 1 | **.png 2 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geneformer/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "bionemo-geneformer" 7 | readme = "README.md" 8 | description = "BioNeMo Geneformer" 9 | authors = [{ name = "BioNeMo Team", email = "bionemofeedback@nvidia.com" }] 10 | requires-python = ">=3.10" 11 | license = { file = "LICENSE" } 12 | dynamic = ["version"] 13 | dependencies = [ 14 | # bionemo sub-packages 15 | 'bionemo-core', 16 | 'bionemo-llm', 17 | # external 18 | 'cellxgene_census', 19 | ] 20 | 21 | [project.optional-dependencies] 22 | test = [ 23 | 'bionemo-testing' 24 | ] 25 | te = [ 26 | # 
TE & Apex need to be installed after PyTorch, NVCC, and CUDA. 27 | # TODO(@pstjohn, @cspades): Figure out how to do this without post-installation. 28 | 'transformer_engine[pytorch]' 29 | ] 30 | 31 | [project.scripts] 32 | bionemo-geneformer-train= "bionemo.geneformer.run.main:main" 33 | bionemo-geneformer-recipe= "bionemo.geneformer.run.recipes:main" 34 | infer_geneformer = "bionemo.geneformer.scripts.infer_geneformer:geneformer_infer_entrypoint" 35 | train_geneformer = "bionemo.geneformer.scripts.train_geneformer:entrypoint" 36 | geneformer_mlm_loss_eval = "bionemo.geneformer.scripts.geneformer_mlm_loss_eval:entrypoint" 37 | 38 | [tool.setuptools.packages.find] 39 | where = ["src"] 40 | include = ["bionemo.*"] 41 | namespaces = true 42 | exclude = ["test*."] 43 | 44 | [tool.setuptools.dynamic] 45 | version = { file = "VERSION" } 46 | 47 | [tool.uv] 48 | cache-keys = [{ git = true }] 49 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geneformer/scripts/README.md: -------------------------------------------------------------------------------- 1 | # WARNING 2 | This folder contains one-off eval scripts that may not run and are not actively tested or kept up to date. 3 | Also these scripts may depend on `bionemo-testing` which is generally not allowed. 4 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geneformer/src/bionemo/geneformer/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geneformer/src/bionemo/geneformer/data/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geneformer/src/bionemo/geneformer/data/singlecell/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geneformer/src/bionemo/geneformer/data/singlecell/utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import numpy as np 17 | 18 | 19 | def sample_or_truncate( 20 | gene_ids: np.ndarray, 21 | max_length: int, 22 | sample: bool = True, 23 | ) -> np.ndarray: 24 | """Limit gene IDs to at most `max_length` entries by random sampling or truncation. 25 | 26 | Args: 27 | gene_ids (np.ndarray): Array of gene IDs. 28 | max_length (int): Maximum number of gene IDs to return. 29 | sample (bool, optional): If True, keep `max_length` randomly chosen entries (in shuffled order); if False, keep the first `max_length` entries. Defaults to True. 30 | 31 | Returns: 32 | np.ndarray: `gene_ids` unchanged if it has at most `max_length` entries, otherwise `max_length` of its entries. No padding is applied.
33 | """ 34 | if len(gene_ids) <= max_length: 35 | return gene_ids 36 | 37 | if sample: 38 | indices = np.random.permutation(len(gene_ids))[:max_length]  # random subset without replacement; draws from NumPy's global RNG 39 | return gene_ids[indices] 40 | else: 41 | return gene_ids[:max_length] 42 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geneformer/src/bionemo/geneformer/model/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/README.md: -------------------------------------------------------------------------------- 1 | ## Geneformer Scripts Directory 2 | This is a collection of one-off scripts that can be run from the command line. See the `[project.scripts]` section 3 | of the pyproject.toml file for how these are generated. 4 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/celltype_classification_bench/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geneformer/src/bionemo/geneformer/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geneformer/src/bionemo/geneformer/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geneformer/tests/bionemo/geneformer/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geometric/LICENSE: -------------------------------------------------------------------------------- 1 | ../../LICENSE/license.txt -------------------------------------------------------------------------------- /sub-packages/bionemo-geometric/README.md: -------------------------------------------------------------------------------- 1 | # bionemo-geometric 2 | 3 | To install, execute the following: 4 | ```bash 5 | pip install -e . 6 | ``` 7 | 8 | To run unit tests, execute: 9 | ```bash 10 | pytest -v . 
11 | ``` 12 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geometric/VERSION: -------------------------------------------------------------------------------- 1 | ../../VERSION -------------------------------------------------------------------------------- /sub-packages/bionemo-geometric/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "bionemo-geometric" 7 | readme = "README.md" 8 | description = "BioNeMo component library for graphical neural networks (GNNs) solving drug discovery problems." 9 | authors = [{ name = "BioNeMo Team", email = "bionemofeedback@nvidia.com" }] 10 | requires-python = ">=3.10" 11 | license = { file = "LICENSE" } 12 | dynamic = ["version"] 13 | dependencies = [ 14 | 'bionemo-core', 15 | # Version pins for the pip package. If updating these, also update the image pins in requirements.txt. 16 | # Ideally we should allow these dependencies to float in the package definition. 17 | 'torch-cluster==1.6.3', 18 | 'torch-geometric==2.5.0', 19 | 'torch-scatter==2.1.2', 20 | 'torch_sparse==0.6.18', 21 | 'rdkit==2023.9.6', 22 | ] 23 | 24 | # Make sure that the data CSV files are being packaged alongside the python files. 
25 | [tool.setuptools.package-data] 26 | "bionemo.geometric" = ["**/*.csv"] 27 | 28 | [tool.setuptools.packages.find] 29 | where = ["src"] 30 | include = ["bionemo.*"] 31 | namespaces = true 32 | exclude = ["test*."] 33 | 34 | [tool.setuptools.dynamic] 35 | version = { file = "VERSION" } 36 | 37 | [tool.uv] 38 | cache-keys = [{ git = true }] 39 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geometric/requirements.txt: -------------------------------------------------------------------------------- 1 | # Pinned versions installed in the Docker container. 2 | # If updating these, also update the version pins in `pyproject.toml` !! 3 | torch-cluster==1.6.3 4 | torch-geometric==2.5.0 5 | torch-scatter==2.1.2 6 | torch_sparse==0.6.18 7 | rdkit==2023.9.6 8 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geometric/src/bionemo/geometric/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-geometric/tests/bionemo/geometric/test_bionemo_geometric.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | # TODO: replace this "test" once bionemo-geometric has some real code! 18 | def test_import_geometric() -> None: 19 | from bionemo import geometric  # the import is the real check: it raises if the package cannot be imported 20 | 21 | assert geometric is not None 22 | -------------------------------------------------------------------------------- /sub-packages/bionemo-llm/LICENSE: -------------------------------------------------------------------------------- 1 | ../../LICENSE/license.txt -------------------------------------------------------------------------------- /sub-packages/bionemo-llm/README.md: -------------------------------------------------------------------------------- 1 | # bionemo-llm 2 | 3 | The Bionemo Large Language Model (LLM) submodule contains common code used in submodules that train LLMs on biological 4 | datasets (currently `bionemo-esm2` and `bionemo-geneformer`).
This includes data masking and collate functions, the 5 | bio-BERT common architecture code, loss functions, and other NeMo / Megatron-LM compatibility functions. Sub-packages 6 | should only depend on `bionemo-llm` if they need access to NeMo and Megatron-LM. 7 | -------------------------------------------------------------------------------- /sub-packages/bionemo-llm/VERSION: -------------------------------------------------------------------------------- 1 | 2.4.5 2 | -------------------------------------------------------------------------------- /sub-packages/bionemo-llm/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "bionemo-llm" 7 | readme = "README.md" 8 | description = "BioNeMo Large Language Model Components using NeMo and Megatron" 9 | authors = [{ name = "BioNeMo Team", email = "bionemofeedback@nvidia.com" }] 10 | requires-python = ">=3.10" 11 | license = { file = "LICENSE" } 12 | dynamic = ["version"] 13 | dependencies = [ 14 | # bionemo sub-packages 15 | 'bionemo-core', 16 | # external 17 | 'lightning>=2.2.1', 18 | 'megatron-core', 19 | 'nemo_toolkit[nlp,eval]>=2.2.1', 20 | 'nemo-run', 21 | 'hatchling', 22 | ] 23 | 24 | [project.optional-dependencies] 25 | test = [ 26 | 'bionemo-testing' 27 | ] 28 | te = [ 29 | # TE & Apex need to be installed after PyTorch, NVCC, and CUDA. 30 | # TODO(@pstjohn, @cspades): Figure out how to do this without post-installation. 
31 | 'transformer_engine[pytorch]' 32 | ] 33 | 34 | [tool.setuptools.packages.find] 35 | where = ["src"] 36 | include = ["bionemo.*"] 37 | namespaces = true 38 | exclude = ["test*."] 39 | 40 | [tool.setuptools.dynamic] 41 | version = { file = "VERSION" } 42 | 43 | [tool.uv] 44 | cache-keys = [{ git = true }] 45 | -------------------------------------------------------------------------------- /sub-packages/bionemo-llm/src/bionemo/llm/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-llm/src/bionemo/llm/data/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-llm/src/bionemo/llm/model/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-llm/src/bionemo/llm/run/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-llm/src/bionemo/llm/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-llm/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-llm/tests/bionemo/llm/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-llm/tests/bionemo/llm/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-moco/LICENSE: -------------------------------------------------------------------------------- 1 | ../../LICENSE/license.txt -------------------------------------------------------------------------------- /sub-packages/bionemo-moco/README.md: -------------------------------------------------------------------------------- 1 | # Modular Co-Design (MoCo) Interpolants 2 | 3 | MoCo enables abstracted interpolants for building and sampling from a variety of popular generative model frameworks. Specifically, MoCo supports interpolants for both continuous and discrete data types. 4 | [![PyPI version](https://badge.fury.io/py/bionemo-moco.svg)](https://pypi.org/project/bionemo-moco/) 5 | 6 | ### Continuous Data Interpolants 7 | MoCo currently supports the following continuous data interpolants: 8 | - DDPM (Denoising Diffusion Probabilistic Models) 9 | - VDM (Variational Diffusion Models) 10 | - CFM (Conditional Flow Matching) 11 | 12 | ### Discrete Data Interpolants 13 | MoCo also supports the following discrete data interpolants: 14 | - D3PM (Discrete Denoising Diffusion Probabilistic Models) 15 | - MDLM (Masked Diffusion Language Models) 16 | - DFM (Discrete Flow Matching) 17 | 18 | ### Useful Abstractions 19 | MoCo also provides useful wrappers for customizable time distributions and inference time schedules. 20 | 21 | ### Extendible 22 | If the desired interpolant or sampling method is not already supported, MoCo was designed to be easily extended. 23 | 24 | ## Installation 25 | For Conda environment setup, please refer to the `environment` directory for specific instructions. 26 | 27 | Once your environment is set up, you can install this project by running the following command: 28 | 29 | ```bash 30 | pip install -e . 31 | ``` 32 | This will install the project in editable mode, allowing you to make changes and see them reflected immediately. 
33 | 34 | ## Examples 35 | Please see examples of all interpolants in the [examples directory](https://github.com/NVIDIA/bionemo-framework/tree/main/sub-packages/bionemo-moco/examples). 36 | -------------------------------------------------------------------------------- /sub-packages/bionemo-moco/VERSION: -------------------------------------------------------------------------------- 1 | 0.0.2.1 2 | -------------------------------------------------------------------------------- /sub-packages/bionemo-moco/environment/Instructions.md: -------------------------------------------------------------------------------- 1 | Environment Setup 2 | =============== 3 | 4 | From the bionemo-moco directory run: 5 | 6 | ```bash 7 | bash environment/setup.sh 8 | ``` 9 | 10 | This creates the conda environment, installs bionemo-moco and runs the tests. 11 | 12 | Local Code Setup 13 | =============== 14 | From the bionemo-moco directory run: 15 | 16 | ```bash 17 | bash environment/clone_bionemo_moco.sh 18 | ``` 19 | 20 | This clones only the bionemo-moco subpackage. To install it in your local environment, run: 21 | 22 | ```bash 23 | pip install -e . 24 | ``` 25 | 26 | inside the bionemo-moco directory. 27 | 28 | ```bash 29 | pip install --no-deps -e . 30 | ``` 31 | can be used if you want to install bionemo-moco on top of your current torch version. The remaining required dependencies, jaxtyping and pot, can then be installed manually via pip. 32 | -------------------------------------------------------------------------------- /sub-packages/bionemo-moco/environment/clone_bionemo_moco.sh: -------------------------------------------------------------------------------- 1 | git clone --filter=blob:none --sparse https://github.com/NVIDIA/bionemo-framework.git 2 | cd bionemo-framework 3 | git sparse-checkout set sub-packages/bionemo-moco 4 | mv sub-packages/bionemo-moco .. 5 | cd .. 
#!/bin/bash
#
# Create the bionemo-moco Conda environment, install the package in editable
# mode and run the test suite. Run this from the bionemo-moco directory.

# Set the path to your Conda environment YAML file
ENV_YAML="environment/moco_env.yaml"

# Extract the environment name from the first line of the YAML file ("name: <env>")
ENV_NAME=$(head -n 1 "$ENV_YAML" | cut -d':' -f2- | tr -d ' ')

# Load Conda's shell integration so that `conda activate` works in this script
source "$(conda info --base)/etc/profile.d/conda.sh"

# Create the Conda environment from the YAML file
echo "Creating Conda environment $ENV_NAME from $ENV_YAML..."
conda env create -f "$ENV_YAML"

# Activate the Conda environment
echo "Activating Conda environment $ENV_NAME..."
conda activate "$ENV_NAME"

# Check if the environment was successfully activated
if [ "$CONDA_DEFAULT_ENV" == "$ENV_NAME" ]; then
    echo "Conda environment $ENV_NAME activated successfully."
    # Navigate to your project directory if needed
    # cd /path/to/your/project  # Uncomment and adjust this path as necessary
    # The requirement must be quoted: unquoted, the shell parses `>=4.8.2` as a
    # redirection of stdout to a file named "=4.8.2" and the version constraint
    # never reaches pip.
    pip install "pydoc-markdown>=4.8.2"
    pip install pytest-cov==4.1.0 pytest-timeout==2.2.0 pytest-dependency==0.5.1
    pre-commit install
    # Install the project in editable mode using pip
    echo "Installing bionemo-moco in editable mode using pip..."
    pip install -e .
    echo "Setup complete."
    # Run tests
    echo "Running tests..."
    pytest
    echo "Tests complete. You can now work within the $ENV_NAME environment."
else
    echo "Failed to activate Conda environment $ENV_NAME. Exiting..."
    exit 1
fi
external 16 | 'torch>=2.2', 17 | 'numpy>=1.24.4,<2', #needed for notebooks 18 | 'jaxtyping>=0.2.34', 19 | 'pot>=0.9.5', #needed for optimal transport 20 | 'scikit-learn>=1.6.0', #needed for notebooks 21 | 'matplotlib>=3.3.2' #needed for notebooks 22 | ] 23 | 24 | [tool.setuptools.packages.find] 25 | where = ["src"] 26 | include = ["bionemo.*"] 27 | namespaces = true 28 | exclude = ["test*."] 29 | 30 | [tool.setuptools.dynamic] 31 | version = { file = "VERSION" } 32 | 33 | [tool.uv] 34 | cache-keys = [{ git = true }] 35 | -------------------------------------------------------------------------------- /sub-packages/bionemo-moco/scripts/clean_documentation.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
"""Post-process the pydoc-markdown output stored in ``documentation.md``.

The script drops nested table-of-contents bullets, strips dots from anchor ids
and from the in-page links that point at them, and rewrites ``moco.`` module
paths to ``bionemo.moco.``. It is meant to be run from the bionemo-moco
directory right after ``pydoc-markdown`` has written ``documentation.md``.
"""

import re


DOC_PATH = "documentation.md"

# Table-of-contents bullets with this prefix are removed.
# NOTE(review): the original script checked two prefixes that were textually
# identical; confirm the intended indentation levels.
NESTED_TOC_PREFIX = " * "

# Anchor tags carrying the (dotted) id of a documented object.
# NOTE(review): the original pattern was empty (a no-op); this assumes anchors
# of the form <a id="pkg.module.name"></a> -- confirm against the generated file.
ANCHOR_RE = re.compile(r'<a id="([^"]+)"></a>')

# In-page markdown links of the form [text](#dotted.anchor.id).
LINK_RE = re.compile(r"\[([^\]]+)\]\(#([a-zA-Z0-9_\.]+)\)")

# "moco." that is not already prefixed with "bionemo."; the lookbehind keeps
# the rewrite idempotent and avoids producing "bionemo.bionemo.moco.".
MOCO_RE = re.compile(r"(?<!bionemo\.)moco\.")


def clean_markdown(markdown: str) -> str:
    """Return a cleaned copy of the generated markdown.

    Args:
        markdown: The full text of the generated documentation.

    Returns:
        The text with nested TOC bullets removed, dots stripped from anchor ids
        and in-page link targets, and ``moco.`` rewritten to ``bionemo.moco.``.
    """
    # Splitting on "\n" mirrors how the file's lines were originally read.
    kept_lines = [line for line in markdown.split("\n") if not line.startswith(NESTED_TOC_PREFIX)]
    cleaned = "\n".join(kept_lines)

    # Remove dots from anchor ids ...
    cleaned = ANCHOR_RE.sub(
        lambda match: f'<a id="{match.group(1).replace(".", "")}"></a>',
        cleaned,
    )

    # ... and from the links that target them, so both stay in sync.
    cleaned = LINK_RE.sub(
        lambda match: f"[{match.group(1)}](#{match.group(2).replace('.', '')})",
        cleaned,
    )

    # Replace 'moco.' with 'bionemo.moco.' (the anchors/links above no longer
    # contain dots, so they are left untouched by this step).
    return MOCO_RE.sub("bionemo.moco.", cleaned)


def main() -> None:
    """Clean ``documentation.md`` in place."""
    with open(DOC_PATH, "r", encoding="utf-8") as file:
        markdown = file.read()

    with open(DOC_PATH, "w", encoding="utf-8") as file:
        file.write(clean_markdown(markdown))


if __name__ == "__main__":
    main()
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | from .schedules.utils import TimeDirection 18 | 19 | 20 | __all__ = ["TimeDirection"] 21 | -------------------------------------------------------------------------------- /sub-packages/bionemo-moco/src/bionemo/moco/distributions/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-moco/src/bionemo/moco/distributions/prior/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | from .continuous.gaussian import GaussianPrior 18 | from .discrete.custom import DiscreteCustomPrior 19 | from .discrete.mask import DiscreteMaskedPrior 20 | from .discrete.uniform import DiscreteUniformPrior 21 | 22 | 23 | __all__ = ["DiscreteCustomPrior", "DiscreteMaskedPrior", "DiscreteUniformPrior", "GaussianPrior"] 24 | -------------------------------------------------------------------------------- /sub-packages/bionemo-moco/src/bionemo/moco/distributions/prior/continuous/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
def remove_center_of_mass(data: Tensor, mask: Optional[Tensor] = None) -> Tensor:
    """Center the data by subtracting its center of mass (CoM).

    Args:
        data: The input data with shape (..., nodes, features).
        mask: An optional binary mask with shape (..., nodes). Nodes where the mask is 0 do not
            contribute to the CoM calculation. Defaults to None, in which case all nodes contribute.

    Returns:
        The input data with the CoM subtracted, with the same shape as ``data``. The CoM is
        subtracted from every node, including nodes that are masked out.
    """
    if mask is None:
        com = data.mean(dim=-2, keepdim=True)
    else:
        masked_data = data * mask.unsqueeze(-1)
        num_nodes = mask.sum(dim=-1, keepdim=True).unsqueeze(-1)
        # A fully masked-out sample has zero contributing nodes; dividing by 1 instead of 0
        # gives a CoM of 0 (the masked sum is 0 too) rather than NaN.
        num_nodes = num_nodes.masked_fill(num_nodes == 0, 1)
        com = masked_data.sum(dim=-2, keepdim=True) / num_nodes
    return data - com
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | from .beta import BetaTimeDistribution 18 | from .distribution import MixTimeDistribution, TimeDistribution 19 | from .logit_normal import LogitNormalTimeDistribution 20 | from .uniform import UniformTimeDistribution 21 | 22 | 23 | __all__ = [ 24 | "BetaTimeDistribution", 25 | "LogitNormalTimeDistribution", 26 | "MixTimeDistribution", 27 | "TimeDistribution", 28 | "UniformTimeDistribution", 29 | ] 30 | -------------------------------------------------------------------------------- /sub-packages/bionemo-moco/src/bionemo/moco/distributions/time/utils.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
def float_time_to_index(time: torch.Tensor, num_time_steps: int) -> torch.Tensor:
    """Map float time values onto discrete time-step indices.

    Values outside [0, 1] are clipped to that interval before they are scaled,
    so the returned indices always lie in ``[0, num_time_steps - 1]``.

    Args:
        time (torch.Tensor): Float time values, nominally in the range [0, 1].
        num_time_steps (int): The number of discrete time steps.

    Returns:
        torch.Tensor: An int64 tensor holding the index nearest to each time value.
    """
    last_index = num_time_steps - 1

    # Clip first so out-of-range times map to the first/last index.
    clipped = time.clamp(0.0, 1.0)

    # Scale to the index range, round to the nearest index and cast to int64.
    return (clipped * last_index).round().to(torch.int64)
15 | 16 | 17 | from .continuous_time.continuous.continuous_flow_matching import ContinuousFlowMatcher 18 | from .continuous_time.continuous.data_augmentation.equivariant_ot_sampler import EquivariantOTSampler 19 | from .continuous_time.continuous.data_augmentation.kabsch_augmentation import KabschAugmentation 20 | from .continuous_time.continuous.data_augmentation.ot_sampler import OTSampler 21 | from .continuous_time.continuous.vdm import VDM 22 | from .continuous_time.discrete.discrete_flow_matching import DiscreteFlowMatcher 23 | from .continuous_time.discrete.mdlm import MDLM 24 | from .discrete_time.continuous.ddpm import DDPM 25 | from .discrete_time.discrete.d3pm import D3PM 26 | 27 | 28 | __all__ = [ 29 | "D3PM", 30 | "DDPM", 31 | "MDLM", 32 | "VDM", 33 | "ContinuousFlowMatcher", 34 | "DiscreteFlowMatcher", 35 | "EquivariantOTSampler", 36 | "KabschAugmentation", 37 | "OTSampler", 38 | ] 39 | -------------------------------------------------------------------------------- /sub-packages/bionemo-moco/src/bionemo/moco/interpolants/continuous_time/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-moco/src/bionemo/moco/interpolants/continuous_time/continuous/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-moco/src/bionemo/moco/interpolants/continuous_time/continuous/data_augmentation/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class AugmentationType(Enum):
    """An enumeration representing the type of Optimal Transport that can be used in Continuous Flow Matching.

    - **EXACT_OT**: Standard mini-batch optimal transport defined in https://arxiv.org/pdf/2302.00482.
    - **EQUIVARIANT_OT**: Adds roto-translation optimization to mini-batch OT, see https://arxiv.org/pdf/2306.15030 and https://arxiv.org/pdf/2312.07168 Sec 4.2.
    - **KABSCH**: Simple Kabsch alignment between each data and noise point, with no permutation, see https://arxiv.org/pdf/2410.22388 Sec 3.2.

    These augmentation types select how data and noise samples are paired or aligned before interpolation.
    """

    EXACT_OT = "exact_ot"
    EQUIVARIANT_OT = "equivariant_ot"
    KABSCH = "kabsch"
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-moco/src/bionemo/moco/interpolants/discrete_time/continuous/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-moco/src/bionemo/moco/interpolants/discrete_time/discrete/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-moco/src/bionemo/moco/schedules/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-moco/src/bionemo/moco/schedules/noise/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
class TimeDirection(Enum):
    """Direction in which time runs between noise and data for a noise schedule.

    - **UNIFIED**: noise sits at time 0 and data at time 1.
    - **DIFFUSION**: noise sits at time 1 and data at time 0.
    """

    # Noise(0) --> Data(1)
    UNIFIED = "unified"
    # Noise(1) --> Data(0)
    DIFFUSION = "diffusion"
All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-noodles/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /sub-packages/bionemo-noodles/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "noodles_fasta_wrapper" 3 | version = "0.1.2" # Also update the VERSION file when you change the version! 
4 | edition = "2021" 5 | 6 | [lib] 7 | crate-type = ["cdylib"] 8 | name = "noodles_fasta_wrapper" # The name of the library 9 | path = "rust/src/lib.rs" # Path to the library file 10 | 11 | [dependencies] 12 | pyo3 = { version = "0.18", features = ["extension-module"] } 13 | noodles-fasta = "0.45.0" # Update to the latest version of noodles 14 | noodles-core = "*" 15 | memmap2 = "*" 16 | 17 | [package.metadata.pyo3] 18 | name = "noodles_fasta_wrapper" 19 | -------------------------------------------------------------------------------- /sub-packages/bionemo-noodles/LICENSE: -------------------------------------------------------------------------------- 1 | ../../LICENSE/license.txt -------------------------------------------------------------------------------- /sub-packages/bionemo-noodles/VERSION: -------------------------------------------------------------------------------- 1 | 0.1.2 2 | -------------------------------------------------------------------------------- /sub-packages/bionemo-noodles/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=1.0,<2.0"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "bionemo-noodles" 7 | readme = "README.md" 8 | description = "Python wrapper around [noodles](https://github.com/zaeleus/noodles)." 9 | authors = [{ name = "BioNeMo Team", email = "bionemofeedback@nvidia.com" }] 10 | requires-python = ">=3.10" 11 | license = { file = "LICENSE" } 12 | dynamic = ["version"] 13 | dependencies = [ 14 | # internal 15 | 'bionemo-core', 16 | # external 17 | 'pyfaidx', 18 | ] 19 | 20 | [project.optional-dependencies] 21 | test = [ 22 | 'torch', 23 | ] 24 | 25 | [tool.maturin] 26 | bindings = "pyo3" 27 | compatibility = "manylinux_2_28" 28 | python-source = "src" 29 | # we could make this bionemo.noodles.fasta_wrapper, but that would require it to be its own namespaced package. 
30 | module-name = "bionemo.noodles_fasta_wrapper" 31 | version = { file = "VERSION" } 32 | 33 | [tool.setuptools.packages.find] 34 | where = ["src"] 35 | include = ["bionemo.*"] 36 | namespaces = true 37 | exclude = ["test*."] 38 | 39 | [tool.uv] 40 | cache-keys = [{ git = true }] 41 | -------------------------------------------------------------------------------- /sub-packages/bionemo-noodles/requirements.txt: -------------------------------------------------------------------------------- 1 | maturin 2 | -------------------------------------------------------------------------------- /sub-packages/bionemo-noodles/src/bionemo/noodles/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | from bionemo.noodles_fasta_wrapper import ( 17 | PyFaidxRecord, 18 | PyIndexedMmapFastaReader, 19 | back_transcribe_sequence, 20 | complement_sequence, 21 | reverse_sequence, 22 | transcribe_sequence, 23 | ) 24 | 25 | 26 | __all__ = ( 27 | "PyFaidxRecord", 28 | "PyIndexedMmapFastaReader", 29 | "back_transcribe_sequence", 30 | "complement_sequence", 31 | "reverse_sequence", 32 | "transcribe_sequence", 33 | ) 34 | -------------------------------------------------------------------------------- /sub-packages/bionemo-noodles/tests/bionemo/noodles/data/bad_index.fasta: -------------------------------------------------------------------------------- 1 | >chr1 2 | ACTGACTGACTG 3 | >chr2 4 | GGTCAAGGTCAA 5 | >chr3 6 | AGTCAAGGTCCA 7 | CGTCAAGGTCCC 8 | GGTCAAGGTCCG 9 | TGTCAAGGTCCT 10 | AGTCAAGGTCAA 11 | CGTCAAGGTCAC 12 | GGTCAAGGTCAG 13 | >chr4 14 | CCCCCCCCCCCC 15 | ACGT 16 | >chr5 17 | A 18 | -------------------------------------------------------------------------------- /sub-packages/bionemo-noodles/tests/bionemo/noodles/data/bad_index.fasta.fai: -------------------------------------------------------------------------------- 1 | this is not a valid fasta index!!!!!! 
2 | -------------------------------------------------------------------------------- /sub-packages/bionemo-noodles/tests/bionemo/noodles/data/dupes.fasta: -------------------------------------------------------------------------------- 1 | >chr1 version|of|seq1 2 | ACTGACTGACTG 3 | >chr1 version|of|seq2 4 | GGTCAAGGTCAA 5 | >chr1 some|random|inputs 6 | AGTCAAGGTCCA 7 | CGTCAAGGTCCC 8 | GGTCAAGGTCCG 9 | TGTCAAGGTCCT 10 | AGTCAAGGTCAA 11 | CGTCAAGGTCAC 12 | GGTCAAGGTCAG 13 | >chr1 why|is|this|done 14 | CCCCCCCCCCCC 15 | ACGT 16 | >chr1 stop|violated|fasta|spec 17 | A 18 | -------------------------------------------------------------------------------- /sub-packages/bionemo-noodles/tests/bionemo/noodles/data/sample.fasta: -------------------------------------------------------------------------------- 1 | >chr1 2 | ACTGACTGACTG 3 | >chr2 4 | GGTCAAGGTCAA 5 | >chr3 6 | AGTCAAGGTCCA 7 | CGTCAAGGTCCC 8 | GGTCAAGGTCCG 9 | TGTCAAGGTCCT 10 | AGTCAAGGTCAA 11 | CGTCAAGGTCAC 12 | GGTCAAGGTCAG 13 | >chr4 14 | CCCCCCCCCCCC 15 | ACGT 16 | >chr5 17 | A 18 | -------------------------------------------------------------------------------- /sub-packages/bionemo-noodles/tests/bionemo/noodles/data/sample.fasta.fai: -------------------------------------------------------------------------------- 1 | chr1 12 6 12 13 2 | chr2 12 25 12 13 3 | chr3 84 44 12 13 4 | chr4 16 141 12 13 5 | chr5 1 165 1 2 6 | -------------------------------------------------------------------------------- /sub-packages/bionemo-scdl/LICENSE: -------------------------------------------------------------------------------- 1 | ../../LICENSE/license.txt -------------------------------------------------------------------------------- /sub-packages/bionemo-scdl/VERSION: -------------------------------------------------------------------------------- 1 | 0.0.7 2 | -------------------------------------------------------------------------------- /sub-packages/bionemo-scdl/assets/disk_space.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/sub-packages/bionemo-scdl/assets/disk_space.png -------------------------------------------------------------------------------- /sub-packages/bionemo-scdl/assets/throughput.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/sub-packages/bionemo-scdl/assets/throughput.png -------------------------------------------------------------------------------- /sub-packages/bionemo-scdl/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "bionemo-scdl" 7 | readme = "README.md" 8 | description = "SCDL, a Dataset class for Single Cell data." 
9 | authors = [{ name = "BioNeMo Team", email = "bionemofeedback@nvidia.com" }] 10 | requires-python = ">=3.10" 11 | license = { file = "LICENSE" } 12 | dynamic = ["version"] 13 | dependencies = [ 14 | # bionemo sub-packages 15 | 'bionemo-core>=2.2.1', 16 | # external 17 | 'anndata>=0.11.0', 18 | 'pandas>=2.2.1', 19 | 'pyarrow>=16.0.0', 20 | 'scipy>=1.11.1', 21 | 'pydantic[email]', 22 | ] 23 | 24 | [project.scripts] 25 | convert_h5ad_to_scdl = "bionemo.scdl.scripts.convert_h5ad_to_scdl:main" 26 | 27 | [tool.setuptools.packages.find] 28 | where = ["src"] 29 | include = ["bionemo.*"] 30 | namespaces = true 31 | exclude = ["test*."] 32 | 33 | [tool.setuptools.dynamic] 34 | version = { file = "VERSION" } 35 | 36 | [tool.uv] 37 | cache-keys = [{ git = true }] 38 | -------------------------------------------------------------------------------- /sub-packages/bionemo-scdl/src/bionemo/scdl/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-scdl/src/bionemo/scdl/api/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-scdl/src/bionemo/scdl/index/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-scdl/src/bionemo/scdl/io/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-scdl/src/bionemo/scdl/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-scdl/src/bionemo/scdl/util/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-size-aware-batching/LICENSE: -------------------------------------------------------------------------------- 1 | ../../LICENSE/license.txt -------------------------------------------------------------------------------- /sub-packages/bionemo-size-aware-batching/VERSION: -------------------------------------------------------------------------------- 1 | 1.0.0 2 | -------------------------------------------------------------------------------- /sub-packages/bionemo-size-aware-batching/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "bionemo-size-aware-batching" 7 | readme = "README.md" 8 | description = "Provides a simple way to create mini-batches in a memory consumption-aware manner, making it useful for tasks like 
training models on datasets with varying memory requirements." 9 | authors = [{ name = "BioNeMo Team", email = "bionemofeedback@nvidia.com" }] 10 | requires-python = ">=3.10" 11 | license = { file = "LICENSE" } 12 | dynamic = ["version"] 13 | dependencies = [ 14 | # bionemo sub-packages 15 | 'bionemo-core', 16 | # external 17 | ] 18 | 19 | [tool.setuptools.packages.find] 20 | where = ["src"] 21 | include = ["bionemo.*"] 22 | namespaces = true 23 | exclude = ["test*."] 24 | 25 | [tool.setuptools.dynamic] 26 | version = { file = "VERSION" } 27 | 28 | [tool.uv] 29 | cache-keys = [{ git = true }] 30 | -------------------------------------------------------------------------------- /sub-packages/bionemo-size-aware-batching/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/bionemo-framework/9ac892bd46d49eec05df35965ee5b90b5b8fe763/sub-packages/bionemo-size-aware-batching/requirements.txt -------------------------------------------------------------------------------- /sub-packages/bionemo-size-aware-batching/src/bionemo/size_aware_batching/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-testing/LICENSE: -------------------------------------------------------------------------------- 1 | ../../LICENSE/license.txt -------------------------------------------------------------------------------- /sub-packages/bionemo-testing/README.md: -------------------------------------------------------------------------------- 1 | # bionemo-testing 2 | 3 | A package of test-time requirements and utilities for bionemo sub-packages. In particular, the `bionemo-testing` package 4 | handles downloading and caching data and other assets for running unit tests and example notebooks. For more information 5 | on test data handling, see [BioNeMo test data management](https://github.com/NVIDIA/bionemo-framework/blob/main/sub-packages/bionemo-testing/src/bionemo/testing/data/README.md) 6 | -------------------------------------------------------------------------------- /sub-packages/bionemo-testing/VERSION: -------------------------------------------------------------------------------- 1 | 2.4.1 2 | -------------------------------------------------------------------------------- /sub-packages/bionemo-testing/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "bionemo-testing" 7 | readme = "README.md" 8 | description = "Utilities aiding test creation for BioNeMo sub-packages." 
9 | authors = [{ name = "BioNeMo Team", email = "bionemofeedback@nvidia.com" }] 10 | requires-python = ">=3.10" 11 | license = { file = "LICENSE" } 12 | dynamic = ["version"] 13 | dependencies = [ 14 | # bionemo sub-packages 15 | 'bionemo-core', 16 | 'bionemo-llm>=2.4.5', 17 | # external 18 | 'email-validator', 19 | 'pytest', 20 | 'overrides', 21 | ] 22 | 23 | [tool.setuptools.packages.find] 24 | where = ["src"] 25 | include = ["bionemo.*"] 26 | namespaces = true 27 | exclude = ["test*."] 28 | 29 | [tool.setuptools.dynamic] 30 | version = { file = "VERSION" } 31 | 32 | [tool.uv] 33 | cache-keys = [{ git = true }] 34 | -------------------------------------------------------------------------------- /sub-packages/bionemo-testing/src/bionemo/testing/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-testing/src/bionemo/testing/data/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-testing/src/bionemo/testing/data/load.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from typing import Sequence 16 | 17 | from bionemo.core.data.load import default_ngc_client, default_pbss_client, entrypoint, load 18 | 19 | 20 | _ = entrypoint 21 | # This needs to be around so that ruff doesn't automatically remove it as it's unused. 22 | # We don't want to include it in __all__. 
23 | # But older installations __may__ be using the old CLI path (bionemo.core.data.load:entrypoint) 24 | # so this is here for backwards compatibility. 25 | 26 | 27 | __all__: Sequence[str] = ( 28 | "default_ngc_client", 29 | "default_pbss_client", 30 | "load", 31 | ) 32 | -------------------------------------------------------------------------------- /sub-packages/bionemo-testing/src/bionemo/testing/data/resource.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from typing import Sequence 16 | 17 | from bionemo.core.data.resource import Resource, get_all_resources 18 | 19 | 20 | __all__: Sequence[str] = ( 21 | "Resource", 22 | "get_all_resources", 23 | ) 24 | -------------------------------------------------------------------------------- /sub-packages/bionemo-testing/src/bionemo/testing/harnesses/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-testing/src/bionemo/testing/harnesses/mode.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | from enum import Enum, auto 18 | 19 | 20 | class Mode(Enum): 21 | """Mode for stop-go testing: one of STOP, RESUME, or CONTINUOUS.""" 22 | 23 | STOP = auto() 24 | RESUME = auto() 25 | CONTINUOUS = auto() 26 | -------------------------------------------------------------------------------- /sub-packages/bionemo-webdatamodule/LICENSE: -------------------------------------------------------------------------------- 1 | ../../LICENSE/license.txt -------------------------------------------------------------------------------- /sub-packages/bionemo-webdatamodule/VERSION: -------------------------------------------------------------------------------- 1 | 1.0.0 2 | -------------------------------------------------------------------------------- /sub-packages/bionemo-webdatamodule/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "bionemo-webdatamodule" 7 | readme = "README.md" 8 | description = "PyTorch Lightning Data Module for WebDataset files."
9 | authors = [{ name = "BioNeMo Team", email = "bionemofeedback@nvidia.com" }] 10 | requires-python = ">=3.10" 11 | license = { file = "LICENSE" } 12 | dynamic = ["version"] 13 | dependencies = [ 14 | # bionemo sub-packages 15 | 'bionemo-core', 16 | # external 17 | 'webdataset==0.2.96', 18 | ] 19 | 20 | [tool.setuptools.packages.find] 21 | where = ["src"] 22 | include = ["bionemo.*"] 23 | namespaces = true 24 | exclude = ["test*."] 25 | 26 | [tool.setuptools.dynamic] 27 | version = { file = "VERSION" } 28 | 29 | [tool.uv] 30 | cache-keys = [{ git = true }] 31 | -------------------------------------------------------------------------------- /sub-packages/bionemo-webdatamodule/src/bionemo/webdatamodule/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /sub-packages/bionemo-webdatamodule/tests/bionemo/webdatamodule/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # SPDX-License-Identifier: LicenseRef-Apache2 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | --------------------------------------------------------------------------------