├── tests ├── data │ └── __init__.py ├── tasks │ └── __init__.py ├── adapters │ └── __init__.py └── backbones │ └── __init__.py ├── docs ├── .gitignore ├── docs │ ├── assets │ │ └── images │ │ │ ├── icon.png │ │ │ ├── logo.png │ │ │ ├── genbio_logo.png │ │ │ ├── genbio_header.png │ │ │ └── structure_tokenizer │ │ │ ├── cameo_0.png │ │ │ ├── select_files.png │ │ │ ├── launch_protein_viewer.png │ │ │ ├── structure_tokenizer.png │ │ │ ├── structure_prediction_model.png │ │ │ └── visualize_reconstruction.png │ ├── api_reference │ │ ├── callbacks.md │ │ ├── trainer.md │ │ └── adapters.md │ ├── usage │ │ └── embedding_caching.md │ ├── experiment_design │ │ ├── backbones.md │ │ └── data.md │ └── tutorials │ │ ├── kfold_cross_validation.md │ │ ├── dependency_mapping.md │ │ └── finetuning_scheduler.md └── README.md ├── modelgenerator ├── __init__.py ├── distributed │ ├── __init__.py │ └── fsdp │ │ └── __init__.py ├── huggingface_models │ ├── __init__.py │ ├── genbio │ │ ├── __init__.py │ │ └── modeling_genbio.py │ ├── borzoi_pytorch │ │ ├── __init__.py │ │ ├── pytorch_borzoi_helpers.py │ │ └── config_borzoi.py │ ├── scfoundation │ │ └── pretrainmodels │ │ │ ├── __init__.py │ │ │ ├── transformer.py │ │ │ └── select_model.py │ ├── geneformer │ │ ├── token_dictionary_gc95M.pkl │ │ ├── gene_name_id_dict_gc95M.pkl │ │ ├── ensembl_mapping_dict_gc95M.pkl │ │ ├── gene_median_dictionary_gc95M.pkl │ │ └── __init__.py │ ├── enformer_pytorch │ │ ├── precomputed │ │ │ └── tf_gammas.pt │ │ ├── __init__.py │ │ ├── LICENSE │ │ └── config_enformer.py │ ├── rnabert │ │ └── vocab.txt │ ├── fm4bio │ │ └── vocab_protein.txt │ └── scimilarity │ │ └── model_v1.1 │ │ └── layer_sizes.json ├── structure_tokenizer │ ├── __init__.py │ ├── layers │ │ ├── __init__.py │ │ ├── esmfold │ │ │ ├── __init__.py │ │ │ └── categorical_mixture.py │ │ └── equivariant │ │ │ └── __init__.py │ ├── utils │ │ ├── __init__.py │ │ ├── constants │ │ │ ├── __init__.py │ │ │ ├── structure_tokenizer.py │ │ │ └── 
residue_constants.py │ │ ├── geometry │ │ │ └── __init__.py │ │ ├── types.py │ │ ├── distributed.py │ │ ├── misc.py │ │ └── init_params.py │ ├── callbacks │ │ └── __init__.py │ ├── configs │ │ ├── __init__.py │ │ └── lightning_configs.py │ ├── datasets │ │ └── __init__.py │ ├── models │ │ └── __init__.py │ └── README.md ├── rna_inv_fold │ └── gRNAde_structure_encoder │ │ ├── src │ │ ├── __init__.py │ │ └── data │ │ │ └── __init__.py │ │ └── LICENSE ├── tasks │ └── __init__.py ├── utils │ └── __init__.py ├── adapters │ ├── __init__.py │ └── base.py └── prot_inv_fold │ └── proteinMPNN │ └── proteinMPNN_model_utils.py ├── experiments ├── AIDO.Cell │ ├── requirements.txt │ ├── README.md │ ├── extract_features.py │ ├── readme.md │ ├── sctab_conversion.py │ └── docker_readme.md ├── AIDO.StructurePrediction │ ├── fold │ │ ├── __init__.py │ │ ├── data │ │ │ ├── __init__.py │ │ │ └── ccd_data.py │ │ ├── metrics │ │ │ └── __init__.py │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ └── head.py │ │ │ └── layer_norm │ │ │ │ ├── __init__.py │ │ │ │ ├── kernel │ │ │ │ └── compat.h │ │ │ │ └── torch_ext_compile.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── data_process.py │ │ │ ├── hash_encoder.py │ │ │ ├── logger.py │ │ │ ├── file_io.py │ │ │ ├── seed.py │ │ │ └── geometry.py │ │ └── openfold_local │ │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── tools │ │ │ │ ├── __init__.py │ │ │ │ └── utils.py │ │ │ └── errors.py │ │ │ ├── np │ │ │ └── __init__.py │ │ │ ├── model │ │ │ └── __init__.py │ │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── kernel │ │ │ │ ├── __init__.py │ │ │ │ └── csrc │ │ │ │ │ ├── compat.h │ │ │ │ │ ├── softmax_cuda_stub.cpp │ │ │ │ │ └── softmax_cuda.cpp │ │ │ ├── geometry │ │ │ │ ├── utils.py │ │ │ │ ├── __init__.py │ │ │ │ └── quat_rigid.py │ │ │ └── precision_utils.py │ │ │ ├── __init__.py │ │ │ └── README.md │ ├── runner │ │ └── __init__.py │ ├── .gitignore │ ├── src │ │ └── genbio │ │ │ └── aidosp │ │ │ ├── cli │ │ │ ├── 
__init__.py │ │ │ ├── util │ │ │ │ └── __init__.py │ │ │ ├── completions │ │ │ │ ├── __init__.py │ │ │ │ ├── genbio-aidosp.fish │ │ │ │ ├── genbio-aidosp-complete.bash │ │ │ │ ├── commands.py │ │ │ │ └── genbio-aidosp-complete.zsh │ │ │ ├── base.py │ │ │ └── predict.py │ │ │ ├── scripts │ │ │ ├── __init__.py │ │ │ ├── download_colabfold_envdb.sh │ │ │ └── download_uniref30.sh │ │ │ ├── msa_retrieve │ │ │ ├── __init__.py │ │ │ ├── configs │ │ │ │ ├── __init__.py │ │ │ │ └── mmseqs.yaml │ │ │ ├── msar │ │ │ │ ├── __init__.py │ │ │ │ ├── tools │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── utils.py │ │ │ │ └── utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── general.py │ │ │ │ │ ├── errors.py │ │ │ │ │ ├── logger.py │ │ │ │ │ └── io_utils.py │ │ │ └── bin │ │ │ │ └── search_msa.sh │ │ │ └── __init__.py │ ├── .gitattributes │ ├── examples │ │ ├── T1106s1-D1.fasta │ │ ├── T1104-D1.fasta │ │ ├── T1106s2-D1.fasta │ │ └── T1109-D1.fasta │ ├── assets │ │ └── img │ │ │ ├── ana.png │ │ │ ├── hln.png │ │ │ ├── vis.png │ │ │ ├── figure(gt-yellow vs af3-blue).png │ │ │ └── figure(gt-yellow vs our-green).png │ ├── job2fasta.py │ ├── configs │ │ └── mmseqs.yaml │ ├── Dockerfile │ ├── scripts │ │ └── run_inference.sh │ └── buildjob.py ├── AIDO.DNA │ ├── dependency_mapping │ │ ├── DNA.txt │ │ ├── requirements.txt │ │ ├── config.yaml │ │ ├── depmap.csv │ │ └── README.md │ ├── zeroshot_variant_effect_prediction │ │ ├── Clinvar_300M_zeroshot_Diff.yaml │ │ ├── Clinvar_7B_zeroshot_Distance.yaml │ │ └── Clinvar_300M_zeroshot_Distance.yaml │ └── sequence_classification │ │ ├── nt_promoter_all.yaml │ │ ├── gue_core_promoter_all.yaml │ │ ├── nt_enhancers.yaml │ │ └── gue_splice_reconstruction.yaml ├── AIDO.RNA │ ├── dependency_mapping │ │ ├── DNA.txt │ │ ├── requirements.txt │ │ ├── depmap.csv │ │ ├── config.yaml │ │ └── README.md │ ├── mean_ribosome_load_prediction │ │ ├── ft_schedules │ │ │ └── two_step.yaml │ │ └── README.md │ ├── modification_site_prediction.sh │ ├── 
ncrna_family_classfification.sh │ ├── splice_site_prediction.sh │ ├── rna_inverse_folding │ │ └── rna_inv_fold_test.yaml │ ├── expression_level_prediction.sh │ ├── translation_efficiency_prediction.sh │ ├── protein_abundance_prediction.sh │ ├── transcript_abundance_prediction.sh │ ├── rna_secondary_structure_prediction │ │ ├── rna_secondary_structure_prediction.sh │ │ └── ft_schedules │ │ │ └── layers_0_32.yaml │ ├── multimodal_isoform_expression │ │ └── isoform_expression_prediction.sh │ └── demo_mrna_vaccine │ │ └── get_mean_embeddings.py ├── AIDO.StructureTokenizer │ ├── protein2structoken_example_input.csv │ ├── decode_example_input.tsv │ ├── decode.yaml │ ├── encode_decode.yaml │ ├── encode.yaml │ ├── structure_encoding.sh │ ├── protein2structoken_16b.yaml │ └── extract_structure_tokenizer_codebook.py ├── AIDO.Protein-RAG │ ├── xTrimo_RAG │ │ ├── configs │ │ │ ├── wandb.yaml │ │ │ └── prediction_writer.yaml │ │ └── init_env.sh │ └── DMS_RAG │ │ ├── configs │ │ ├── wandb.yaml │ │ └── prediction_writer.yaml │ │ └── init_env.sh ├── AIDO.Protein │ ├── DMS │ │ ├── train_indels_LP.sh │ │ ├── train_indels_LoRA_DDP.sh │ │ ├── train_sub_LoRA_DDP.sh │ │ ├── train_sub_LoRA_FSDP.sh │ │ └── configs │ │ │ ├── indels_LP_DDP.yaml │ │ │ ├── indels_LoRA_DDP.yaml │ │ │ ├── substitution_LoRA_DDP.yaml │ │ │ └── substitution_LoRA_FSDP.yaml │ ├── protein_inverse_folding │ │ ├── merge_ckpt.py │ │ ├── protein_inv_fold_test.yaml │ │ └── end2end_inference.sh │ └── xTrimo │ │ ├── ssp_q3.sh │ │ ├── fold_prediction.sh │ │ ├── tcr_pmhc_affinity.sh │ │ ├── fluorescence_prediction.sh │ │ ├── contact_prediction_binary.sh │ │ └── peptide_HLA_MHC_affinity.sh └── AIDO.Tissue │ └── emb.xenium.yaml ├── configs ├── examples │ ├── quick_dev_run.yaml │ ├── save_predictions.yaml │ ├── wandb.yaml │ └── lora_backbone.yaml └── defaults.yaml ├── scripts └── wandb_sweep │ ├── slurm_sweep.yaml │ ├── slurm_agent.sh │ └── README.md ├── .github └── workflows │ ├── publish.yml │ ├── tests.yml │ └── docs.yml ├── 
.pre-commit-config.yaml ├── Dockerfile └── CONTRIBUTING.md /tests/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | site/ 2 | -------------------------------------------------------------------------------- /modelgenerator/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/adapters/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modelgenerator/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modelgenerator/distributed/fsdp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modelgenerator/huggingface_models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modelgenerator/structure_tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /experiments/AIDO.Cell/requirements.txt: -------------------------------------------------------------------------------- 1 | scanpy 2 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modelgenerator/huggingface_models/genbio/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modelgenerator/structure_tokenizer/layers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modelgenerator/structure_tokenizer/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/runner/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/.gitignore: -------------------------------------------------------------------------------- 1 | version.py 2 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/metrics/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modelgenerator/structure_tokenizer/layers/esmfold/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modelgenerator/structure_tokenizer/utils/constants/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modelgenerator/structure_tokenizer/utils/geometry/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/model/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modelgenerator/rna_inv_fold/gRNAde_structure_encoder/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modelgenerator/structure_tokenizer/layers/equivariant/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /experiments/AIDO.DNA/dependency_mapping/DNA.txt: -------------------------------------------------------------------------------- 1 | A 2 | T 3 | G 4 | C 5 | -------------------------------------------------------------------------------- /experiments/AIDO.RNA/dependency_mapping/DNA.txt: -------------------------------------------------------------------------------- 1 | A 2 | T 3 | G 4 | C 5 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/openfold_local/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/openfold_local/np/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/src/genbio/aidosp/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modelgenerator/rna_inv_fold/gRNAde_structure_encoder/src/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/openfold_local/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/openfold_local/utils/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/src/genbio/aidosp/cli/util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/src/genbio/aidosp/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/openfold_local/data/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/openfold_local/utils/kernel/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/src/genbio/aidosp/cli/completions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/src/genbio/aidosp/msa_retrieve/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/src/genbio/aidosp/msa_retrieve/configs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/src/genbio/aidosp/msa_retrieve/msar/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb filter=strip-notebook-output 2 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/src/genbio/aidosp/msa_retrieve/msar/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/src/genbio/aidosp/msa_retrieve/msar/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/AIDO.DNA/dependency_mapping/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | seaborn 3 | pandas 4 | logomaker 5 | -------------------------------------------------------------------------------- /experiments/AIDO.RNA/dependency_mapping/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | seaborn 3 | pandas 4 | logomaker 5 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/model/layer_norm/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .layer_norm import FusedLayerNorm 3 | -------------------------------------------------------------------------------- /experiments/AIDO.StructureTokenizer/protein2structoken_example_input.csv: -------------------------------------------------------------------------------- 1 | idx,aa_seq 2 | example,KEFWNLDKNLQLRLGIVFLG 3 | -------------------------------------------------------------------------------- 
/modelgenerator/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | from modelgenerator.tasks.base import * 2 | from modelgenerator.tasks.tasks import * 3 | -------------------------------------------------------------------------------- /docs/docs/assets/images/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/docs/docs/assets/images/icon.png -------------------------------------------------------------------------------- /docs/docs/assets/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/docs/docs/assets/images/logo.png -------------------------------------------------------------------------------- /experiments/AIDO.RNA/mean_ribosome_load_prediction/ft_schedules/two_step.yaml: -------------------------------------------------------------------------------- 1 | 0: 2 | - adapter.* 3 | 3: 4 | - backbone.* 5 | -------------------------------------------------------------------------------- /docs/docs/assets/images/genbio_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/docs/docs/assets/images/genbio_logo.png -------------------------------------------------------------------------------- /docs/docs/assets/images/genbio_header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/docs/docs/assets/images/genbio_header.png -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/examples/T1106s1-D1.fasta: -------------------------------------------------------------------------------- 1 | >T1106s1-D1 2 | 
TAQSKRSLWDFASPGYTFHGLHRAQDYRRELDTLQSLLTTSQSSELQAAAALLKCQQDDDRLLQIILNLLH 3 | -------------------------------------------------------------------------------- /modelgenerator/huggingface_models/borzoi_pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | from .pytorch_borzoi_model import Borzoi, AnnotatedBorzoi 2 | # from .gene_utils import Transcriptome 3 | -------------------------------------------------------------------------------- /docs/docs/assets/images/structure_tokenizer/cameo_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/docs/docs/assets/images/structure_tokenizer/cameo_0.png -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/assets/img/ana.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/experiments/AIDO.StructurePrediction/assets/img/ana.png -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/assets/img/hln.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/experiments/AIDO.StructurePrediction/assets/img/hln.png -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/assets/img/vis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/experiments/AIDO.StructurePrediction/assets/img/vis.png -------------------------------------------------------------------------------- /docs/docs/assets/images/structure_tokenizer/select_files.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/docs/docs/assets/images/structure_tokenizer/select_files.png -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/src/genbio/aidosp/__init__.py: -------------------------------------------------------------------------------- 1 | from .version import version as __version__ 2 | 3 | 4 | def get_version() -> str: 5 | return __version__ 6 | -------------------------------------------------------------------------------- /configs/examples/quick_dev_run.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | fast_dev_run: true 3 | accelerator: auto 4 | devices: 1 5 | precision: 32 6 | detect_anomaly: true 7 | log_every_n_steps: 1 8 | -------------------------------------------------------------------------------- /modelgenerator/huggingface_models/scfoundation/pretrainmodels/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 BioMap (Beijing) Intelligence Technology Limited 2 | 3 | from .select_model import select_model 4 | -------------------------------------------------------------------------------- /docs/docs/assets/images/structure_tokenizer/launch_protein_viewer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/docs/docs/assets/images/structure_tokenizer/launch_protein_viewer.png -------------------------------------------------------------------------------- /docs/docs/assets/images/structure_tokenizer/structure_tokenizer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/docs/docs/assets/images/structure_tokenizer/structure_tokenizer.png -------------------------------------------------------------------------------- 
/experiments/AIDO.StructurePrediction/examples/T1104-D1.fasta: -------------------------------------------------------------------------------- 1 | >T1104-D1 2 | QLEDSEVEAVAKGLEEMYANGVTEDNFKNYVKNNFAQQEISSVEEELNVNISDSCVANKIKDEFFAMISISAIVKAAQKKAWKELAVTVLRFAKANGLKTNAIIVAGQLALWAVQCG 3 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/examples/T1106s2-D1.fasta: -------------------------------------------------------------------------------- 1 | >T1106s2-D1 2 | NITLTKRQQEFLLLNGWLQLQCGHAERACILLDALLTLNPEHLAGRRCRLVALLNNNQGERAEKEAQWLISHDPLQAGNWLCLSRAQQLNGDLDKARHAYQHYLELKDHNE 3 | -------------------------------------------------------------------------------- /modelgenerator/huggingface_models/geneformer/token_dictionary_gc95M.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/modelgenerator/huggingface_models/geneformer/token_dictionary_gc95M.pkl -------------------------------------------------------------------------------- /modelgenerator/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from modelgenerator.utils.kwargs_doc import GoogleKwargsDocstringInheritanceInitMeta 2 | 3 | __all__ = [ 4 | "GoogleKwargsDocstringInheritanceInitMeta", 5 | ] 6 | -------------------------------------------------------------------------------- /configs/examples/save_predictions.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | callbacks: 3 | - class_path: modelgenerator.callbacks.PredictionWriter 4 | dict_kwargs: 5 | output_dir: predictions 6 | filetype: pt 7 | -------------------------------------------------------------------------------- /docs/docs/assets/images/structure_tokenizer/structure_prediction_model.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/docs/docs/assets/images/structure_tokenizer/structure_prediction_model.png -------------------------------------------------------------------------------- /docs/docs/assets/images/structure_tokenizer/visualize_reconstruction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/docs/docs/assets/images/structure_tokenizer/visualize_reconstruction.png -------------------------------------------------------------------------------- /modelgenerator/huggingface_models/geneformer/gene_name_id_dict_gc95M.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/modelgenerator/huggingface_models/geneformer/gene_name_id_dict_gc95M.pkl -------------------------------------------------------------------------------- /modelgenerator/huggingface_models/enformer_pytorch/precomputed/tf_gammas.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/modelgenerator/huggingface_models/enformer_pytorch/precomputed/tf_gammas.pt -------------------------------------------------------------------------------- /modelgenerator/huggingface_models/geneformer/ensembl_mapping_dict_gc95M.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/modelgenerator/huggingface_models/geneformer/ensembl_mapping_dict_gc95M.pkl -------------------------------------------------------------------------------- /modelgenerator/huggingface_models/rnabert/vocab.txt: -------------------------------------------------------------------------------- 1 | [PAD] 2 | [MASK] 3 | [CLS] 4 | [SEP] 5 | [UNK] 6 | A 7 | G 8 | C 9 | T 10 | U 11 | N 12 | [BOS] 13 | [EOS] 14 | [UNUSED1] 15 | 
[UNUSED2] 16 | [UNUSED3] -------------------------------------------------------------------------------- /experiments/AIDO.StructureTokenizer/decode_example_input.tsv: -------------------------------------------------------------------------------- 1 | uid sequences predictions 2 | example KEFWNLDKNLQLRLGIVFLG [355, 364, 490, 132, 81, 181, 176, 59, 19, 386, 176, 173, 199, 7, 35, 196, 113, 132, 284, 321] 3 | -------------------------------------------------------------------------------- /modelgenerator/huggingface_models/geneformer/gene_median_dictionary_gc95M.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/modelgenerator/huggingface_models/geneformer/gene_median_dictionary_gc95M.pkl -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/assets/img/figure(gt-yellow vs af3-blue).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/experiments/AIDO.StructurePrediction/assets/img/figure(gt-yellow vs af3-blue).png -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/assets/img/figure(gt-yellow vs our-green).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/experiments/AIDO.StructurePrediction/assets/img/figure(gt-yellow vs our-green).png -------------------------------------------------------------------------------- /modelgenerator/structure_tokenizer/callbacks/__init__.py: -------------------------------------------------------------------------------- 1 | from .writer_pdb_callback import WriterPDBCallback 2 | from .struct_tokens_callback import StructTokensCallback 3 | 4 | __all__ = ["WriterPDBCallback", "StructTokensCallback"] 5 | 
-------------------------------------------------------------------------------- /experiments/AIDO.Protein-RAG/xTrimo_RAG/configs/wandb.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | logger: 3 | class_path: lightning.pytorch.loggers.WandbLogger 4 | init_args: 5 | name: test 6 | save_dir: logs 7 | project: xTrimo_Benchmark 8 | -------------------------------------------------------------------------------- /experiments/AIDO.Protein-RAG/DMS_RAG/configs/wandb.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | logger: 3 | class_path: lightning.pytorch.loggers.WandbLogger 4 | init_args: 5 | name: test 6 | save_dir: logs 7 | project: Protein_RAG_no_structure 8 | -------------------------------------------------------------------------------- /configs/examples/wandb.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | logger: 3 | class_path: lightning.pytorch.loggers.wandb.WandbLogger 4 | init_args: 5 | name: "my-experiment-name" 6 | save_dir: "logs" 7 | project: "my-project" 8 | save_code: true 9 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/src/genbio/aidosp/msa_retrieve/msar/utils/general.py: -------------------------------------------------------------------------------- 1 | 2 | import hashlib 3 | 4 | def seq_encoder(sequence, method="md5"): 5 | hasher = eval(f"hashlib.{method}") 6 | return hasher(sequence.encode(encoding="utf-8")).hexdigest() 7 | -------------------------------------------------------------------------------- /modelgenerator/structure_tokenizer/configs/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_configs import ( 2 | ProteinDatasetConfig, 3 | StructTokensDatasetConfig, 4 | ProteinDataConfig, 5 | ) 6 | 7 | 8 | __all__ = ["ProteinDatasetConfig", 
"StructTokensDatasetConfig", "ProteinDataConfig"] 9 | -------------------------------------------------------------------------------- /modelgenerator/adapters/__init__.py: -------------------------------------------------------------------------------- 1 | from modelgenerator.adapters.base import * 2 | from modelgenerator.adapters.adapters import * 3 | from modelgenerator.adapters.fusion import ( 4 | MMFusionTokenAdapter as MMFusionTokenAdapter, 5 | MMFusionSeqAdapter as MMFusionSeqAdapter, 6 | ) 7 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/src/genbio/aidosp/msa_retrieve/msar/utils/errors.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Error(Exception): 4 | """Base class for exceptions.""" 5 | 6 | 7 | class MultipleChainsError(Error): 8 | """An error indicating that multiple chains were found for a given ID.""" 9 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/examples/T1109-D1.fasta: -------------------------------------------------------------------------------- 1 | >T1109-D1 2 | PRPPFHITIPIYPGVDLLDVAAPVELFSWMADAWKARATTITLAAEHLTPLKTRDGLTLTPQRQFADYADAAAPQPQTHLLWVPGGAPDVLRKLMRGGPYLDFLKAQSAGADHVSSVCEGALLLAAAGLLDGYRATTHWAFIPCLQQFPAIKVAEGFPRYVIDGNRITGGGISSGLAEALAIVARVAGQDIAKHVQMITQYFPDPPFEQTIVPATHCPLQ 3 | -------------------------------------------------------------------------------- /modelgenerator/structure_tokenizer/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .protein_lightning_datamodule import ProteinLightningDataModule 2 | from .struct_tokens_lightning_datamodule import StructTokensLightningDataModule 3 | 4 | 5 | __all__ = [ 6 | "ProteinLightningDataModule", 7 | "StructTokensLightningDataModule", 8 | ] 9 | -------------------------------------------------------------------------------- 
/experiments/AIDO.StructurePrediction/fold/openfold_local/utils/kernel/csrc/compat.h: -------------------------------------------------------------------------------- 1 | // modified from https://github.com/NVIDIA/apex/blob/master/csrc/compat.h 2 | 3 | #ifndef TORCH_CHECK 4 | #define TORCH_CHECK AT_CHECK 5 | #endif 6 | 7 | #ifdef VERSION_GE_1_3 8 | #define DATA_PTR data_ptr 9 | #else 10 | #define DATA_PTR data 11 | #endif 12 | -------------------------------------------------------------------------------- /experiments/AIDO.RNA/dependency_mapping/depmap.csv: -------------------------------------------------------------------------------- 1 | id,sequence 2 | >NR_002092.1|RNAseP|Drosophila,AGTCAGTTGCAAACTAGCATCTGGGGCCCACACAACGAGTATCTGATTACTCCACAAACCATTGCCCCGGGAAGGTCTGAGAATCGGCCGAGCCAGCTGTTTGTTGCGGCTTCATTTCCCAGCAGGAAACCTGTGTGATTGCAGGGCGAAAGTACCAGAAATCCTGCTACCAGGTGTTGCCGTTGCCCCCGGTGACCGCCGCCTGGTTGGCATTGAAACCTTTCGTGGCCAGCGTTTTTAGTGCGATGTGCTTGCTGCCTCTAAGGCAGAACTCAATTCAGACTAATCTGTGACTGACT 3 | -------------------------------------------------------------------------------- /modelgenerator/structure_tokenizer/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .equiformer_encoder import EquiformerEncoderLightning 2 | from .esmfold_decoder import ESMFoldDecoderLightning 3 | from .structure_tokenizer_lightning import StructureTokenizerLightning 4 | 5 | 6 | __all__ = [ 7 | "StructureTokenizerLightning", 8 | "EquiformerEncoderLightning", 9 | "ESMFoldDecoderLightning", 10 | ] 11 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/src/genbio/aidosp/cli/base.py: -------------------------------------------------------------------------------- 1 | import click 2 | from genbio.aidosp.cli.completions.commands import completion 3 | from genbio.aidosp.cli.util.commands import util 4 | 5 | 6 | @click.group(help="GenBio AIDO Structure Prediction CLI") 7 | def 
cli(): ... 8 | 9 | 10 | cli.add_command(completion) 11 | cli.add_command(util) 12 | if __name__ == "__main__": 13 | cli() 14 | -------------------------------------------------------------------------------- /modelgenerator/huggingface_models/fm4bio/vocab_protein.txt: -------------------------------------------------------------------------------- 1 | [PAD] 2 | L 3 | A 4 | G 5 | V 6 | S 7 | E 8 | R 9 | T 10 | I 11 | D 12 | P 13 | K 14 | Q 15 | N 16 | F 17 | Y 18 | M 19 | H 20 | W 21 | C 22 | X 23 | B 24 | U 25 | Z 26 | O 27 | . 28 | - 29 | [MASK] 30 | [gMASK] 31 | [sMASK] 32 | [eod] 33 | [sop] 34 | [eop] 35 | [SEP] 36 | [HC] 37 | [LC] 38 | [HUMAN] 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Making Documentation 2 | 3 | We use mkdocs for markdown rendering with mkdocstrings for autodocumentation. 4 | 5 | ## Installation 6 | 7 | To build the website 8 | 1. `pip install -r requirements.txt` 9 | 2. 
`mkdocs serve` 10 | 11 | ## Useful Links 12 | 13 | [mkdocs for markdown rendering](https://www.mkdocs.org/user-guide/) 14 | [mkdocstrings for automatic documentation](https://mkdocstrings.github.io/usage/) 15 | -------------------------------------------------------------------------------- /modelgenerator/structure_tokenizer/utils/types.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from dataclasses import dataclass 3 | 4 | import torch 5 | 6 | ShapeLike = int | torch.Size 7 | PathLike = str | Path 8 | 9 | 10 | # for what is irreps, see https://docs.e3nn.org/en/stable/api/o3/o3_irreps.html 11 | # here you can just think of it as a tuple of scalars and vectors 12 | @dataclass 13 | class IrrepShape: 14 | s: ShapeLike 15 | v: ShapeLike 16 | -------------------------------------------------------------------------------- /configs/examples/lora_backbone.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | class_path: modelgenerator.tasks.SequenceClassification 3 | init_args: 4 | n_classes: 2 5 | optimizer: 6 | class_path: torch.optim.AdamW 7 | init_args: 8 | lr: 0.001 9 | weight_decay: 0.01 10 | backbone: 11 | class_path: modelgenerator.backbones.aido_dna_dummy 12 | init_args: 13 | use_peft: True 14 | lora_r: 16 15 | lora_alpha: 32 16 | lora_dropout: 0.1 17 | -------------------------------------------------------------------------------- /experiments/AIDO.Protein-RAG/DMS_RAG/configs/prediction_writer.yaml: -------------------------------------------------------------------------------- 1 | # lightning.pytorch==2.4.0 2 | seed_everything: 42 3 | trainer: 4 | callbacks: 5 | - class_path: modelgenerator.callbacks.PredictionWriter 6 | dict_kwargs: 7 | output_dir: DMS_output/ 8 | filetype: tsv 9 | write_cols: ['uid', 'sequences', 'predictions', 'labels'] 10 | remove_duplicates: true 11 | delete_intermediate_files: true 12 | data: 13 | init_args: 14 | 
def get_world_size():
    """Return the number of participating processes, or 1 outside a process group.

    Falls back to 1 when ``torch.distributed`` is unavailable or has not been
    initialized, so callers can treat the single-process case uniformly.
    """
    process_group_ready = dist.is_available() and dist.is_initialized()
    return dist.get_world_size() if process_group_ready else 1


def all_reduce(tensor, op=dist.ReduceOp.SUM):
    """Reduce ``tensor`` across all processes and return it.

    Args:
        tensor: Tensor handed to ``dist.all_reduce``; the same object is
            returned.
        op: Reduction operation passed through to ``dist.all_reduce``.

    Returns:
        The ``tensor`` argument itself. With a world size of 1 no collective
        call is made.
    """
    # The collective is only issued when there is more than one process.
    if get_world_size() != 1:
        dist.all_reduce(tensor, op=op)
    return tensor
def predict_tracks(models, sequence_one_hot, slices):
    """Run every model on one sequence and stack the selected outputs.

    Args:
        models: Iterable of callables (one per fold). Each is called with the
            input after a leading axis has been added.
        sequence_one_hot: Tensor for a single sequence; ``[None, ...]`` adds
            the leading axis before it is passed to each model.
        slices: Index applied to axis 2 of each model's output after a new
            per-model axis has been inserted at position 1.

    Returns:
        A NumPy array with the per-model results concatenated along axis 1
        and axes 2 and 3 swapped.
    """
    # Gradients are never needed here, so a single no_grad context covers
    # all models instead of re-entering it on every iteration.
    with torch.no_grad():
        per_model = [
            model(sequence_one_hot[None, ...])[:, None, ...].numpy(force=True)[:, :, slices]
            for model in models
        ]

    return np.concatenate(per_model, axis=1).swapaxes(3, 2)
aido_dna_7b 6 | data: 7 | class_path: DependencyMappingDataModule 8 | init_args: 9 | path: experiments/AIDO.DNA/dependency_mapping/ 10 | test_split_files: 11 | - depmap.csv 12 | vocab_file: experiments/AIDO.DNA/dependency_mapping/DNA.txt 13 | batch_size: 32 14 | trainer: 15 | callbacks: 16 | - class_path: modelgenerator.callbacks.PredictionWriter 17 | dict_kwargs: 18 | output_dir: depmap_predictions 19 | filetype: pt 20 | -------------------------------------------------------------------------------- /experiments/AIDO.RNA/dependency_mapping/config.yaml: -------------------------------------------------------------------------------- 1 | # Caleb Ellington 2 | model: 3 | class_path: Inference 4 | init_args: 5 | backbone: aido_rna_1b600m 6 | data: 7 | class_path: DependencyMappingDataModule 8 | init_args: 9 | path: experiments/AIDO.RNA/dependency_mapping/ 10 | test_split_files: 11 | - depmap.csv 12 | vocab_file: experiments/AIDO.RNA/dependency_mapping/DNA.txt 13 | batch_size: 32 14 | trainer: 15 | callbacks: 16 | - class_path: modelgenerator.callbacks.PredictionWriter 17 | dict_kwargs: 18 | output_dir: depmap_predictions 19 | filetype: pt 20 | -------------------------------------------------------------------------------- /modelgenerator/huggingface_models/geneformer/__init__.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from pathlib import Path 3 | 4 | warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*") 5 | 6 | GENE_MEDIAN_FILE = Path(__file__).parent / "gene_median_dictionary_gc95M.pkl" 7 | TOKEN_DICTIONARY_FILE = Path(__file__).parent / "token_dictionary_gc95M.pkl" 8 | ENSEMBL_DICTIONARY_FILE = Path(__file__).parent / "gene_name_id_dict_gc95M.pkl" 9 | ENSEMBL_MAPPING_FILE = Path(__file__).parent / "ensembl_mapping_dict_gc95M.pkl" 10 | 11 | from . 
import ( 12 | tokenizer, 13 | ) 14 | 15 | from .tokenizer import TranscriptomeTokenizer 16 | -------------------------------------------------------------------------------- /experiments/AIDO.Protein/DMS/train_indels_LP.sh: -------------------------------------------------------------------------------- 1 | # Ning Sun 2 | TASK_NAME='FECA_ECOLI_Tsuboyama_2023_2D1U_indels' 3 | MUTATION_TYPE='indels' 4 | for FOLD in {0..4} 5 | do 6 | RUN_NAME=${TASK_NAME}_fold${FOLD} 7 | srun mgen fit --config experiments/AIDO.Protein/DMS/configs/indels_LP_DDP.yaml \ 8 | --data.train_split_files "[\"${MUTATION_TYPE}/${TASK_NAME}.tsv\"]" \ 9 | --trainer.logger.name ${RUN_NAME} \ 10 | --trainer.logger.id ${RUN_NAME} \ 11 | --data.cv_test_fold_id ${FOLD} \ 12 | --trainer.num_nodes 1 \ 13 | --trainer.devices 1 \ 14 | --data.batch_size 8 \ 15 | &> output_logs/protein/${RUN_NAME}.log 16 | done 17 | -------------------------------------------------------------------------------- /experiments/AIDO.Protein/DMS/train_indels_LoRA_DDP.sh: -------------------------------------------------------------------------------- 1 | # Ning Sun 2 | TASK_NAME='B1LPA6_ECOSM_Russ_2020_indels' 3 | MUTATION_TYPE='indels' 4 | for FOLD in {0..4} 5 | do 6 | RUN_NAME=${TASK_NAME}_fold${FOLD} 7 | srun mgen fit --config experiments/AIDO.Protein/DMS/configs/indels_LoRA_DDP.yaml \ 8 | --data.train_split_files "[\"${MUTATION_TYPE}/${TASK_NAME}.tsv\"]" \ 9 | --trainer.logger.name ${RUN_NAME} \ 10 | --trainer.logger.id ${RUN_NAME} \ 11 | --data.cv_test_fold_id ${FOLD} \ 12 | --trainer.num_nodes 2 \ 13 | --data.batch_size 1 \ 14 | --trainer.callbacks.patience 5 \ 15 | &> output_logs/protein/${RUN_NAME}.log 16 | done 17 | -------------------------------------------------------------------------------- /experiments/AIDO.Protein/DMS/train_sub_LoRA_DDP.sh: -------------------------------------------------------------------------------- 1 | # Ning Sun 2 | TASK_NAME='A4GRB6_PSEAI_Chen_2020' 3 | 
def singleton(cls):
    """Class decorator that instantiates ``cls`` once and returns that instance.

    The decorated name is therefore bound to an *instance* of the class,
    not to the class itself.
    """
    created = {}

    def get_instance():
        # Build the object on first request, then keep serving the cached one.
        try:
            return created[cls]
        except KeyError:
            created[cls] = cls()
            return created[cls]

    return get_instance()


@singleton
class Logger:
    """Holder for the shared logger, configured at INFO level."""

    def __init__(self):
        # Configure logging once with timestamp, level, pid and call site.
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s %(levelname)s %(process)d [%(filename)s:%(lineno)d] %(message)s",
        )
        self.logger = logging.getLogger("root")
Used with Callable[[int, int, int, int] FusionAdapter] types.""" 21 | 22 | pass 23 | -------------------------------------------------------------------------------- /experiments/AIDO.Cell/README.md: -------------------------------------------------------------------------------- 1 | # AIDO.Cell 2 | 3 | AIDO.Cell-100M is GenBio AI’s SOTA cellular foundation model trained on 50 million cells over a diverse set of human tissues and organs. The AIDO.Cell models are capable of handling the entire human transcriptome as input, thus learning accurate and general representations of the human cell's entire transcriptional context. AIDO.Cell achieves state-of-the-art results in tasks such as zero-shot clustering, cell-type classification, and perturbation modeling. 4 | 5 | ## Resources 6 | - [Quick Start](./quickstart.ipynb) 7 | - [Cell Classification Tutorial](./tutorial_cell_classification.ipynb) 8 | - [AIDO.Cell HuggingFace Collection](https://huggingface.co/collections/genbio-ai/aidocell-6750f409bb20d8cd2cf14a25) 9 | -------------------------------------------------------------------------------- /experiments/AIDO.Cell/extract_features.py: -------------------------------------------------------------------------------- 1 | import anndata as ad 2 | import numpy as np 3 | import torch 4 | import sys 5 | from modelgenerator.tasks import Embed 6 | 7 | device = 'cuda' 8 | batch_size = 4 9 | 10 | model = Embed.from_config({ 11 | "model.backbone": "aido_cell_3m", 12 | "model.batch_size": batch_size 13 | }).eval() 14 | model = model.to(device).to(torch.float16) 15 | 16 | adata = ad.read_h5ad('../../modelgenerator/cell-downstream-tasks/zheng/zheng_train.h5ad') 17 | 18 | batch_np = adata[:batch_size].X.toarray() 19 | batch_tensor = torch.from_numpy(batch_np).to(torch.float16).to(device) 20 | batch_transformed = model.transform({'sequences': batch_tensor}) 21 | embs = model(batch_transformed) 22 | 23 | print(embs) 24 | 
-------------------------------------------------------------------------------- /experiments/AIDO.Cell/readme.md: -------------------------------------------------------------------------------- 1 | # AIDO.Cell 2 | 3 | AIDO.Cell-100M is GenBio AI’s SOTA cellular foundation model trained on 50 million cells over a diverse set of human tissues and organs. The AIDO.Cell models are capable of handling the entire human transcriptome as input, thus learning accurate and general representations of the human cell's entire transcriptional context. AIDO.Cell achieves state-of-the-art results in tasks such as zero-shot clustering, cell-type classification, and perturbation modeling. 4 | 5 | ## Resources 6 | - [Quick Start](./quickstart.ipynb) 7 | - [Cell Classification Tutorial](./tutorial_cell_classification.ipynb) 8 | - [AIDO.Cell HuggingFace Collection](https://huggingface.co/collections/genbio-ai/aidocell-6750f409bb20d8cd2cf14a25) 9 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/src/genbio/aidosp/cli/completions/genbio-aidosp.fish: -------------------------------------------------------------------------------- 1 | function _genbio_aidosp_completion; 2 | set -l response (env _GENBIO_AIDOSP_COMPLETE=fish_complete COMP_WORDS=(commandline -cp) COMP_CWORD=(commandline -t) genbio-aidosp); 3 | 4 | for completion in $response; 5 | set -l metadata (string split "," $completion); 6 | 7 | if test $metadata[1] = "dir"; 8 | __fish_complete_directories $metadata[2]; 9 | else if test $metadata[1] = "file"; 10 | __fish_complete_path $metadata[2]; 11 | else if test $metadata[1] = "plain"; 12 | echo $metadata[2]; 13 | end; 14 | end; 15 | end; 16 | 17 | complete --no-files --command genbio-aidosp --arguments "(_genbio_aidosp_completion)"; 18 | -------------------------------------------------------------------------------- /experiments/AIDO.Protein/protein_inverse_folding/merge_ckpt.py: 
def reassemble_chunks(output_file, chunks_dir):
    """Concatenate the chunk files in ``chunks_dir`` into ``output_file``.

    Chunks are appended in lexicographic order of their file names, so chunk
    names must sort in the intended order (e.g. zero-padded indices).

    Args:
        output_file: Path of the file to write; it is overwritten if present.
        chunks_dir: Directory containing the chunk files.

    Raises:
        OSError: If ``chunks_dir`` cannot be listed or a file cannot be
            read or written.
    """
    import shutil  # local import: keeps this block independent of the file's import header

    output_path = os.path.abspath(output_file)

    # Collect the chunks before creating the output, so a missing directory
    # does not leave an empty output file behind and the output is never
    # picked up as one of its own chunks.
    chunk_paths = []
    for chunk_file in sorted(os.listdir(chunks_dir)):
        chunk_path = os.path.join(chunks_dir, chunk_file)
        if not os.path.isfile(chunk_path):
            continue  # sub-directories are not chunks
        if os.path.abspath(chunk_path) == output_path:
            continue
        chunk_paths.append(chunk_path)

    with open(output_file, "wb") as f_out:
        for chunk_path in chunk_paths:
            with open(chunk_path, "rb") as f_in:
                # Stream in buffered pieces instead of loading a whole
                # (potentially multi-gigabyte) chunk into memory.
                shutil.copyfileobj(f_in, f_out)
    print(f"Reassembled model saved to {output_file}")
- class_path: modelgenerator.callbacks.PredictionWriter 7 | dict_kwargs: 8 | output_dir: output_logs 9 | filetype: tsv 10 | write_cols: ['score','norm_type','labels','num_layer'] 11 | model: 12 | class_path: modelgenerator.tasks.ZeroshotPredictionDistance 13 | init_args: 14 | backbone: 15 | class_path: modelgenerator.backbones.aido_dna_300m 16 | init_args: 17 | frozen: true 18 | all_hidden_states: True 19 | shared_ref: False 20 | data: 21 | class_path: modelgenerator.data.ClinvarRetrieve 22 | init_args: 23 | method: Distance 24 | window: 512 25 | batch_size: 5 26 | -------------------------------------------------------------------------------- /experiments/AIDO.Protein/xTrimo/ssp_q3.sh: -------------------------------------------------------------------------------- 1 | # Shuxian Zou 2 | MODE=train 3 | 4 | RUN_NAME=ssp_AIDO.Protein_16B 5 | PROJECT=xtrimo_benchmark 6 | CONFIG_FILE=experiments/AIDO.Protein/xTrimo/configs/ssp_q3.yaml 7 | CKPT_SAVE_DIR=logs/${PROJECT}/${RUN_NAME} 8 | 9 | if [ $MODE == "train" ]; then 10 | # using slurm script with 2 nodes (8 gpus in total) for training 11 | srun mgen fit --config $CONFIG_FILE \ 12 | --trainer.logger.name $RUN_NAME \ 13 | --trainer.logger.project $PROJECT \ 14 | --trainer.callbacks.dirpath $CKPT_SAVE_DIR \ 15 | --trainer.num_nodes 2 16 | else 17 | CKPT_PATH=${CKPT_SAVE_DIR}/best_val* 18 | mgen test --config $CONFIG_FILE \ 19 | --data.batch_size 4 \ 20 | --model.strict_loading False \ 21 | --model.reset_optimizer_states True \ 22 | --trainer.logger null \ 23 | --ckpt_path $CKPT_PATH 24 | fi 25 | -------------------------------------------------------------------------------- /experiments/AIDO.RNA/modification_site_prediction.sh: -------------------------------------------------------------------------------- 1 | # Shuxian Zou 2 | MODE=train 3 | 4 | RUN_NAME=msp_aido_rna_1b600m 5 | PROJECT=rna_tasks 6 | CKPT_SAVE_DIR=logs/${PROJECT}/${RUN_NAME} 7 | 
CONFIG_FILE=experiments/AIDO.RNA/configs/modification_site_prediction.yaml 8 | 9 | if [ $MODE == "train" ]; then 10 | # using slurm script with 4 nodes (16 gpus in total) for training 11 | srun mgen fit --config $CONFIG_FILE \ 12 | --trainer.logger.name $RUN_NAME \ 13 | --trainer.logger.project $PROJECT \ 14 | --trainer.callbacks.dirpath $CKPT_SAVE_DIR \ 15 | --trainer.num_nodes 4 16 | else 17 | CKPT_PATH=${CKPT_SAVE_DIR}/best_val* 18 | mgen test --config $CONFIG_FILE \ 19 | --data.batch_size 16 \ 20 | --model.strict_loading False \ 21 | --model.reset_optimizer_states True \ 22 | --trainer.logger null \ 23 | --ckpt_path $CKPT_PATH 24 | fi 25 | -------------------------------------------------------------------------------- /experiments/AIDO.Protein/xTrimo/fold_prediction.sh: -------------------------------------------------------------------------------- 1 | # Shuxian Zou 2 | MODE=train 3 | 4 | RUN_NAME=fold_AIDO.Protein_16B 5 | PROJECT=xtrimo_benchmark 6 | CKPT_SAVE_DIR=logs/${PROJECT}/${RUN_NAME} 7 | CONFIG_FILE=experiments/AIDO.Protein/xTrimo/configs/fold_prediction.yaml 8 | 9 | if [ $MODE == "train" ]; then 10 | # using slurm script with 4 nodes (16 gpus in total) for training 11 | srun mgen fit --config $CONFIG_FILE \ 12 | --trainer.logger.name $RUN_NAME \ 13 | --trainer.logger.project $PROJECT \ 14 | --trainer.callbacks.dirpath $CKPT_SAVE_DIR \ 15 | --trainer.num_nodes 4 16 | else 17 | CKPT_PATH=${CKPT_SAVE_DIR}/best_val* 18 | mgen test --config $CONFIG_FILE \ 19 | --data.batch_size 16 \ 20 | --model.strict_loading False \ 21 | --model.reset_optimizer_states True \ 22 | --trainer.logger null \ 23 | --ckpt_path $CKPT_PATH 24 | fi 25 | -------------------------------------------------------------------------------- /modelgenerator/huggingface_models/scimilarity/model_v1.1/layer_sizes.json: -------------------------------------------------------------------------------- 1 | {"network.0.1.weight": [1024, 28231], "network.0.1.bias": [1024], "network.0.2.weight": 
[1024], "network.0.2.bias": [1024], "network.0.2.running_mean": [1024], "network.0.2.running_var": [1024], "network.0.2.num_batches_tracked": [], "network.0.3.weight": [1], "network.1.1.weight": [1024, 1024], "network.1.1.bias": [1024], "network.1.2.weight": [1024], "network.1.2.bias": [1024], "network.1.2.running_mean": [1024], "network.1.2.running_var": [1024], "network.1.2.num_batches_tracked": [], "network.1.3.weight": [1], "network.2.1.weight": [1024, 1024], "network.2.1.bias": [1024], "network.2.2.weight": [1024], "network.2.2.bias": [1024], "network.2.2.running_mean": [1024], "network.2.2.running_var": [1024], "network.2.2.num_batches_tracked": [], "network.2.3.weight": [1], "network.3.weight": [128, 1024], "network.3.bias": [128]} -------------------------------------------------------------------------------- /experiments/AIDO.Protein/xTrimo/tcr_pmhc_affinity.sh: -------------------------------------------------------------------------------- 1 | # Shuxian Zou 2 | MODE=train 3 | 4 | RUN_NAME=tcr_pmhc_affinity_AIDO.Protein_16B 5 | PROJECT=xtrimo_benchmark 6 | CKPT_SAVE_DIR=logs/${PROJECT}/${RUN_NAME} 7 | CONFIG_FILE=experiments/AIDO.Protein/xTrimo/configs/tcr_pmhc_affinity.yaml 8 | 9 | if [ $MODE == "train" ]; then 10 | # using slurm script with 2 nodes (8 gpus in total) for training 11 | srun mgen fit --config $CONFIG_FILE \ 12 | --trainer.logger.name $RUN_NAME \ 13 | --trainer.logger.project $PROJECT \ 14 | --trainer.callbacks.dirpath $CKPT_SAVE_DIR \ 15 | --trainer.num_nodes 2 16 | else 17 | CKPT_PATH=${CKPT_SAVE_DIR}/best_val* 18 | mgen test --config $CONFIG_FILE \ 19 | --data.batch_size 16 \ 20 | --model.strict_loading False \ 21 | --model.reset_optimizer_states True \ 22 | --trainer.logger null \ 23 | --ckpt_path $CKPT_PATH 24 | fi 25 | -------------------------------------------------------------------------------- /experiments/AIDO.RNA/ncrna_family_classfification.sh: -------------------------------------------------------------------------------- 1 
# Shuxian Zou
# Runs `mgen fit` when MODE is "train", otherwise `mgen test`, both with the
# ncrna_family_classification.yaml config and the dataset variant selected by
# BOUNDARY_NOISE.
MODE=train
BOUNDARY_NOISE=bnoise0 #'bnoise0' 'bnoise200'

RUN_NAME=nfc_${BOUNDARY_NOISE}_aido_rna_1b600m
# "$MODE" is quoted so an empty or multi-word value cannot break the test
# expression; `=` is the portable string-equality operator for `[`.
if [ "$MODE" = "train" ]; then
  CKPT_SAVE_DIR=logs/rna_tasks/${RUN_NAME}
  CUDA_VISIBLE_DEVICES=0,1 mgen fit --config experiments/AIDO.RNA/configs/ncrna_family_classification.yaml \
    --data.config_name ncrna_family_${BOUNDARY_NOISE} \
    --trainer.callbacks.dirpath $CKPT_SAVE_DIR
else
  # CKPT_PATH stores a glob pattern. It is expanded where $CKPT_PATH is used
  # unquoted below, so that use must stay unquoted.
  CKPT_PATH=logs/rna_tasks/${RUN_NAME}/best_val*
  mgen test --config experiments/AIDO.RNA/configs/ncrna_family_classification.yaml \
    --data.config_name ncrna_family_${BOUNDARY_NOISE} \
    --data.batch_size 256 \
    --model.strict_loading False \
    --model.reset_optimizer_states True \
    --trainer.logger null \
    --ckpt_path $CKPT_PATH
fi
# Shuxian Zou
# Runs `mgen fit` when MODE is "train", otherwise `mgen test`, both with the
# contact_prediction_binary.yaml config.
MODE=train

RUN_NAME=AIDO.Protein_16B_fsdp_bs4
PROJECT=xtrimo_benchmark
CONFIG_FILE=experiments/AIDO.Protein/xTrimo/configs/contact_prediction_binary.yaml
CKPT_SAVE_DIR=logs/${PROJECT}/${RUN_NAME}

# "$MODE" is quoted so an empty or multi-word value cannot break the test
# expression; `=` is the portable string-equality operator for `[`.
if [ "$MODE" = "train" ]; then
  # using slurm script with 1 node (4 gpus in total) for training
  srun mgen fit --config $CONFIG_FILE \
    --trainer.logger.name $RUN_NAME \
    --trainer.logger.project $PROJECT \
    --trainer.callbacks.dirpath $CKPT_SAVE_DIR \
    --trainer.num_nodes 1
else
  # CKPT_PATH stores a glob pattern. It is expanded where $CKPT_PATH is used
  # unquoted below, so that use must stay unquoted.
  CKPT_PATH=${CKPT_SAVE_DIR}/best_val*.ckpt
  mgen test --config $CONFIG_FILE \
    --data.batch_size 1 \
    --model.strict_loading False \
    --model.reset_optimizer_states True \
    --trainer.logger null \
    --ckpt_path $CKPT_PATH
fi
# Shuxian Zou
# Runs `mgen fit` when MODE is "train", otherwise `mgen test`, both with the
# peptide_HLA_MHC_affinity.yaml config.
MODE=train

RUN_NAME=peptide_HLA_MHC_affinity_AIDO.Protein_16B
PROJECT=xtrimo_benchmark
CKPT_SAVE_DIR=logs/${PROJECT}/${RUN_NAME}
CONFIG_FILE=experiments/AIDO.Protein/xTrimo/configs/peptide_HLA_MHC_affinity.yaml

# "$MODE" is quoted so an empty or multi-word value cannot break the test
# expression; `=` is the portable string-equality operator for `[`.
if [ "$MODE" = "train" ]; then
  # using slurm script with 2 nodes (8 gpus in total) for training
  srun mgen fit --config $CONFIG_FILE \
    --trainer.logger.name $RUN_NAME \
    --trainer.logger.project $PROJECT \
    --trainer.callbacks.dirpath $CKPT_SAVE_DIR \
    --trainer.num_nodes 2
else
  # CKPT_PATH stores a glob pattern. It is expanded where $CKPT_PATH is used
  # unquoted below, so that use must stay unquoted.
  CKPT_PATH=${CKPT_SAVE_DIR}/best_val*
  mgen test --config $CONFIG_FILE \
    --data.batch_size 16 \
    --model.strict_loading False \
    --model.reset_optimizer_states True \
    --trainer.logger null \
    --ckpt_path $CKPT_PATH
fi
# Bash completion handler registered for the `genbio-aidosp` command.
#
# Runs the command passed as $1 with the current completion state exported
# through COMP_WORDS / COMP_CWORD and _GENBIO_AIDOSP_COMPLETE=bash_complete,
# then converts every "type,value" line of its output into a completion reply.
_genbio_aidosp_completion() {
    # Split on newlines only: affects both the "${COMP_WORDS[*]}" join below
    # and the word splitting of $response in the for loop.
    local IFS=$'\n'
    local response

    response=$(env COMP_WORDS="${COMP_WORDS[*]}" COMP_CWORD=$COMP_CWORD _GENBIO_AIDOSP_COMPLETE=bash_complete $1)

    for completion in $response; do
        # Each line is "<type>,<value>"; IFS is overridden for this read only.
        IFS=',' read type value <<< "$completion"

        if [[ $type == 'dir' ]]; then
            # Discard collected replies and let bash complete directory names.
            COMPREPLY=()
            compopt -o dirnames
        elif [[ $type == 'file' ]]; then
            # Discard collected replies and fall back to default completion.
            COMPREPLY=()
            compopt -o default
        elif [[ $type == 'plain' ]]; then
            # Literal candidate supplied by the command.
            COMPREPLY+=($value)
        fi
    done

    return 0
}

# Registers the handler above for `genbio-aidosp`; `nosort` keeps the
# candidates in the order they were produced.
_genbio_aidosp_completion_setup() {
    complete -o nosort -F _genbio_aidosp_completion genbio-aidosp
}

# Install the completion as soon as this file is sourced.
_genbio_aidosp_completion_setup;
def make_them_on_same_device(*args):
    """Move every argument to the CPU if at least one of them already is there.

    Each argument must expose ``.device.type`` and ``.cpu()``
    (presumably torch tensors -- confirm against callers).

    Returns:
        A new ``list`` of ``arg.cpu()`` results when any argument reports a
        device type of ``"cpu"``; otherwise the original ``args`` tuple,
        untouched.
    """
    device_kinds = [item.device.type for item in args]
    if "cpu" not in device_kinds:
        # Nothing lives on the CPU: hand the inputs back as they came in.
        return args
    return [item.cpu() for item in args]
mkdocs gh-deploy --config-file docs/mkdocs.yml --force 29 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/openfold_local/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright 2025 GenBio AI. 3 | # Copyright 2024 ByteDance and/or its affiliates. 4 | # Copyright 2021 AlQuraishi Laboratory 5 | # Copyright 2021 DeepMind Technologies Limited 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | 20 | from . import data, model, np, utils 21 | 22 | __all__ = ["model", "utils", "np", "data"] 23 | -------------------------------------------------------------------------------- /modelgenerator/structure_tokenizer/README.md: -------------------------------------------------------------------------------- 1 | # Protein Structure Tokenizer 2 | This is the implementation for [genbio-ai/AIDO.StructureTokenizer](https://huggingface.co/genbio-ai/AIDO.StructureTokenizer). Due to the properties of protein data, it has a standalone data pipeline. 
The overall structure of this folder is as follows: 3 | 4 | - `callbacks`: Contains the callbacks used in saving structure tokens and PDB files 5 | - `configs`: Contains the configuration files for the model and data 6 | - `datasets`: Contains the dataset classes for handling PDB data and the data module 7 | - `layers`: Contains the custom layers used in the model 8 | - `models`: Contains the encoder (`equiformer_encoder.py`), decoder (`esmfold_decoder.py`), the full model (`structure_tokenizer.py`), and its lightning module (`structure_tokenizer_lightning.py`) 9 | - `utils`: Miscellaneous utility functions 10 | 11 | For the usage of this model, please refer to `experiments/AIDO.StructureTokenizer/README.md`. -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/model/layer_norm/kernel/compat.h: -------------------------------------------------------------------------------- 1 | // modified from https://github.com/NVIDIA/apex/blob/master/csrc/compat.h 2 | // Copyright 2021- HPC-AI Technology Inc. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
@contextlib.contextmanager
def tmpdir_manager(base_dir: Optional[str] = None):
    """Yield a freshly created temporary directory and remove it afterwards.

    Args:
        base_dir: Directory in which the temporary directory is created;
            passed to ``tempfile.mkdtemp`` as ``dir``.
    """
    scratch_dir = tempfile.mkdtemp(dir=base_dir)
    try:
        yield scratch_dir
    finally:
        # Best-effort cleanup: removal errors are deliberately ignored.
        shutil.rmtree(scratch_dir, ignore_errors=True)


@contextlib.contextmanager
def timing(msg: str):
    """Log when the wrapped block starts and how long it took to finish.

    The "Finished" record is only emitted when the block exits normally.
    """
    logging.info("Started %s", msg)
    started_at = time.perf_counter()
    yield
    elapsed = time.perf_counter() - started_at
    logging.info("Finished %s in %.3f seconds", msg, elapsed)


def to_date(s: str):
    """Build a ``datetime`` from the fixed positions of a ``YYYY?MM?DD`` string.

    Characters 0-3, 5-6 and 8-9 are read as year, month and day; the
    separator characters at positions 4 and 7 are not inspected.
    """
    year, month, day = int(s[:4]), int(s[5:7]), int(s[8:10])
    return datetime.datetime(year=year, month=month, day=day)
if __name__ == '__main__':
    # Command-line entry point: convert a job JSON file into a FASTA file.
    import argparse
    parser = argparse.ArgumentParser(description='Construct FASTA from job JSON for protein MSA retrieval.')
    # Both paths are mandatory. Without `required=True` a missing option
    # reaches json2fasta() as None and fails with an opaque TypeError from open().
    parser.add_argument('--input', type=str, required=True, help='Input JSON file')
    parser.add_argument('--output', type=str, required=True, help='Output FASTA file')
    args = parser.parse_args()
    json2fasta(args.input, args.output)
# Shuxian Zou
# Runs `mgen fit` when MODE is "train"; otherwise runs `mgen test` once per
# test split listed in the loop below. Both use splice_site_prediction.yaml
# and the dataset variant selected by SPLICE_SITE.
MODE=train
SPLICE_SITE=acceptor #'acceptor' 'donor'

RUN_NAME=csp_${SPLICE_SITE}_aido_rna_1b600m
# "$MODE" is quoted so an empty or multi-word value cannot break the test
# expression; `=` is the portable string-equality operator for `[`.
if [ "$MODE" = "train" ]; then
  CKPT_SAVE_DIR=logs/rna_tasks/${RUN_NAME}
  CUDA_VISIBLE_DEVICES=0,1 mgen fit --config experiments/AIDO.RNA/configs/splice_site_prediction.yaml \
    --data.config_name splice_site_${SPLICE_SITE} \
    --trainer.logger.name $RUN_NAME \
    --trainer.callbacks.dirpath $CKPT_SAVE_DIR
else
  # CKPT_PATH stores a glob pattern. It is expanded where $CKPT_PATH is used
  # unquoted below, so that use must stay unquoted.
  CKPT_PATH=logs/rna_tasks/${RUN_NAME}/best_val*
  for TEST_TYPE in danio fly worm thaliana
  do
    echo $TEST_TYPE
    mgen test --config experiments/AIDO.RNA/configs/splice_site_prediction.yaml \
      --data.config_name splice_site_${SPLICE_SITE} \
      --data.test_split_name test_$TEST_TYPE \
      --data.batch_size 256 \
      --model.strict_loading False \
      --model.reset_optimizer_states True \
      --trainer.logger null \
      --ckpt_path $CKPT_PATH
  done
fi
def get_field_names(cls):
    """Return the field names of a dataclass (or dataclass instance) as a list.

    Names come from ``dataclasses.fields``, which raises ``TypeError`` when
    *cls* is not a dataclass.
    """
    return [field.name for field in dataclasses.fields(cls)]
def hash_seq(seq, method='md5'):
    """Return the hexadecimal digest of a string sequence.

    The sequence is encoded as UTF-8 before hashing. The digest is used as an
    identifier, not for security, so the hasher is created with
    ``usedforsecurity=False``; this keeps md5 usable on FIPS-restricted
    Python builds.

    :param seq: string to hash.
    :param method: name of any fixed-length algorithm known to ``hashlib``
        (default ``"md5"``).
    :return: lowercase hexadecimal digest string.
    :raises NotImplementedError: if ``method`` is not a supported algorithm.
    """
    try:
        hasher = hashlib.new(method, usedforsecurity=False)
    except (TypeError, ValueError) as err:
        # Keep the original exception type for unsupported methods, but say
        # which method was rejected.
        raise NotImplementedError(f"Unsupported hash method: {method!r}") from err
    if hasher.digest_size == 0:
        # Variable-length digests (the SHAKE family) need an explicit output
        # length, which this interface has no way to provide.
        raise NotImplementedError(f"Unsupported hash method: {method!r}")
    hasher.update(seq.encode(encoding='utf-8'))
    code = hasher.hexdigest()

    return code
class_path: modelgenerator.structure_tokenizer.callbacks.WriterPDBCallback 11 | dict_kwargs: 12 | dirpath: "logs/protstruct_model/" 13 | 14 | model: 15 | class_path: modelgenerator.structure_tokenizer.models.StructureTokenizerLightning 16 | init_args: 17 | pretrained_model_name_or_path: "genbio-ai/AIDO.StructureTokenizer" 18 | 19 | data: 20 | class_path: modelgenerator.structure_tokenizer.datasets.ProteinLightningDataModule 21 | init_args: 22 | config: 23 | num_workers: 0 24 | seed: 0 25 | proteins_datasets_configs: 26 | - name: "casp15" 27 | registry_path: "data/protstruct_sample_data/registries/casp15_merged.csv" 28 | folder_path: "data/protstruct_sample_data/CASP15_merged/" 29 | max_nb_res: 1024 30 | batch_size: 2 31 | seed: 0 32 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/openfold_local/utils/precision_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 GenBio AI. 2 | # Copyright 2024 ByteDance and/or its affiliates. 3 | # Copyright 2021 AlQuraishi Laboratory 4 | # Copyright 2021 DeepMind Technologies Limited 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
def is_fp16_enabled():
    """Report whether autocast is currently active with a float16 GPU dtype.

    Returns ``True`` only when ``torch.get_autocast_gpu_dtype()`` is
    ``torch.float16`` and ``torch.is_autocast_enabled()`` is true.
    """
    # Autocast world
    if torch.get_autocast_gpu_dtype() != torch.float16:
        return False
    return torch.is_autocast_enabled()
class Error(Exception):
    """Root of the exception hierarchy defined in this module."""


class MultipleChainsError(Error):
    """Raised when more than one chain is found for a given ID."""
"data/protstruct_sample_data/registries/casp15_merged.csv" 29 | folder_path: "data/protstruct_sample_data/CASP15_merged/" 30 | max_nb_res: 1024 31 | batch_size: 2 32 | seed: 0 33 | -------------------------------------------------------------------------------- /experiments/AIDO.StructureTokenizer/structure_encoding.sh: -------------------------------------------------------------------------------- 1 | # 1. download the sample dataset 2 | huggingface-cli download genbio-ai/sample-structure-dataset --repo-type dataset --local-dir ./data/protstruct_sample_data/ 3 | 4 | set -ex 5 | 6 | # 2. run encoding and then decoding 7 | # check logs/protstruct_model/casp15_pdb_files 8 | # *_input.pdb are the original pdb files 9 | # *_output.pdb are the reconstructed pdb files 10 | echo "run encoding and then decoding" 11 | CUDA_VISIBLE_DEVICES=0 mgen predict --config=experiments/AIDO.StructureTokenizer/encode_decode.yaml 12 | 13 | 14 | # 3. run encoding only 15 | # check logs/protstruct_encode/casp15_struct_tokens.pt for the output tokens 16 | # logs/protstruct_encode/codebook.pt for the codebook 17 | echo "run encoding only" 18 | CUDA_VISIBLE_DEVICES=0 mgen predict --config=experiments/AIDO.StructureTokenizer/encode.yaml 19 | 20 | # 4. 
# Shuxian Zou
# For each cross-validation fold 0..9, runs `mgen fit` when MODE is "train"
# and `mgen test` otherwise, using translation_efficiency.yaml and the
# dataset variant selected by CELL_LINE.
MODE=train
CELL_LINE=Muscle #'Muscle' 'HEK' 'pc3'

# "$MODE" is quoted so an empty or multi-word value cannot break the test
# expression; `=` is the portable string-equality operator for `[`.
if [ "$MODE" = "train" ]; then
  for FOLD in {0..9}
  do
    RUN_NAME=te_${CELL_LINE}_aido_rna_1b600m_fold${FOLD}
    CKPT_SAVE_DIR=logs/rna_tasks/${RUN_NAME}
    CUDA_VISIBLE_DEVICES=0 mgen fit --config experiments/AIDO.RNA/configs/translation_efficiency.yaml \
      --data.config_name translation_efficiency_${CELL_LINE} \
      --data.cv_test_fold_id $FOLD \
      --trainer.logger.name $RUN_NAME \
      --trainer.callbacks.dirpath $CKPT_SAVE_DIR
  done
else
  for FOLD in {0..9}
  do
    # CKPT_PATH stores a glob pattern. It is expanded where $CKPT_PATH is
    # used unquoted below, so that use must stay unquoted.
    CKPT_PATH=logs/rna_tasks/te_${CELL_LINE}_aido_rna_1b600m_fold${FOLD}/best_val*
    echo ">>> Fold ${FOLD}"
    mgen test --config experiments/AIDO.RNA/configs/translation_efficiency.yaml \
      --data.config_name translation_efficiency_${CELL_LINE} \
      --data.cv_test_fold_id $FOLD \
      --model.strict_loading True \
      --model.reset_optimizer_states True \
      --trainer.logger null \
      --ckpt_path $CKPT_PATH
  done
fi
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /modelgenerator/rna_inv_fold/gRNAde_structure_encoder/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Chaitanya K. Joshi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /experiments/AIDO.RNA/protein_abundance_prediction.sh: -------------------------------------------------------------------------------- 1 | # Shuxian Zou 2 | MODE=train 3 | ORGANISM=hsapiens #'athaliana' 'dmelanogaster' 'ecoli' 'hsapiens' 'scerevisiae' 4 | 5 | PROJECT=rna_tasks 6 | if [ $MODE == "train" ]; then 7 | for FOLD in {0..4} 8 | do 9 | RUN_NAME=pa_${ORGANISM}_aido_rna_1b600mـcds_fold${FOLD} 10 | CKPT_SAVE_DIR=logs/${PROJECT}/${RUN_NAME} 11 | mgen fit --config experiments/AIDO.RNA/configs/protein_abundance.yaml \ 12 | --data.config_name protein_abundance_${ORGANISM} \ 13 | --data.cv_test_fold_id $FOLD \ 14 | --trainer.logger.name $RUN_NAME \ 15 | --trainer.callbacks.dirpath $CKPT_SAVE_DIR 16 | done 17 | else 18 | for FOLD in {0..4} 19 | do 20 | RUN_NAME=pa_${ORGANISM}_aido_rna_1b600mـcds_fold${FOLD} 21 | CKPT_PATH=logs/${PROJECT}/${RUN_NAME}/best_val* 22 | echo ">>> Fold ${FOLD}" 23 | mgen test --config experiments/AIDO.RNA/configs/protein_abundance.yaml \ 24 | --data.config_name protein_abundance_${ORGANISM} \ 25 | --data.cv_test_fold_id $FOLD \ 26 | --model.strict_loading False \ 27 | --model.reset_optimizer_states True \ 28 | --trainer.logger null \ 29 | --ckpt_path $CKPT_PATH 30 | done 31 | fi 32 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.11.7 4 | hooks: 5 | - id: ruff 6 | args: [--config, pyproject.toml, --fix] 7 | - id: ruff-format 8 | args: [--config, pyproject.toml] 9 | - repo: https://github.com/pre-commit/pre-commit-hooks 10 | rev: v5.0.0 11 | hooks: 12 | - id: trailing-whitespace 13 | exclude: ^modelgenerator/(huggingface_models|prot_inv_fold|rna_inv_fold|rna_ss|structure_tokenizer) 14 | - id: end-of-file-fixer 15 | exclude: 
^modelgenerator/(huggingface_models|prot_inv_fold|rna_inv_fold|rna_ss|structure_tokenizer) 16 | - id: check-yaml 17 | exclude: ^modelgenerator/(huggingface_models|prot_inv_fold|rna_inv_fold|rna_ss|structure_tokenizer) 18 | - id: debug-statements 19 | exclude: ^modelgenerator/(huggingface_models|prot_inv_fold|rna_inv_fold|rna_ss|structure_tokenizer) 20 | - id: check-added-large-files 21 | exclude: ^modelgenerator/(huggingface_models|prot_inv_fold|rna_inv_fold|rna_ss|structure_tokenizer) 22 | - repo: https://github.com/python-poetry/poetry 23 | rev: 2.1.2 24 | hooks: 25 | - id: poetry-check 26 | - id: poetry-lock 27 | -------------------------------------------------------------------------------- /experiments/AIDO.RNA/transcript_abundance_prediction.sh: -------------------------------------------------------------------------------- 1 | # Shuxian Zou 2 | MODE=train 3 | ORGANISM=ecoli #'athaliana' 'dmelanogaster' 'ecoli' 'hsapiens' 'scerevisiae' 'ppastoris' 'hvolcanii' 4 | 5 | PROJECT=rna_tasks 6 | if [ $MODE == "train" ]; then 7 | for FOLD in {0..4} 8 | do 9 | RUN_NAME=ta_${ORGANISM}_aido_rna_1b600m_fold${FOLD} 10 | CKPT_SAVE_DIR=logs/${PROJECT}/${RUN_NAME} 11 | mgen fit --config experiments/AIDO.RNA/configs/transcript_abundance.yaml \ 12 | --data.config_name transcript_abundance_${ORGANISM} \ 13 | --data.cv_test_fold_id $FOLD \ 14 | --trainer.logger.name $RUN_NAME \ 15 | --trainer.callbacks.dirpath $CKPT_SAVE_DIR 16 | done 17 | else 18 | for FOLD in {0..4} 19 | do 20 | RUN_NAME=ta_${ORGANISM}_aido_rna_1b600m_fold${FOLD} 21 | CKPT_PATH=logs/${PROJECT}/${RUN_NAME}/best_val* 22 | echo ">>> Fold ${FOLD}" 23 | mgen test --config experiments/AIDO.RNA/configs/transcript_abundance.yaml \ 24 | --data.config_name transcript_abundance_${ORGANISM} \ 25 | --data.cv_test_fold_id $FOLD \ 26 | --model.strict_loading False \ 27 | --model.reset_optimizer_states True \ 28 | --trainer.logger null \ 29 | --ckpt_path $CKPT_PATH 30 | done 31 | fi 32 | 
-------------------------------------------------------------------------------- /modelgenerator/huggingface_models/borzoi_pytorch/config_borzoi.py: -------------------------------------------------------------------------------- 1 | from transformers import PretrainedConfig 2 | 3 | class BorzoiConfig(PretrainedConfig): 4 | model_type = "borzoi" 5 | 6 | def __init__( 7 | self, 8 | dim = 1536, 9 | depth = 8, 10 | heads = 8, 11 | # output_heads = dict(human = 5313, mouse= 1643), 12 | return_center_bins_only = True, 13 | attn_dim_key = 64, 14 | attn_dim_value = 192, 15 | dropout_rate = 0.2, 16 | attn_dropout = 0.05, 17 | pos_dropout = 0.01, 18 | enable_mouse_head = False, 19 | bins_to_return = 6144, 20 | **kwargs, 21 | ): 22 | self.dim = dim 23 | self.depth = depth 24 | self.heads = heads 25 | # self.output_heads = output_heads 26 | self.attn_dim_key = attn_dim_key 27 | self.attn_dim_value = attn_dim_value 28 | self.dropout_rate = dropout_rate 29 | self.attn_dropout = attn_dropout 30 | self.pos_dropout = pos_dropout 31 | self.return_center_bins_only = return_center_bins_only 32 | self.enable_mouse_head = enable_mouse_head 33 | self.bins_to_return = bins_to_return 34 | super().__init__(**kwargs) -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/data/ccd_data.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 GenBio AI 2 | # Copyright 2024 ByteDance and/or its affiliates. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from pathlib import Path 17 | 18 | COMPONENTS_FILE = None 19 | RKDIT_MOL_PKL = None 20 | 21 | def set_components_file(components_file): 22 | global COMPONENTS_FILE 23 | COMPONENTS_FILE = components_file 24 | 25 | def set_rkdit_mol_pkl(rkdit_mol_pkl): 26 | global RKDIT_MOL_PKL 27 | RKDIT_MOL_PKL = Path(rkdit_mol_pkl) 28 | 29 | 30 | def get_components_file(): 31 | global COMPONENTS_FILE 32 | return COMPONENTS_FILE 33 | 34 | def get_rkdit_mol_pkl(): 35 | global RKDIT_MOL_PKL 36 | return RKDIT_MOL_PKL 37 | -------------------------------------------------------------------------------- /modelgenerator/structure_tokenizer/utils/misc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from omegaconf import OmegaConf 4 | 5 | 6 | def get_config_from_dict(config_dict: dict, config: type): 7 | schema = OmegaConf.structured(config) 8 | config = OmegaConf.create(config_dict) 9 | merged = OmegaConf.merge(schema, config) 10 | return OmegaConf.to_object(merged) 11 | 12 | 13 | def cdist( 14 | x: torch.Tensor, 15 | y: torch.Tensor, 16 | mask_x: torch.Tensor | None = None, 17 | mask_y: torch.Tensor | None = None, 18 | zero_diag: bool = False, 19 | ) -> torch.Tensor: 20 | # where mask is False, the distance is set to inf 21 | cdist = torch.cdist(x, y) 22 | if zero_diag: 23 | assert ( 24 | cdist.shape[-1] == cdist.shape[-2] 25 | ), f"Zeroing diagonal is only supported for square matrix, got {cdist.shape}" 26 | N = cdist.shape[-1] 27 | device = cdist.device 28 
| eye = torch.eye(N, dtype=torch.bool, device=device) 29 | cdist = torch.where(eye, 0, cdist) 30 | if mask_x is not None: 31 | cdist = torch.where(mask_x[..., :, None].bool(), cdist, np.inf) 32 | if mask_y is not None: 33 | cdist = torch.where(mask_y[..., None, :].bool(), cdist, np.inf) 34 | return cdist 35 | -------------------------------------------------------------------------------- /experiments/AIDO.DNA/dependency_mapping/depmap.csv: -------------------------------------------------------------------------------- 1 | id,sequence 2 | >KIF26B_1(+)|TATA|1,GCGCCGCCACATAAAATGGATCCCGGCCGGCGCGGCGAGGGCGGCAGGTTCCCGAGGCTCCTCCGCGCTGCGCCCGGGCGCACACGCGCGCTGTGACCGCCGACCGCTCCCGGGCCACGCGGAGCCGCCCCTCTCCCGGCCCTCGCGCAACTGTCAGGCGAAACGGGCCGGCGGATATTGGCTCGGCGACACGCCGAGGCTCCTCCCCGAGTCTGGATCTTTATATTTTGGGAGAATTTCTTTGAACTCAGTTACCAAGCTCGGTGAAGGAGACAAGTTCCCACAGCTGACTCGGCTCGG 3 | >M3ZUZ2_XIPMA|1,GATAAAACATTAAGTTGTCCTGAAGCGGTTTGACGTTACGTTTCACTGTTTAAGGACAAGGAGGCCGCGTCACGATGGTCCCCATCTTCACACTGAAGCTAAACCACAAGATTAACCCCCGCATGGTGGCTGTTGGAAAGTTTGATGGAGTGCACCCATGTCTAACAGCAGCAACACAAGCAGGAAAGGTGAGGGGAATATGTAGCATAACTGCTCAGCCTGCAGGAGGCTTCAAAGTTGCTGAAGGAACATACAGTATATCAAATATTTTCATTTTAACCTTAACTGTTCTTCATTTACAGGTTTTCATTCACAACCCTCATGATCGTGGTCAGAGACCTGCGACCCATCGACTGAGCCAGAGCACCCAGGACTCTGATATCTCTCTTCTCAACATCAACCAGGCCGTAACATGTTTGACTGCAGGGACACTGGGACCAAACACCACAGGAGACACGCTTCTGGTGGGATCTCAGACCAATCTGTTGGCCTATGATGTTCACGACAATACAGATGTTTTTTACAGAGACGTAAGTGGAAGAACTATCTTTGGGGTCACTGGATGTAGAGCAGACTCCTTTTTTTGTGATGTTTGTTCAT 4 | >SV2C_1(+)|no_TATA|1,CCCAGTCCCACACCGCAGCAGCGCCTCAGCACCGCGACTTGCCGGAGCACCGCGAGTGGCGCGCGGGTCCCGCCTCCCCCGCGCGCCGTGACTCCCTGCGCACCGCTGGTACTCTCGCCACGCCGCCGCCCGGCACTGCAGCACCAGGGGGAGGAGGCAGGCGGAGGAGAGGAGGAGGACCGCAGCGTGCAAGCCGGGAGCCACTTTCCCGCCCCTCCTCTCGCCGCTGACACGCTCAGAGGAGTCACCACTCCGCGCGCTGCAGGCGAGAGTGGCAGACGGAGGCAGCCCGGGGAAGCG 5 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/utils/logger.py: 
-------------------------------------------------------------------------------- 1 | 2 | # Copyright 2025 GenBio AI 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | # -*-coding:utf-8-*- 18 | 19 | import logging.config 20 | 21 | 22 | def singleton(cls): 23 | instances = {} 24 | 25 | def get_instance(): 26 | if cls not in instances: 27 | instances[cls] = cls() 28 | return instances[cls] 29 | 30 | return get_instance() 31 | 32 | 33 | @singleton 34 | class Logger: 35 | def __init__(self): 36 | logging.basicConfig( 37 | format="%(asctime)s %(levelname)s %(process)d [%(filename)s:%(lineno)d] %(message)s", 38 | level=logging.INFO, 39 | ) 40 | self.logger = logging.getLogger("root") 41 | -------------------------------------------------------------------------------- /modelgenerator/structure_tokenizer/configs/lightning_configs.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from enum import Enum 3 | from pathlib import Path 4 | from typing import Any 5 | 6 | 7 | class Device(str, Enum): 8 | CPU = "cpu" 9 | CUDA = "cuda" 10 | 11 | 12 | @dataclass 13 | class ValidationExperimentConfig: 14 | name: str # name of the project 15 | wandb_project: str 16 | output_dir: str | Path # runs directory 17 | path_ckpt: str | Path 18 | device: Device 19 | log_every_n_steps: int 20 | loggers: list[Any] 21 | devices: Any = "auto" # list[int] | str | int 22 | 
seed: int = 0 # not necessary if no cropping 23 | 24 | 25 | @dataclass 26 | class EncodingConfig: 27 | folder_name: str # folder name 28 | output_dir: str | Path # output directory 29 | path_ckpt: str | Path 30 | device: Device 31 | return_predictions: bool 32 | devices: Any = "auto" # list[int] | str | int 33 | seed: int = 0 # not necessary if no cropping 34 | 35 | 36 | @dataclass 37 | class DecodingConfig: 38 | folder_name: str # folder name 39 | output_dir: str | Path # output directory 40 | path_ckpt: str | Path 41 | device: Device 42 | devices: Any = "auto" # list[int] | str | int 43 | seed: int = 0 # not necessary ? 44 | -------------------------------------------------------------------------------- /experiments/AIDO.RNA/rna_secondary_structure_prediction/rna_secondary_structure_prediction.sh: -------------------------------------------------------------------------------- 1 | # Sazan Mahbub 2 | MODE=$1 ## set it to "train" for finetuning the RNA-FM for RNA secondary structure prediction 3 | 4 | RUN_NAME=rna_ss 5 | DATASET_NAME=$2 ## set the name of the dataset 6 | CKPT_SAVE_DIR=logs/${RUN_NAME}/${DATASET_NAME} 7 | 8 | if [ $MODE == "train" ]; then 9 | mgen fit --config rna_ss_prediction.yaml \ 10 | --data.path ${MGEN_DATA_DIR}/modelgenerator/datasets/rna_ss_data/ \ 11 | --data.dataset ${DATASET_NAME} \ 12 | --trainer.default_root_dir ${CKPT_SAVE_DIR} \ 13 | --trainer.callbacks.ft_schedule_path ft_schedules/layers_0_32.yaml \ 14 | --trainer.devices 0,1,2,3 15 | 16 | else 17 | # CKPT_PATH=${MGEN_DATA_DIR}/modelgenerator/huggingface_models/rna_ss/AIDO.RNA-1.6B-${DATASET_NAME}_secondary_structure_prediction/model.ckpt 18 | CKPT_PATH=$3 ## set the path to the checkpoint file (example shown in the commented line above) 19 | mgen test --config rna_ss_prediction.yaml \ 20 | --data.path ${MGEN_DATA_DIR}/modelgenerator/datasets/rna_ss_data/ \ 21 | --data.dataset ${DATASET_NAME} \ 22 | --trainer.default_root_dir ${CKPT_SAVE_DIR} \ 23 | 
--trainer.callbacks.ft_schedule_path ft_schedules/layers_0_32.yaml \ 24 | --ckpt_path ${CKPT_PATH} \ 25 | --trainer.devices 0, 26 | fi 27 | -------------------------------------------------------------------------------- /experiments/AIDO.RNA/multimodal_isoform_expression/isoform_expression_prediction.sh: -------------------------------------------------------------------------------- 1 | MODE=train 2 | 3 | RUN_NAME=enformer_rnafm1.6b-cds_esm2_concat_fusion 4 | CONFIG_FILE=experiments/AIDO.MM/configs/isoform_expression_dna_rna_prot_concat.yaml 5 | 6 | # RUN_NAME=enformer_rnafm1.6b-cds_concat_fusion 7 | # CONFIG_FILE=experiments/AIDO.MM/configs/isoform_expression_dna_rna_concat.yaml 8 | 9 | # RUN_NAME=enformer_aidorna650m_esm2_attention_fusion 10 | # CONFIG_FILE=experiments/AIDO.MM/configs/isoform_expression_dna_rna_prot_attention.yaml 11 | 12 | # RUN_NAME=enformer_aidorna650m_attention_fusion 13 | # CONFIG_FILE=experiments/AIDO.MM/configs/isoform_expression_dna_rna_attention.yaml 14 | 15 | PROJECT=isoform_tasks 16 | CKPT_SAVE_DIR=${GENBIO_DATA_DIR}/genbio_finetune/logs/${PROJECT}/${RUN_NAME} 17 | 18 | if [ $MODE == "train" ]; then 19 | mgen fit --config $CONFIG_FILE \ 20 | --trainer.logger.name $RUN_NAME \ 21 | --trainer.logger.project $PROJECT \ 22 | --trainer.callbacks.dirpath $CKPT_SAVE_DIR \ 23 | --model.optimizer.lr 1e-4 \ 24 | --data.batch_size 1 25 | else 26 | CKPT_PATH=${CKPT_SAVE_DIR}/best_val* 27 | mgen test --config $CONFIG_FILE \ 28 | --data.batch_size 16 \ 29 | --trainer.logger null \ 30 | --model.strict_loading False \ 31 | --model.reset_optimizer_states True \ 32 | --ckpt_path $CKPT_PATH 33 | fi 34 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/openfold_local/utils/kernel/csrc/softmax_cuda_stub.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2021 AlQuraishi Laboratory 2 | // 3 | // Licensed under the Apache License, 
Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // modified from fastfold/model/fastnn/kernel/cuda_native/csrc/softmax_cuda.cpp 16 | 17 | #include <torch/extension.h> 18 | 19 | void attn_softmax_inplace_forward_( 20 | at::Tensor input, 21 | long long rows, int cols 22 | ) 23 | { 24 | throw std::runtime_error("attn_softmax_inplace_forward_ not implemented on CPU"); 25 | }; 26 | void attn_softmax_inplace_backward_( 27 | at::Tensor output, 28 | at::Tensor d_ov, 29 | at::Tensor values, 30 | long long rows, 31 | int cols_output, 32 | int cols_values 33 | ) 34 | { 35 | throw std::runtime_error("attn_softmax_inplace_backward_ not implemented on CPU"); 36 | }; 37 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/openfold_local/utils/geometry/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 GenBio AI. 2 | # Copyright 2024 ByteDance and/or its affiliates. 3 | # Copyright 2021 AlQuraishi Laboratory 4 | # Copyright 2021 DeepMind Technologies Limited 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | from fold.openfold_local.utils.geometry import rigid_matrix_vector 20 | from fold.openfold_local.utils.geometry import rotation_matrix 21 | from fold.openfold_local.utils.geometry import vector 22 | 23 | Rot3Array = rotation_matrix.Rot3Array 24 | Rigid3Array = rigid_matrix_vector.Rigid3Array 25 | 26 | Vec3Array = vector.Vec3Array 27 | square_euclidean_distance = vector.square_euclidean_distance 28 | euclidean_distance = vector.euclidean_distance 29 | dihedral_angle = vector.dihedral_angle 30 | dot = vector.dot 31 | cross = vector.cross 32 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/src/genbio/aidosp/cli/completions/commands.py: -------------------------------------------------------------------------------- 1 | from importlib.resources import files 2 | 3 | import click 4 | 5 | 6 | @click.command() 7 | @click.option( 8 | "-s", 9 | "--shell", 10 | type=click.Choice(["bash", "zsh", "fish"]), 11 | default="bash", 12 | help="Shell to generate completion script for", 13 | ) 14 | def completion(shell: str) -> None: 15 | """Generate shell completion script and exit 16 | 17 | This command generates a shell completion script for the GenBio AIDO Structure Prediction CLI. 
18 | The generated script should be added to your shell configuration file: 19 | 20 | \b 21 | - bash: ~/.bashrc 22 | - zsh: ~/.zshrc 23 | - fish: ~/.config/fish/completions/genbio-aidosp.fish 24 | 25 | After modifying the shell config, you need to start a new shell in order for the changes to be loaded. 26 | """ 27 | completions = files("genbio.aidosp.cli.completions") 28 | if shell == "bash": 29 | completion_script = completions.joinpath("genbio-aidosp-complete.bash") 30 | elif shell == "zsh": 31 | completion_script = completions.joinpath("genbio-aidosp-complete.zsh") 32 | elif shell == "fish": 33 | completion_script = completions.joinpath("genbio-aidosp.fish") 34 | else: 35 | raise click.BadParameter(f"Unsupported shell: {shell}") 36 | 37 | click.echo(completion_script.read_text()) 38 | -------------------------------------------------------------------------------- /modelgenerator/huggingface_models/enformer_pytorch/config_enformer.py: -------------------------------------------------------------------------------- 1 | from transformers import PretrainedConfig 2 | 3 | class EnformerConfig(PretrainedConfig): 4 | model_type = "enformer" 5 | 6 | def __init__( 7 | self, 8 | dim = 1536, 9 | depth = 11, 10 | heads = 8, 11 | output_heads = dict(human = 5313, mouse= 1643), 12 | target_length = 896, 13 | attn_dim_key = 64, 14 | dropout_rate = 0.4, 15 | attn_dropout = 0.05, 16 | pos_dropout = 0.01, 17 | use_checkpointing = False, 18 | use_convnext = False, 19 | num_downsamples = 7, # genetic sequence is downsampled 2 ** 7 == 128x in default Enformer - can be changed for higher resolution 20 | dim_divisible_by = 128, 21 | use_tf_gamma = False, 22 | **kwargs, 23 | ): 24 | self.dim = dim 25 | self.depth = depth 26 | self.heads = heads 27 | self.output_heads = output_heads 28 | self.target_length = target_length 29 | self.attn_dim_key = attn_dim_key 30 | self.dropout_rate = dropout_rate 31 | self.attn_dropout = attn_dropout 32 | self.pos_dropout = pos_dropout 33 | 
self.use_checkpointing = use_checkpointing 34 | self.num_downsamples = num_downsamples 35 | self.dim_divisible_by = dim_divisible_by 36 | self.use_tf_gamma = use_tf_gamma 37 | 38 | super().__init__(**kwargs) -------------------------------------------------------------------------------- /experiments/AIDO.Cell/sctab_conversion.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | import anndata as ad 5 | import tiledbsoma.io 6 | from tqdm import tqdm 7 | 8 | # Change the root path to your downloaded location 9 | sctab_root_path = "merlin_cxg_2023_05_15_sf-log1p/" 10 | # Output path 11 | soma_exp_output_path = "soma-exp-scTab/" 12 | 13 | if not os.path.isdir(soma_exp_output_path): 14 | os.makedirs(soma_exp_output_path) 15 | 16 | for split in ["train", "val", "test"]: 17 | df = pd.DataFrame() 18 | for fname in tqdm(os.listdir(os.path.join(sctab_root_path, split)), desc=f"Loading {split} data files"): 19 | if not fname.endswith('.parquet'): 20 | continue 21 | fpath = os.path.join(sctab_root_path, split, fname) 22 | # Read the parquet file into a pandas DataFrame 23 | df = pd.concat([df, pd.read_parquet(fpath)]) 24 | 25 | print("Converting ...") 26 | # Create AnnData object with the data 27 | adata = ad.AnnData(np.array(list(df['X']))) 28 | adata.obs = df[['cell_type']] 29 | adata.var = pd.read_parquet(os.path.join(sctab_root_path, "var.parquet")) 30 | # Save the data object into a TileDB experiment folder 31 | tiledbsoma.io.from_anndata( 32 | experiment_uri=os.path.join(soma_exp_output_path, split), 33 | measurement_name="RNA", 34 | anndata=adata 35 | ) 36 | print(f"Data conversion for split '{split}' is done!" 
37 | ) 38 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/src/genbio/aidosp/msa_retrieve/msar/utils/io_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import ml_collections 3 | import yaml 4 | from genbio.aidosp.msa_retrieve.msar.utils.logger import Logger 5 | 6 | logger = Logger.logger 7 | 8 | 9 | def load_yaml(yaml_path): 10 | with open(yaml_path) as f: 11 | hyp = yaml.load(f, Loader=yaml.SafeLoader) 12 | config = ml_collections.ConfigDict(hyp) 13 | 14 | return config 15 | 16 | 17 | def read_fasta(fasta_string: str): 18 | """Parses FASTA string and returns list of strings with amino-acid sequences. 19 | 20 | Arguments: 21 | fasta_string: The string contents of a FASTA file. 22 | 23 | Returns: 24 | A tuple of two lists: 25 | * A list of sequences. 26 | * A list of sequence descriptions taken from the comment lines. In the 27 | same order as the sequences. 28 | """ 29 | sequences = [] 30 | descriptions = [] 31 | index = -1 32 | for line in fasta_string.splitlines(): 33 | line = line.strip() 34 | if line.startswith(">"): 35 | index += 1 36 | descriptions.append(line[1:]) # Remove the '>' at the beginning. 37 | sequences.append("") 38 | continue 39 | elif line.startswith("#"): 40 | continue 41 | elif not line: 42 | continue # Skip blank lines. 43 | sequences[index] += line 44 | 45 | return sequences, descriptions 46 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/src/genbio/aidosp/cli/completions/genbio-aidosp-complete.zsh: -------------------------------------------------------------------------------- 1 | #compdef genbio-aidosp 2 | 3 | _genbio_aidosp_completion() { 4 | local -a completions 5 | local -a completions_with_descriptions 6 | local -a response 7 | (( ! 
$+commands[genbio-aidosp] )) && return 1 8 | 9 | response=("${(@f)$(env COMP_WORDS="${words[*]}" COMP_CWORD=$((CURRENT-1)) _GENBIO_AIDOSP_COMPLETE=zsh_complete genbio-aidosp)}") 10 | 11 | for type key descr in ${response}; do 12 | if [[ "$type" == "plain" ]]; then 13 | if [[ "$descr" == "_" ]]; then 14 | completions+=("$key") 15 | else 16 | completions_with_descriptions+=("$key":"$descr") 17 | fi 18 | elif [[ "$type" == "dir" ]]; then 19 | _path_files -/ 20 | elif [[ "$type" == "file" ]]; then 21 | _path_files -f 22 | fi 23 | done 24 | 25 | if [ -n "$completions_with_descriptions" ]; then 26 | _describe -V unsorted completions_with_descriptions -U 27 | fi 28 | 29 | if [ -n "$completions" ]; then 30 | compadd -U -V unsorted -a completions 31 | fi 32 | } 33 | 34 | if [[ $zsh_eval_context[-1] == loadautofunc ]]; then 35 | # autoload from fpath, call function directly 36 | _genbio_aidosp_completion "$@" 37 | else 38 | # eval/source/. command, register function for later 39 | compdef _genbio_aidosp_completion genbio-aidosp 40 | fi 41 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/cuda:12.4.0-devel-ubuntu22.04 AS build 2 | WORKDIR /workspace 3 | # TODO: using conda just to get a Python binary is probably overkill 4 | RUN apt update && apt install -y wget git 5 | RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda3.sh && \ 6 | bash miniconda3.sh -b -u -p /opt/conda 7 | RUN /opt/conda/bin/conda create -y -n finetune python=3.10 8 | ENV PATH=/opt/conda/envs/finetune/bin:$PATH 9 | 10 | # TODO: change to git clone when repos are public 11 | COPY modelgenerator modelgenerator 12 | COPY pyproject.toml . 13 | COPY README.md . 
14 | 15 | RUN pip install --upgrade pip 16 | RUN pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 17 | RUN pip install flash_attn==2.7.4.post1 18 | 19 | ## RNA and Protein inverse folding requirements 20 | RUN pip install torch_geometric==2.6.1 21 | RUN pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.6.0+cu124.html 22 | RUN pip install biopython==1.84 23 | RUN pip install MDAnalysis==2.8.0 24 | RUN pip install biotite==1.0.1 25 | RUN pip install OmegaConf 26 | 27 | WORKDIR /workspace 28 | RUN pip install -e . 29 | 30 | FROM nvcr.io/nvidia/cuda:12.4.0-runtime-ubuntu22.04 31 | WORKDIR /workspace 32 | COPY --from=build /opt/conda/envs /opt/conda/envs 33 | ENV PATH=/opt/conda/envs/finetune/bin:$PATH 34 | COPY modelgenerator modelgenerator 35 | ENV MGEN_DATA_DIR=/mgen_data 36 | RUN mkdir ${MGEN_DATA_DIR} 37 | -------------------------------------------------------------------------------- /scripts/wandb_sweep/slurm_agent.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ################################ 3 | # SLURM options # 4 | ################################ 5 | # uncomment to run multiple agents in parallel. 6 | # --array=1-X where X is the number of agents. 
7 | ##SBATCH --array=1-X 8 | #SBATCH --ntasks-per-node=1 # same as trainer.devices 9 | #SBATCH --nodes=1 # same as trainer.num_nodes 10 | #SBATCH --output=logs/R-%x.%j.out 11 | #SBATCH --error=logs/R-%x.%j.err 12 | 13 | ################################ 14 | # Python environment setup # 15 | ################################ 16 | eval "$(~/miniconda3/bin/conda shell.bash hook)" 17 | conda activate finetune 18 | 19 | ################################ 20 | #Required wandb sweep settings # 21 | ################################ 22 | export WANDB_PROJECT="autotune-test" 23 | SWEEP_ID="" 24 | 25 | ################################ 26 | # No change required below # 27 | ################################ 28 | { 29 | IFS=$'\n' read -r -d '' AGENT_DETAILS; 30 | IFS=$'\n' read -r -d '' AGENT_COMMAND; 31 | } < <((printf '\0%s\0' "$(timeout 30 srun --ntasks=1 wandb agent --count 1 $SWEEP_ID)" 1>&2) 2>&1) 32 | RUN_ID=$(echo $AGENT_DETAILS | sed -e "s/.*\[\([^]]*\)\].*/\1/g" -e "s/[\'\']//g") 33 | if [[ -z "$RUN_ID" ]]; then 34 | echo wandb agent timed out. >&2 35 | exit 1 36 | fi 37 | AGENT_COMMAND="${AGENT_COMMAND} --trainer.logger.version ${RUN_ID}" 38 | echo Training command: $AGENT_COMMAND 39 | 40 | wait 41 | srun $AGENT_COMMAND 42 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/openfold_local/utils/kernel/csrc/softmax_cuda.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2021 AlQuraishi Laboratory 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // modified from fastfold/model/fastnn/kernel/cuda_native/csrc/softmax_cuda.cpp 16 | 17 | #include <torch/extension.h> 18 | 19 | void attn_softmax_inplace_forward_( 20 | at::Tensor input, 21 | long long rows, int cols 22 | ); 23 | void attn_softmax_inplace_backward_( 24 | at::Tensor output, 25 | at::Tensor d_ov, 26 | at::Tensor values, 27 | long long rows, 28 | int cols_output, 29 | int cols_values 30 | ); 31 | 32 | 33 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 34 | m.def( 35 | "forward_", 36 | &attn_softmax_inplace_forward_, 37 | "Softmax forward (CUDA)" 38 | ); 39 | m.def( 40 | "backward_", 41 | &attn_softmax_inplace_backward_, 42 | "Softmax backward (CUDA)" 43 | ); 44 | } 45 | -------------------------------------------------------------------------------- /modelgenerator/structure_tokenizer/layers/esmfold/categorical_mixture.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | import torch 6 | 7 | 8 | class CategoricalMixture: 9 | def __init__(self, param, bins=50, start=0, end=1): 10 | # All tensors are of shape ..., bins. 11 | self.logits = param 12 | bins = torch.linspace( 13 | start, end, bins + 1, device=self.logits.device, dtype=self.logits.dtype 14 | ) 15 | self.v_bins = (bins[:-1] + bins[1:]) / 2 16 | 17 | def log_prob(self, true): 18 | # Shapes are: 19 | # self.probs: ... 
x bins 20 | # true : ... 21 | true_index = ( 22 | ( 23 | true.unsqueeze(-1) 24 | - self.v_bins[ 25 | [ 26 | None, 27 | ] 28 | * true.ndim 29 | ] 30 | ) 31 | .abs() 32 | .argmin(-1) 33 | ) 34 | nll = self.logits.log_softmax(-1) 35 | return torch.take_along_dim(nll, true_index.unsqueeze(-1), dim=-1).squeeze(-1) 36 | 37 | def mean(self): 38 | return (self.logits.softmax(-1) @ self.v_bins.unsqueeze(1)).squeeze(-1) 39 | 40 | 41 | def categorical_lddt(logits, bins=50): 42 | # Logits are ..., 37, bins. 43 | return CategoricalMixture(logits, bins=bins).mean() 44 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/utils/file_io.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 GenBio AI 2 | # Copyright 2024 ByteDance and/or its affiliates. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import json 18 | from pathlib import Path 19 | from typing import Any, Union 20 | 21 | from fold.utils.torch_utils import map_values_to_list 22 | 23 | 24 | def save_json(data: dict, output_fpath: Union[str, Path], indent: int = 4): 25 | """ 26 | Save a dictionary to a JSON file. 27 | 28 | Args: 29 | data (dict): The dictionary to be saved. 30 | output_fpath (Union[str, Path]): The output file path. 31 | indent (int, optional): The indentation level for the JSON file. Defaults to 4. 
32 | """ 33 | data_json = data.copy() 34 | data_json = map_values_to_list(data_json) 35 | with open(output_fpath, "w") as f: 36 | if indent is not None: 37 | json.dump(data_json, f, indent=indent) 38 | else: 39 | json.dump(data_json, f) 40 | -------------------------------------------------------------------------------- /experiments/AIDO.Tissue/emb.xenium.yaml: -------------------------------------------------------------------------------- 1 | seed_everything: 42 2 | data: 3 | class_path: modelgenerator.data.CellWithNeighborDataModule 4 | init_args: 5 | path: './downloads/' 6 | batch_size: 1 7 | train_split_files: 8 | - 'processed_fetal_lung_visium_xenium.xenium.convert.h5ad' 9 | valid_split_files: 10 | - 'processed_fetal_lung_visium_xenium.xenium.convert.h5ad' 11 | test_split_files: 12 | - 'processed_fetal_lung_visium_xenium.xenium.convert.h5ad' 13 | filter_columns: 14 | - 'cell_type' 15 | - 'x' 16 | - 'y' 17 | rename_columns: 18 | - 'labels' 19 | - 'x' 20 | - 'y' 21 | neighbor_num: 10 22 | num_workers: 4 23 | persistent_workers: True 24 | generate_uid: True 25 | model: 26 | class_path: modelgenerator.tasks.Embed 27 | init_args: 28 | backbone: 29 | class_path: modelgenerator.backbones.aido_tissue_3m 30 | init_args: 31 | from_scratch: False 32 | trainer: 33 | log_every_n_steps: 10 34 | precision: bf16 35 | devices: 1 36 | max_epochs: 10 37 | gradient_clip_val: 0 38 | profiler: null 39 | default_root_dir: './logs/emb.xenium' 40 | strategy: 41 | class_path: lightning.pytorch.strategies.DDPStrategy 42 | callbacks: 43 | class_path: modelgenerator.callbacks.PredictionWriter 44 | init_args: 45 | output_dir: './logs/emb.xenium/lightning_logs/pred_output' 46 | filetype: 'pt' 47 | write_cols: 48 | - 'predictions' 49 | - 'uid' 50 | return_predictions: True 51 | # TODO: Clean up parameter dependencies. 
52 | -------------------------------------------------------------------------------- /docs/docs/api_reference/trainer.md: -------------------------------------------------------------------------------- 1 | # Trainer 2 | 3 | AIDO.ModelGenerator uses the LightningCLI for configuring runs with the PyTorch Lightning Trainer. 4 | The entrypoint for the CLI is `mgen`, which can be used with the `fit`, `test`, `validate`, and `predict` commands and the `--model`, `--data`, and `--trainer` arguments and their sub-arguments. 5 | ```bash 6 | mgen fit --model ConditionalDiffusion --model.backbone aido_dna_300m \ 7 | --data ConditionalDiffusionDataModule --data.path "genbio-ai/100m-random-promoters" \ 8 | --trainer.max_epochs 1 --trainer.accelerator auto --trainer.devices auto 9 | ``` 10 | 11 | For detailed information about the LightningCLI, see the [LightningCLI documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_advanced.html). 12 | 13 | ```yaml 14 | # Example Trainer Configuration 15 | trainer: 16 | accelerator: auto 17 | strategy: lightning.pytorch.strategies.DDPStrategy 18 | devices: auto 19 | num_nodes: 1 20 | precision: bf16-mixed 21 | logger: null 22 | callbacks: 23 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint 24 | init_args: 25 | filename: best_val:{step}-{val_loss:.3f}-{train_loss:.3f} 26 | monitor: val_loss 27 | save_top_k: 1 28 | fast_dev_run: false 29 | max_epochs: 100 30 | limit_val_batches: null 31 | val_check_interval: null 32 | check_val_every_n_epoch: 1 33 | log_every_n_steps: 50 34 | accumulate_grad_batches: 1 35 | gradient_clip_val: 1 36 | gradient_clip_algorithm: null 37 | detect_anomaly: false 38 | default_root_dir: logs 39 | model: 40 | ... 41 | data: 42 | ... 
#!/bin/bash
#
# Copyright 2021 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and extracts the ColabFold environmental database archive
# (colabfold_envdb_202108.tar.gz) into the given directory.
#
# Usage: bash download_colabfold_envdb.sh /path/to/download/directory [max_connections]
#   max_connections is passed to aria2c as -x and defaults to 4.
set -e

# The download directory is the only required argument.
if [[ $# -eq 0 ]]; then
    echo "Error: download directory must be provided as an input argument."
    exit 1
fi

# aria2c performs the download; fail early if it is not installed.
if ! command -v aria2c &> /dev/null ; then
    echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
    exit 1
fi

DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}"
SOURCE_URL="http://wwwuser.gwdg.de/~compbiol/colabfold/colabfold_envdb_202108.tar.gz"
BASENAME=$(basename "${SOURCE_URL}")
MAX_CONNECTIONS="${2:-4}"

mkdir -p "${ROOT_DIR}"
# NOTE(review): the URL is plain HTTP and certificate checking is disabled, and
# the archive is extracted without any checksum verification -- confirm this is
# acceptable for the environment the script runs in.
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" -x "${MAX_CONNECTIONS}" --check-certificate=false
# The downloaded archive is extracted in place and is not deleted afterwards.
tar --extract --verbose --file="${ROOT_DIR}/${BASENAME}" \
    --directory="${ROOT_DIR}"
filename: epoch_{epoch}-val_mcc:{val_mcc:.3f} 13 | monitor: val_mcc 14 | mode: max 15 | every_n_epochs: 1 16 | - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping 17 | dict_kwargs: 18 | monitor: val_mcc 19 | mode: max 20 | patience: 10 21 | model: 22 | class_path: modelgenerator.tasks.SequenceClassification 23 | init_args: 24 | backbone: 25 | class_path: modelgenerator.backbones.aido_dna_7b 26 | init_args: 27 | use_peft: true 28 | lora_r: 16 29 | lora_alpha: 32 30 | lora_dropout: 0.1 31 | lora_target_modules: 32 | - query 33 | - value 34 | n_classes: 2 35 | optimizer: 36 | class_path: torch.optim.AdamW 37 | init_args: 38 | lr: 0.0005 39 | weight_decay: 0.1 40 | adapter: modelgenerator.adapters.LinearCLSAdapter 41 | lr_scheduler: 42 | class_path: modelgenerator.lr_schedulers.CosineWithWarmup 43 | init_args: 44 | warmup_ratio: 0.1 45 | data: 46 | class_path: modelgenerator.data.GUEClassification 47 | init_args: 48 | config_name: prom_core_all 49 | train_split_name: train 50 | test_split_name: test 51 | batch_size: 4 52 | -------------------------------------------------------------------------------- /experiments/AIDO.DNA/sequence_classification/nt_enhancers.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | accelerator: auto 3 | devices: auto 4 | max_epochs: 30 5 | gradient_clip_val: 1 6 | default_root_dir: logs 7 | logger: false 8 | callbacks: 9 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint # save ckpt at the end of each epoch, and save the best val_mcc ckpt 10 | init_args: 11 | dirpath: null 12 | filename: epoch_{epoch}-val_mcc:{val_mcc:.3f} 13 | monitor: val_mcc 14 | mode: max 15 | every_n_epochs: 1 16 | - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping 17 | dict_kwargs: 18 | monitor: val_mcc 19 | mode: max 20 | patience: 30 21 | model: 22 | class_path: modelgenerator.tasks.SequenceClassification 23 | init_args: 24 | backbone: 25 | class_path: 
modelgenerator.backbones.aido_dna_7b 26 | init_args: 27 | use_peft: true 28 | lora_r: 16 29 | lora_alpha: 32 30 | lora_dropout: 0.1 31 | lora_target_modules: 32 | - query 33 | - value 34 | adapter: modelgenerator.adapters.LinearCLSAdapter 35 | n_classes: 2 36 | optimizer: 37 | class_path: torch.optim.AdamW 38 | init_args: 39 | lr: 0.001 40 | weight_decay: 0.1 41 | lr_scheduler: 42 | class_path: modelgenerator.lr_schedulers.CosineWithWarmup 43 | init_args: 44 | warmup_ratio: 0.1 45 | data: 46 | class_path: modelgenerator.data.NTClassification 47 | init_args: 48 | config_name: enhancers 49 | train_split_name: train 50 | test_split_name: test 51 | valid_split_size: 0.1 52 | batch_size: 8 53 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/utils/seed.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 GenBio AI 2 | # Copyright 2024 ByteDance and/or its affiliates. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
def seed_everything(seed, deterministic):
    """Seed the Python, NumPy and PyTorch RNGs and optionally force determinism.

    Args:
        seed: Value handed to every seeding function.
        deterministic: When truthy, additionally turn off cuDNN autotuning,
            request deterministic cuDNN / PyTorch algorithms and set
            ``CUBLAS_WORKSPACE_CONFIG`` in the process environment.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.random.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    if not deterministic:
        return

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    # The cuDNN flag only covers CUDA convolution operations.
    cudnn.deterministic = True
    # Covers the remaining normally-nondeterministic operations, see
    # https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html
    torch.use_deterministic_algorithms(True)
    # https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
# End-to-end protein inverse-folding inference for a single structure:
# download one mmCIF file from RCSB, preprocess the selected chain, then run
# `mgen test` with the AIDO.ProteinIF-16B checkpoint.
#
# NOTE(review): MGEN_DATA_DIR is read below but never assigned in this script;
# presumably it must be exported by the caller -- confirm before running.
set -e

DATA_DIR=data ## Set the path of the directory where you want to keep/download the PDB/CIF file.

# Structure to process: RCSB entry ID and the chain passed to preprocess_PDB.py.
PDB_ID=5YH2
CHAIN_ID=A

# ### Download and merge the Protein-IF checkpoint
# mkdir -p ${MGEN_DATA_DIR}/modelgenerator/huggingface_models/protein_inv_fold/AIDO.ProteinIF-16B/
# ## Download chunks
# huggingface-cli download genbio-ai/AIDO.ProteinIF-16B \
# --repo-type model \
# --local-dir ${MGEN_DATA_DIR}/modelgenerator/huggingface_models/protein_inv_fold/AIDO.ProteinIF-16B/
# ## Merge chunks
# python merge_ckpt.py ${MGEN_DATA_DIR}/modelgenerator/huggingface_models/protein_inv_fold/AIDO.ProteinIF-16B/model_chunks ${MGEN_DATA_DIR}/modelgenerator/huggingface_models/protein_inv_fold/AIDO.ProteinIF-16B/model.ckpt

### Download a single structure from somewhere like PDB
mkdir -p ${DATA_DIR}/
wget https://files.rcsb.org/download/${PDB_ID}.cif -P ${DATA_DIR}/

### Put it into our format
python preprocess_PDB.py ${DATA_DIR}/${PDB_ID}.cif ${CHAIN_ID} ${DATA_DIR}/

### Run inference to generate sequence
# export CUDA_VISIBLE_DEVICES=6,
# The checkpoint path below is the model.ckpt written by the (commented-out)
# merge step above, so that step must have been run once beforehand.
mgen test \
    --config protein_inv_fold_test.yaml \
    --trainer.default_root_dir ${MGEN_DATA_DIR}/modelgenerator/logs/protein_inv_fold/ \
    --ckpt_path ${MGEN_DATA_DIR}/modelgenerator/huggingface_models/protein_inv_fold/AIDO.ProteinIF-16B/model.ckpt \
    --trainer.devices 0, \
    --data.path ${DATA_DIR}/

### The results will be saved under the folder "/experiments/AIDO.Protein/protein_inverse_folding/proteinIF_outputs" in a file named "results_acc_{recovery_accuracy}.txt".
34 | -------------------------------------------------------------------------------- /experiments/AIDO.Protein/DMS/configs/indels_LP_DDP.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | accelerator: auto 3 | devices: auto 4 | max_steps: 1000 5 | gradient_clip_val: 0.1 6 | default_root_dir: logs 7 | logger: false 8 | callbacks: 9 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint # save ckpt at the end of each epoch, and save the best val_mcc ckpt 10 | init_args: 11 | filename: step_{step}_valloss_{val_loss} 12 | save_last: true 13 | save_top_k: 1 14 | save_weights_only: false 15 | mode: min 16 | every_n_train_steps: 500 17 | model: 18 | class_path: modelgenerator.tasks.SequenceRegression 19 | init_args: 20 | backbone: 21 | class_path: modelgenerator.backbones.aido_protein_16b 22 | init_args: 23 | frozen: true 24 | max_length: 2048 25 | adapter: 26 | class_path: modelgenerator.adapters.MLPPoolAdapter 27 | init_args: 28 | hidden_sizes: 29 | - 128 30 | dropout: 0.1 31 | dropout_in_middle: false 32 | optimizer: 33 | class_path: torch.optim.AdamW 34 | init_args: 35 | lr: 0.001 36 | weight_decay: 0.01 37 | lr_scheduler: 38 | class_path: modelgenerator.lr_schedulers.ConstantWithWarmup 39 | init_args: 40 | warmup_ratio: 0.05 41 | data: 42 | class_path: modelgenerator.data.DMSFitnessPrediction 43 | init_args: 44 | path: genbio-ai/ProteinGYM-DMS 45 | train_split_files: 46 | - indels/B1LPA6_ECOSM_Russ_2020_indels.tsv 47 | train_split_name: train 48 | random_seed: 42 49 | batch_size: 32 50 | cv_num_folds: 5 51 | cv_test_fold_id: 0 52 | cv_enable_val_fold: false 53 | cv_fold_id_col: fold_id 54 | ckpt_path: null 55 | -------------------------------------------------------------------------------- /experiments/AIDO.RNA/demo_mrna_vaccine/get_mean_embeddings.py: -------------------------------------------------------------------------------- 1 | # Takes a directory with *.pt files as argument 2 | # For each file, 
def compile_mean_embeddings(directory):
    """Average token embeddings per sequence and save them in a single file.

    Every ``*.pt`` file in ``directory`` is loaded and must contain the keys
    ``predictions``, ``sequences``, ``attention_mask`` and
    ``special_tokens_mask``. For each sequence the embeddings are averaged over
    the positions that are attended to (``attention_mask == 1``) and are not
    special tokens. The result is written to ``<directory>/mean_embeddings.pt``
    as ``{'sequences': [...], 'mean_embeddings': Tensor}``.

    Args:
        directory: Path of the folder holding the per-batch ``*.pt`` files.

    Raises:
        RuntimeError: If no embeddings were found in ``directory``.
    """
    output_name = 'mean_embeddings.pt'
    all_sequences = []
    mean_embeddings = []
    # Sort for a reproducible output order, and skip this function's own output
    # so that running it twice on the same directory does not fail.
    input_files = sorted(
        name for name in os.listdir(directory)
        if name.endswith('.pt') and name != output_name
    )
    for file in tqdm(input_files):
        # NOTE: torch.load unpickles data; only point this at trusted files.
        vals = torch.load(os.path.join(directory, file))
        embeddings = torch.as_tensor(vals['predictions']).cpu()
        sequences = vals['sequences']
        attention_masks = torch.as_tensor(vals['attention_mask']).cpu()
        special_tokens_mask = torch.as_tensor(vals['special_tokens_mask']).cpu()
        # `&` binds tighter than `==`, so the comparison must be parenthesised;
        # otherwise padding positions flagged as special tokens are selected.
        # The special-token mask is cast to bool so `~` is a logical NOT.
        keep = (attention_masks == 1) & ~special_tokens_mask.bool()
        for i, embedding in enumerate(embeddings):
            mean_embeddings.append(embedding[keep[i]].mean(dim=0))
            all_sequences.append(sequences[i])
    if not mean_embeddings:
        raise RuntimeError(f'No embeddings found in *.pt files under {directory}')
    mean_embeddings = torch.stack(mean_embeddings)
    torch.save(
        {'sequences': all_sequences, 'mean_embeddings': mean_embeddings},
        os.path.join(directory, output_name),
    )
@contextlib.contextmanager
def tmpdir_manager(base_dir: Optional[str] = None):
    """Yield a freshly created temporary directory and remove it on exit.

    The directory is created under ``base_dir`` (or the system default when it
    is None). Removal runs even if the managed block raises, and errors during
    removal are ignored.
    """
    scratch_dir = tempfile.mkdtemp(dir=base_dir)
    try:
        yield scratch_dir
    finally:
        shutil.rmtree(scratch_dir, ignore_errors=True)


@contextlib.contextmanager
def timing(msg: str):
    """Log how long the managed block took, labelled with ``msg``.

    The "Finished" record is only emitted when the block completes without
    raising.
    """
    logging.info("Started %s", msg)
    started_at = time.perf_counter()
    yield
    elapsed = time.perf_counter() - started_at
    logging.info("Finished %s in %.3f seconds", msg, elapsed)


def to_date(s: str):
    """Build a ``datetime`` from the fixed character positions of ``s``.

    Characters 0-3 are read as the year, 5-6 as the month and 8-9 as the day;
    the separator characters at positions 4 and 7 are not inspected.
    """
    year, month, day = (int(s[lo:hi]) for lo, hi in ((0, 4), (5, 7), (8, 10)))
    return datetime.datetime(year=year, month=month, day=day)
backbone.encoder.encoder.layer.30.* 8 | 6: 9 | - backbone.encoder.encoder.layer.29.* 10 | - backbone.encoder.encoder.layer.28.* 11 | - backbone.encoder.encoder.layer.27.* 12 | 9: 13 | - backbone.encoder.encoder.layer.26.* 14 | - backbone.encoder.encoder.layer.25.* 15 | - backbone.encoder.encoder.layer.24.* 16 | 12: 17 | - backbone.encoder.encoder.layer.23.* 18 | - backbone.encoder.encoder.layer.22.* 19 | - backbone.encoder.encoder.layer.21.* 20 | 15: 21 | - backbone.encoder.encoder.layer.20.* 22 | - backbone.encoder.encoder.layer.19.* 23 | - backbone.encoder.encoder.layer.18.* 24 | 18: 25 | - backbone.encoder.encoder.layer.17.* 26 | - backbone.encoder.encoder.layer.16.* 27 | - backbone.encoder.encoder.layer.15.* 28 | 21: 29 | - backbone.encoder.encoder.layer.14.* 30 | - backbone.encoder.encoder.layer.13.* 31 | - backbone.encoder.encoder.layer.12.* 32 | 24: 33 | - backbone.encoder.encoder.layer.11.* 34 | - backbone.encoder.encoder.layer.10.* 35 | - backbone.encoder.encoder.layer.9.* 36 | 27: 37 | - backbone.encoder.encoder.layer.8.* 38 | - backbone.encoder.encoder.layer.7.* 39 | - backbone.encoder.encoder.layer.6.* 40 | 30: 41 | - backbone.encoder.encoder.layer.5.* 42 | - backbone.encoder.encoder.layer.4.* 43 | - backbone.encoder.encoder.layer.3.* 44 | 33: 45 | - backbone.encoder.encoder.layer.2.* 46 | - backbone.encoder.encoder.layer.1.* 47 | - backbone.encoder.encoder.layer.0.* 48 | 36: 49 | - backbone.encoder.* 50 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/utils/geometry.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 GenBio AI 2 | # Copyright 2024 ByteDance and/or its affiliates. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
def angle_3p(a, b, c):
    """
    Compute the angle between the displacement a->b and the displacement b->c.

    Args:
        a (list or array-like): The coordinates of the first point.
        b (list or array-like): The coordinates of the second point.
        c (list or array-like): The coordinates of the third point.

    Returns:
        float: The angle in degrees, within [0, 180], between the vector from
            a to b and the vector from b to c. A constant 1e-4 is added to the
            product of the two vector norms before dividing, so a zero-length
            vector yields 90 degrees instead of a division by zero.
    """
    point_a, point_b, point_c = (np.array(p) for p in (a, b, c))

    first_leg = point_b - point_a
    second_leg = point_c - point_b

    # Product of the two lengths, padded to keep the division finite.
    scale = np.linalg.norm(first_leg) * np.linalg.norm(second_leg) + 1e-4
    cosine = np.clip(np.dot(first_leg, second_leg) / scale, -1, 1)
    return np.degrees(np.arccos(cosine))
monitor: val_spearman 13 | mode: max 14 | - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping 15 | dict_kwargs: 16 | monitor: val_spearman 17 | mode: max 18 | patience: 10 19 | model: 20 | class_path: modelgenerator.tasks.SequenceRegression 21 | init_args: 22 | backbone: 23 | class_path: modelgenerator.backbones.aido_protein_16b 24 | init_args: 25 | use_peft: true 26 | lora_dropout: 0.05 27 | max_length: 2048 28 | adapter: 29 | class_path: modelgenerator.adapters.MLPPoolAdapter 30 | init_args: 31 | hidden_sizes: 32 | - 128 33 | dropout: 0.1 34 | dropout_in_middle: false 35 | optimizer: 36 | class_path: torch.optim.AdamW 37 | init_args: 38 | lr: 0.0001 39 | weight_decay: 0.01 40 | lr_scheduler: 41 | class_path: modelgenerator.lr_schedulers.CosineWithWarmup 42 | init_args: 43 | warmup_ratio: 0.05 44 | data: 45 | class_path: modelgenerator.data.DMSFitnessPrediction 46 | init_args: 47 | path: genbio-ai/ProteinGYM-DMS 48 | train_split_files: 49 | - indels/B1LPA6_ECOSM_Russ_2020_indels.tsv 50 | train_split_name: train 51 | random_seed: 42 52 | batch_size: 32 53 | cv_num_folds: 5 54 | cv_test_fold_id: 0 55 | cv_enable_val_fold: true 56 | cv_fold_id_col: fold_id 57 | -------------------------------------------------------------------------------- /experiments/AIDO.Protein/DMS/configs/substitution_LoRA_DDP.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | accelerator: auto 3 | devices: auto 4 | logger: false 5 | callbacks: 6 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint # save ckpt at the end of each epoch, and save the best val_mcc ckpt 7 | init_args: 8 | filename: epoch_{epoch}-val_mcc:{val_spearman:.3f} 9 | monitor: val_spearman 10 | mode: max 11 | - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping 12 | dict_kwargs: 13 | monitor: val_spearman 14 | mode: max 15 | patience: 10 16 | max_steps: 10000 17 | gradient_clip_val: 0.1 18 | default_root_dir: logs 19 | model: 20 
| class_path: modelgenerator.tasks.SequenceRegression 21 | init_args: 22 | backbone: 23 | class_path: modelgenerator.backbones.aido_protein_16b_v1 24 | init_args: 25 | use_peft: true 26 | max_length: 2048 27 | adapter: 28 | class_path: modelgenerator.adapters.MLPPoolAdapter 29 | init_args: 30 | hidden_sizes: 31 | - 128 32 | dropout: 0.1 33 | dropout_in_middle: false 34 | optimizer: 35 | class_path: torch.optim.AdamW 36 | init_args: 37 | lr: 0.0001 38 | weight_decay: 0.01 39 | lr_scheduler: 40 | class_path: modelgenerator.lr_schedulers.CosineWithWarmup 41 | init_args: 42 | warmup_ratio: 0.05 43 | data: 44 | class_path: modelgenerator.data.DMSFitnessPrediction 45 | init_args: 46 | path: genbio-ai/ProteinGYM-DMS 47 | train_split_files: 48 | - singles_substitutions/VRPI_BPT7_Tsuboyama_2023_2WNM.tsv 49 | train_split_name: 'train' 50 | random_seed: 42 51 | batch_size: 32 52 | cv_num_folds: 5 53 | cv_test_fold_id: 0 54 | cv_enable_val_fold: true 55 | cv_fold_id_col: fold_id 56 | ckpt_path: null 57 | -------------------------------------------------------------------------------- /experiments/AIDO.DNA/sequence_classification/gue_splice_reconstruction.yaml: -------------------------------------------------------------------------------- 1 | trainer: 2 | accelerator: auto 3 | devices: auto 4 | max_epochs: 20 5 | gradient_clip_val: 1 6 | default_root_dir: logs 7 | logger: false 8 | callbacks: 9 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint # save ckpt at the end of each epoch, and save the best val_mcc ckpt 10 | init_args: 11 | dirpath: null 12 | filename: epoch_{epoch}-val_mcc:{val_mcc:.3f} 13 | monitor: val_mcc 14 | mode: max 15 | every_n_epochs: 1 16 | - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping 17 | dict_kwargs: 18 | monitor: val_mcc 19 | mode: max 20 | patience: 10 21 | model: 22 | class_path: modelgenerator.tasks.SequenceClassification 23 | init_args: 24 | backbone: 25 | class_path: modelgenerator.backbones.aido_dna_7b 26 | 
init_args: 27 | use_peft: true 28 | lora_r: 16 29 | lora_alpha: 32 30 | lora_dropout: 0.1 31 | lora_target_modules: 32 | - query 33 | - value 34 | adapter: 35 | class_path: modelgenerator.adapters.MLPPoolAdapter 36 | init_args: 37 | pooling: mean_pooling 38 | hidden_sizes: 39 | - 128 40 | bias: true 41 | dropout: 0.1 42 | n_classes: 3 43 | optimizer: 44 | class_path: torch.optim.AdamW 45 | init_args: 46 | lr: 0.0005 47 | weight_decay: 0.1 48 | lr_scheduler: 49 | class_path: modelgenerator.lr_schedulers.CosineWithWarmup 50 | init_args: 51 | warmup_ratio: 0.1 52 | data: 53 | class_path: modelgenerator.data.GUEClassification 54 | init_args: 55 | config_name: splice_reconstructed 56 | train_split_name: train 57 | test_split_name: test 58 | valid_split_size: 0.1 59 | batch_size: 4 60 | -------------------------------------------------------------------------------- /modelgenerator/huggingface_models/scfoundation/pretrainmodels/transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 BioMap (Beijing) Intelligence Technology Limited 2 | 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | class pytorchTransformerModule(nn.Module): 9 | def __init__(self, 10 | max_seq_len, 11 | dim, 12 | depth, 13 | heads, 14 | ff_mult=4, 15 | norm_first=False, 16 | ): 17 | super(pytorchTransformerModule, self).__init__() 18 | 19 | self.max_seq_len = max_seq_len 20 | self.depth = depth 21 | layers = [] 22 | for i in range(depth): 23 | layers.append(nn.TransformerEncoderLayer(d_model=dim, nhead=heads, 24 | dim_feedforward=dim * ff_mult, 25 | batch_first=True, 26 | norm_first=norm_first, 27 | #activation="gelu", 28 | )) 29 | 30 | self.transformer_encoder = nn.ModuleList(layers) 31 | self.norm = nn.LayerNorm(dim) 32 | 33 | def forward(self, x, padding_mask): 34 | b, n, _, device = *x.shape, x.device 35 | assert n <= self.max_seq_len, f'sequence length {n} must be less than the max sequence length 
{self.max_seq_len}' 36 | 37 | # x get encodings [B, N, D] , batch_first is True 38 | for mod in self.transformer_encoder: 39 | x = mod(x, src_key_padding_mask=padding_mask) # , src_mask=mask, src_key_padding_mask=src_key_padding_mask) 40 | # x = self.transformer_encoder(x) 41 | x = self.norm(x) 42 | 43 | return x 44 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/openfold_local/README.md: -------------------------------------------------------------------------------- 1 | ## openfold_local 2 | 3 | This is a copy of [openfold](https://github.com/aqlaboratory/openfold), commit id: [bb3f51](https://github.com/aqlaboratory/openfold/commit/bb3f51e5a2cf2d5e3b709fe8f7d7a083c870222e) 4 | 5 | OpenFold is a great work. Protenix tries to reuse it when building models. However, a few modifications have been made for the Protenix project. 6 | We reuse Protenix's code. 7 | 8 | * In [protenix/openfold_local/model/primitives.py](model/primitives.py), we add a custom [`Layernorm`](../model/layer_norm/) implementation; it accelerates Protenix by about 30%-50% during different training stages. 9 | 10 | If you use our work, please also cite OpenFold: 11 | 12 | ```bibtex 13 | @article {Ahdritz2022.11.20.517210, 14 | author = {Ahdritz, Gustaf and Bouatta, Nazim and Floristean, Christina and Kadyan, Sachin and Xia, Qinghui and Gerecke, William and O{\textquoteright}Donnell, Timothy J and Berenberg, Daniel and Fisk, Ian and Zanichelli, Niccolò and Zhang, Bo and Nowaczynski, Arkadiusz and Wang, Bei and Stepniewska-Dziubinska, Marta M and Zhang, Shang and Ojewole, Adegoke and Guney, Murat Efe and Biderman, Stella and Watkins, Andrew M and Ra, Stephen and Lorenzo, Pablo Ribalta and Nivon, Lucas and Weitzner, Brian and Ban, Yih-En Andrew and Sorger, Peter K and Mostaque, Emad and Zhang, Zhao and Bonneau, Richard and AlQuraishi, Mohammed}, 15 | title = {{O}pen{F}old: {R}etraining {A}lpha{F}old2 yields new insights into its
learning mechanisms and capacity for generalization}, 16 | elocation-id = {2022.11.20.517210}, 17 | year = {2022}, 18 | doi = {10.1101/2022.11.20.517210}, 19 | publisher = {Cold Spring Harbor Laboratory}, 20 | URL = {https://www.biorxiv.org/content/10.1101/2022.11.20.517210}, 21 | eprint = {https://www.biorxiv.org/content/early/2022/11/22/2022.11.20.517210.full.pdf}, 22 | journal = {bioRxiv} 23 | } 24 | ``` 25 | -------------------------------------------------------------------------------- /configs/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Default fit config 2 | # If no other --config is specified with mgen fit, this is used 3 | # If another --config is specified, this is ignored 4 | # To continue using this with another --config, specify this as another config 5 | # e.g. mgen fit --config defaults.yaml --config my_custom_config.yaml 6 | # LightningCLI will override the defaults with the custom config while keeping the remaining defaults 7 | trainer: 8 | accelerator: auto 9 | devices: auto 10 | strategy: auto 11 | max_steps: -1 12 | max_epochs: -1 13 | precision: 32-true 14 | log_every_n_steps: 50 15 | default_root_dir: logs 16 | profiler: 17 | class_path: lightning.pytorch.profilers.PyTorchProfiler 18 | dict_kwargs: 19 | profile_memory: true 20 | callbacks: 21 | - class_path: lightning.pytorch.callbacks.LearningRateMonitor 22 | dict_kwargs: 23 | logging_interval: "step" 24 | # Save a checkpoint for min val loss 25 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint 26 | dict_kwargs: 27 | monitor: val_loss 28 | mode: min 29 | save_top_k: 1 30 | filename: "best_val:{step}-{val_loss:.3f}-{train_loss:.3f}" 31 | # Save a checkpoint for min train loss 32 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint 33 | dict_kwargs: 34 | monitor: train_loss 35 | mode: min 36 | save_top_k: 1 37 | filename: "best_train:{step}-{val_loss:.3f}-train:{train_loss:.3f}" 38 | # Save the latest step 39 | - 
class_path: lightning.pytorch.callbacks.ModelCheckpoint 40 | dict_kwargs: 41 | filename: "last:{step}-{val_loss:.3f}-{train_loss:.3f}" 42 | # Save a checkpoint every 1000 steps 43 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint 44 | dict_kwargs: 45 | every_n_train_steps: 1000 46 | filename: "step:{step}-{val_loss:.3f}-{train_loss:.3f}" 47 | save_top_k: -1 48 | -------------------------------------------------------------------------------- /experiments/AIDO.StructureTokenizer/protein2structoken_16b.yaml: -------------------------------------------------------------------------------- 1 | # Jiayou Zhang 2 | # Usage: mgen predict --config experiments/AIDO.StructureTokenizer/protein2structoken_16b.yaml 3 | # The input amino acid sequences are specified in the test_split_files. 4 | # The model will predict the structure tokens of the input sequences. 5 | # The predictions will be saved in `callbacks.init_args.output_dir` using tsv format. 6 | seed_everything: 42 7 | trainer: 8 | accelerator: auto 9 | strategy: 10 | class_path: lightning.pytorch.strategies.DDPStrategy 11 | devices: auto 12 | num_nodes: 1 13 | precision: 32 14 | logger: 15 | class_path: lightning.pytorch.loggers.WandbLogger 16 | init_args: 17 | name: protein2structoken_16b 18 | save_dir: logs 19 | project: MGEN_AIDO.StructureTokenizer 20 | callbacks: 21 | - class_path: modelgenerator.callbacks.PredictionWriter 22 | init_args: 23 | output_dir: logs/protein2structoken_16b 24 | filetype: tsv 25 | write_cols: [uid, sequences, labels, predictions] 26 | drop_special_tokens: true 27 | argmax_predictions: true 28 | remove_duplicates: true 29 | model: 30 | class_path: modelgenerator.tasks.Inference 31 | init_args: 32 | backbone: 33 | class_path: modelgenerator.backbones.aido_protein2structoken_16b 34 | init_args: 35 | from_scratch: false 36 | max_length: 2048 # 512 is too short for some proteins. The first stage training of the model is done with 2048. The second stage is 1024. 
37 | config_overwrites: 38 | hidden_dropout_prob: 0 39 | attention_probs_dropout_prob: 0 40 | use_legacy_adapter: true 41 | strict_loading: true 42 | data: 43 | class_path: modelgenerator.data.StructureTokenDataModule 44 | init_args: 45 | path: genbio-ai/casp14-casp15-cameo-test-proteins 46 | test_split_files: [casp14_csv/test.csv, casp15_csv/test.csv, cameo_csv/test.csv] 47 | batch_size: 1 48 | ckpt_path: null 49 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/src/genbio/aidosp/scripts/download_uniref30.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2021 AlQuraishi Laboratory 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # Downloads and unzips the UniRef30 database for AlphaFold. 18 | # 19 | # Usage: bash download_uniref30.sh /path/to/download/directory 20 | # Note that the downloaded uniref30 cannot be used by mmseqs2 directly. After downloading uniref30 from 21 | # http://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2103.tar.gz and unpacking it, use the following 22 | # commands to convert the data format. 23 | # mmseqs tsv2exprofiledb uniref30_2103 uniref30_2103_db 24 | # mmseqs createindex uniref30_2103_db tmp 25 | set -e 26 | 27 | if [[ $# -eq 0 ]]; then 28 | echo "Error: download directory must be provided as an input argument." 29 | exit 1 30 | fi 31 | 32 | if !
command -v aria2c &> /dev/null ; then 33 | echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)." 34 | exit 1 35 | fi 36 | 37 | DOWNLOAD_DIR="$1" 38 | ROOT_DIR="${DOWNLOAD_DIR}/uniref30" 39 | SOURCE_URL="http://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2103.tar.gz" 40 | BASENAME=$(basename "${SOURCE_URL}") 41 | MAX_CONNECTIONS="${2:-4}" 42 | 43 | mkdir -p "${ROOT_DIR}" 44 | aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" -x "${MAX_CONNECTIONS}" --check-certificate=false 45 | tar --extract --verbose --file="${ROOT_DIR}/${BASENAME}" \ 46 | --directory="${ROOT_DIR}" 47 | rm "${ROOT_DIR}/${BASENAME}" 48 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/src/genbio/aidosp/cli/predict.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import click 4 | from loguru import logger 5 | 6 | 7 | @click.command() 8 | @click.option( 9 | "-i", 10 | "--input-json", 11 | type=click.Path(exists=True, dir_okay=False), 12 | required=True, 13 | help="Input JSON file containing protein sequences", 14 | ) 15 | @click.option( 16 | "-o", 17 | "--output-dir", 18 | type=click.Path(file_okay=False, dir_okay=True), 19 | required=True, 20 | help="Directory to save the prediction results", 21 | ) 22 | @click.option( 23 | "-m", 24 | "--model-path", 25 | type=click.Path(exists=True, dir_okay=False), 26 | required=True, 27 | help="Path to the model checkpoint", 28 | ) 29 | @click.option( 30 | "--ccd-components", 31 | type=click.Path(exists=True, dir_okay=False), 32 | required=True, 33 | help="Path to the CCD components file", 34 | ) 35 | @click.option( 36 | "--ccd-components-rdkit", 37 | type=click.Path(exists=True, dir_okay=False), 38 | required=True, 39 | help="Path to the CCD components RDKit molecules file", 40 | ) 41 | @click.option( 42 | "--seed", 43 | type=int, 44 | default=1234, 45 | help="Random seed for 
reproducibility", 46 | ) 47 | @click.option( 48 | "--device-ids", 49 | type=str, 50 | default="0", 51 | help="Comma-separated list of GPU device IDs to use", 52 | ) 53 | @click.option( 54 | "--master-port", 55 | type=int, 56 | default=8803, 57 | help="Master port for distributed training", 58 | ) 59 | def predict( 60 | input_json: Path, 61 | output_dir: Path, 62 | model_path: Path, 63 | ccd_components: Path, 64 | ccd_components_rdkit: Path, 65 | seed: int, 66 | device_ids: str, 67 | master_port: int, 68 | ) -> None: 69 | """Run protein structure prediction""" 70 | logger.info(f"Running prediction with input {input_json}") 71 | logger.info(f"Output will be saved to {output_dir}") 72 | # TODO: Implement 73 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/configs/mmseqs.yaml: -------------------------------------------------------------------------------- 1 | setting: 2 | description: msa retrieve 3 | tools: 4 | mmseqs2: 5 | enable: false 6 | binary_path: mmseqs # after add binary path into $PATH. 7 | dbs: uniref30,envdb 8 | uniref30: 9 | # -s: controls how many similar k-mers should be produced during the seeding stage. This is the most important parameter for speed, a lower value is fast but less sensitive and a higher one is sensitive but slower. 
The default search is already sensitive 10 | # --db-load-mode 2: MMseqs2 can be forced to use the main memory database by using the parameter 11 | search: "--num-iterations 3 --db-load-mode 2 -s 8 -e 0.1 --max-seqs 10000 -a" 12 | expandaln: "--db-load-mode 2 --expansion-mode 0 -e inf --expand-filter-clusters 1 --max-seq-id 0.95" 13 | align: "--db-load-mode 2 -e 10 --max-accept 100000 --alt-ali 10 -a" 14 | filter: "--db-load-mode 2 --qid 0 --qsc 0.8 --diff 0 --max-seq-id 1.0 --filter-min-enable 100" 15 | result2msa: "--msa-format-mode 6 --db-load-mode 2 --filter-msa 1 --filter-min-enable 1000 --diff 3000 --qid 0.0,0.2,0.4,0.6,0.8,1.0 --qsc 0 --max-seq-id 0.95" 16 | envdb: 17 | search: "--num-iterations 3 --db-load-mode 2 -s 8 -e 0.1 --max-seqs 10000 -a" 18 | expandaln: "--db-load-mode 2 --expansion-mode 0 -e inf" 19 | align: "--db-load-mode 2 -e 10 --max-accept 100000 --alt-ali 10 -a" 20 | filter: "--db-load-mode 2 --qid 0 --qsc 0.8 --diff 0 --max-seq-id 1.0 --filter-min-enable 100" 21 | result2msa: "--msa-format-mode 6 --db-load-mode 2 --filter-msa 1 --filter-min-enable 1000 --diff 3000 --qid 0.0,0.2,0.4,0.6,0.8,1.0 --qsc 0 --max-seq-id 0.95" 22 | mmseqs2_api: 23 | enable: true 24 | use_env: true 25 | use_filter: true 26 | use_pairing: false 27 | pairing_strategy: greedy # greedy, complete 28 | data: 29 | uniref30: 30 | database_path: /localssd/data/uniref30_mmseqs/uniref30_2103_db 31 | envdb: 32 | database_path: /localssd/data/colabfold_envdb_202108/colabfold_envdb_202108_db 33 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to AIDO.ModelGenerator 2 | Thank you for considering to contribute to AIDO.ModelGenerator! 3 | 4 | ## Merge Requests 5 | We welcome your merge requests (MRs). 6 | For minor fixes (e.g., documentation improvements), feel free to submit a MR directly. 
7 | If you would like to implement a new feature or fix a bug, please make sure you (or someone else) have opened an appropriate [issue](https://github.com/genbio-ai/ModelGenerator/issues) first; in your MR, please mention the issue it addresses. 8 | 9 | ### Creating a Merge Request 10 | 1. [Fork](https://github.com/genbio-ai/ModelGenerator/forks) this repository. 11 | 2. Install locally with `pip install -e .[dev]`. 12 | 3. Make your code changes locally. 13 | 4. **Set up commit hooks:** 14 | Initialize [pre-commit](https://pre-commit.com/) hooks: 15 | ```bash 16 | pre-commit install 17 | ``` 18 | This will automatically check formatting (Ruff with max line length 100), trailing whitespace, end-of-file, YAML syntax, and large files before each commit. 19 | 5. Run `pytest tests/` to test your code. 20 | 6. If dependencies changed, rebuild the lock file with `poetry lock`. 21 | 7. Check that your code is properly documented by going into the `docs` directory and running `mkdocs serve` to build the documentation and view it in your browser. 22 | 8. Issue an MR to merge your changes into the `main` branch. 23 | 24 | 25 | ## Issues 26 | We use GitHub issues to track bugs and feature requests. 27 | Before submitting an issue, please make sure: 28 | 29 | 1. You have read the README and documentation and your question is NOT addressed there. 30 | 2. You have done your best to ensure that your issue is NOT a duplicate of one of [the previous issues](https://github.com/genbio-ai/ModelGenerator/issues). 31 | 3. Your issue is either a bug (unexpected/undesirable behavior) or a feature request. 32 | 33 | ## License 34 | By contributing to AIDO.ModelGenerator, you agree that your contributions will be licensed 35 | under the LICENSE file in the root directory of the source tree.
36 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai4s-cn-beijing.cr.volces.com/pytorch-mirror/pytorch:2.3.1-cuda12.1-cudnn8-devel 2 | 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | ENV TZ=Asia/Shanghai 5 | RUN apt-get update && \ 6 | apt-get install -y --no-install-recommends \ 7 | wget \ 8 | g++ \ 9 | gcc \ 10 | libc6-dev \ 11 | libaio-dev \ 12 | make zlib1g zlib1g-dev \ 13 | git git-lfs expect zsh vim wget curl unzip zip cmake cmake-curses-gui libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev \ 14 | net-tools \ 15 | && apt-get clean \ 16 | && rm -rf /var/lib/apt/lists/* 17 | 18 | RUN apt update && apt -y install postgresql 19 | 20 | RUN DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ 21 | hmmer cmake cmake-curses-gui \ 22 | && git clone --branch v3.3.0 https://github.com/soedinglab/hh-suite.git /tmp/hh-suite \ 23 | && mkdir /tmp/hh-suite/build \ 24 | && cd /tmp/hh-suite/build \ 25 | && cmake -DCMAKE_INSTALL_PREFIX=/opt/hhsuite .. 
\ 26 | && make -j 32 && make install \ 27 | && ln -s /opt/hhsuite/bin/* /usr/bin \ 28 | && cd - \ 29 | && rm -rf /tmp/hh-suite 30 | 31 | RUN apt-get install -yq --no-install-recommends iproute2 curl 32 | # Add PIP Package 33 | RUN pip3 --no-cache-dir install \ 34 | scipy \ 35 | ml_collections \ 36 | tqdm \ 37 | pandas \ 38 | dm-tree \ 39 | rdkit 40 | 41 | # Add openfold dependency 42 | RUN pip3 --no-cache-dir install \ 43 | biopython==1.83 \ 44 | modelcif==0.7 45 | 46 | # Add datapipeline dependency 47 | RUN pip3 --no-cache-dir install \ 48 | biotite==1.0.1 \ 49 | scikit-learn \ 50 | scikit-learn-extra \ 51 | deepspeed==0.14.4 \ 52 | protobuf==3.20.2 tos icecream ipdb wandb numpy==1.26.3 matplotlib==3.9.2 ipywidgets py3Dmol 53 | 54 | # For H20 compatibility 55 | RUN pip3 install --no-cache-dir nvidia-cublas-cu12==12.4.5.8 --no-deps 56 | RUN git clone -b v3.5.1 https://github.com/NVIDIA/cutlass.git /opt/cutlass 57 | ENV CUTLASS_PATH=/opt/cutlass 58 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/src/genbio/aidosp/msa_retrieve/configs/mmseqs.yaml: -------------------------------------------------------------------------------- 1 | setting: 2 | description: msa retrieve 3 | tools: 4 | mmseqs2: 5 | enable: false 6 | binary_path: mmseqs # after add binary path into $PATH. 7 | dbs: uniref30,envdb 8 | uniref30: 9 | # -s: controls how many similar k-mers should be produced during the seeding stage. This is the most important parameter for speed, a lower value is fast but less sensitive and a higher one is sensitive but slower. 
The default search is already sensitive 10 | # --db-load-mode 2: MMseqs2 can be forced to use the main memory database by using the parameter 11 | search: "--num-iterations 3 --db-load-mode 2 -s 8 -e 0.1 --max-seqs 10000 -a" 12 | expandaln: "--db-load-mode 2 --expansion-mode 0 -e inf --expand-filter-clusters 1 --max-seq-id 0.95" 13 | align: "--db-load-mode 2 -e 10 --max-accept 100000 --alt-ali 10 -a" 14 | filter: "--db-load-mode 2 --qid 0 --qsc 0.8 --diff 0 --max-seq-id 1.0 --filter-min-enable 100" 15 | result2msa: "--msa-format-mode 6 --db-load-mode 2 --filter-msa 1 --filter-min-enable 1000 --diff 3000 --qid 0.0,0.2,0.4,0.6,0.8,1.0 --qsc 0 --max-seq-id 0.95" 16 | envdb: 17 | search: "--num-iterations 3 --db-load-mode 2 -s 8 -e 0.1 --max-seqs 10000 -a" 18 | expandaln: "--db-load-mode 2 --expansion-mode 0 -e inf" 19 | align: "--db-load-mode 2 -e 10 --max-accept 100000 --alt-ali 10 -a" 20 | filter: "--db-load-mode 2 --qid 0 --qsc 0.8 --diff 0 --max-seq-id 1.0 --filter-min-enable 100" 21 | result2msa: "--msa-format-mode 6 --db-load-mode 2 --filter-msa 1 --filter-min-enable 1000 --diff 3000 --qid 0.0,0.2,0.4,0.6,0.8,1.0 --qsc 0 --max-seq-id 0.95" 22 | mmseqs2_api: 23 | enable: true 24 | use_env: true 25 | use_filter: true 26 | use_pairing: false 27 | pairing_strategy: greedy # greedy, complete 28 | data: 29 | uniref30: 30 | database_path: /localssd/data/uniref30_mmseqs/uniref30_2103_db 31 | envdb: 32 | database_path: /localssd/data/colabfold_envdb_202108/colabfold_envdb_202108_db 33 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/scripts/run_inference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | 5 | [[ "${DEBUG:-}" == "1" ]] && set -x 6 | 7 | : "${LAYERNORM_TYPE:=fast_layernorm}" 8 | : "${USE_DEEPSPEED_EVO_ATTTENTION:=true}" 9 | : "${device_ids:=0,1,2,3}" 10 | : "${master_port:=8803}" 11 | : 
"${seed:=1234}" 12 | 13 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 14 | PROJECT_DIR="${SCRIPT_DIR}/.." 15 | CHECKPOINT_PATH="/nfs/model" 16 | CCD_COMPONENTS="/nfs/ccd" 17 | CCD_COMPONENTS_RDKIT="/nfs/ccd" 18 | 19 | echo ${PROJECT_DIR} 20 | 21 | : "${CHECKPOINT_PATH:?Environment variable CHECKPOINT_PATH is required}" 22 | : "${CCD_COMPONENTS:?Environment variable CCD_COMPONENTS is required}" 23 | : "${CCD_COMPONENTS_RDKIT:?Environment variable CCD_COMPONENTS_RDKIT is required}" 24 | 25 | yaml_file_path="${PROJECT_DIR}/configs/inference_v0.1.yaml" 26 | checkpoint_path="${CHECKPOINT_PATH}/fold49-v0.1.2.pt" 27 | ccd_components_file="${CCD_COMPONENTS}/components.v20240608.cif" 28 | ccd_components_rdkit_mol_file="${CCD_COMPONENTS_RDKIT}/components.v20240608.cif.rdkit_mol.pkl" 29 | input_json_path="${PROJECT_DIR}/examples/example_built.json" 30 | output_dir="./outputs/example-${seed}" 31 | 32 | for f in "${yaml_file_path}" "${checkpoint_path}" "${ccd_components_file}" "${ccd_components_rdkit_mol_file}" "${input_json_path}"; do 33 | [[ -f "$f" ]] || { echo "Missing required file: $f" >&2; exit 1; } 34 | done 35 | 36 | mkdir -p "${output_dir}" 37 | 38 | export LAYERNORM_TYPE 39 | export USE_DEEPSPEED_EVO_ATTTENTION 40 | 41 | CUDA_VISIBLE_DEVICES="${device_ids}" \ 42 | OMP_NUM_THREADS=1 \ 43 | torchrun --nnodes=1 --nproc_per_node=4 --master_port="${master_port}" \ 44 | "${PROJECT_DIR}/runner/inference.py" \ 45 | --yaml_file_path="${yaml_file_path}" \ 46 | --checkpoint_path="${checkpoint_path}" \ 47 | --ccd_components_file="${ccd_components_file}" \ 48 | --ccd_components_rdkit_mol_file="${ccd_components_rdkit_mol_file}" \ 49 | --seeds="${seed}" \ 50 | --dump_dir="${output_dir}" \ 51 | --input_json_path="${input_json_path}" 52 | -------------------------------------------------------------------------------- /experiments/AIDO.Protein/DMS/configs/substitution_LoRA_FSDP.yaml: -------------------------------------------------------------------------------- 1 | 
trainer: 2 | accelerator: auto 3 | devices: auto 4 | max_steps: 10000 5 | gradient_clip_val: 0.1 6 | gradient_clip_algorithm: value 7 | default_root_dir: logs 8 | strategy: 9 | class_path: lightning.pytorch.strategies.FSDPStrategy 10 | init_args: 11 | auto_wrap_policy: modelgenerator.distributed.fsdp.wrap.AutoWrapPolicy 12 | sharding_strategy: HYBRID_SHARD 13 | logger: false 14 | callbacks: 15 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint # save ckpt at the end of each epoch, and save the best val_mcc ckpt 16 | init_args: 17 | dirpath: null 18 | filename: epoch_{epoch}-val_mcc:{val_spearman:.3f} 19 | monitor: val_spearman 20 | mode: max 21 | - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping 22 | dict_kwargs: 23 | monitor: val_spearman 24 | mode: max 25 | patience: 10 26 | model: 27 | class_path: modelgenerator.tasks.SequenceRegression 28 | init_args: 29 | backbone: 30 | class_path: modelgenerator.backbones.aido_protein_16b_v1 31 | init_args: 32 | use_peft: true 33 | max_length: 2048 34 | adapter: 35 | class_path: modelgenerator.adapters.MLPPoolAdapter 36 | init_args: 37 | hidden_sizes: 38 | - 128 39 | dropout: 0.1 40 | dropout_in_middle: false 41 | optimizer: 42 | class_path: torch.optim.AdamW 43 | init_args: 44 | lr: 0.0001 45 | weight_decay: 0.01 46 | lr_scheduler: 47 | class_path: modelgenerator.lr_schedulers.CosineWithWarmup 48 | init_args: 49 | warmup_ratio: 0.05 50 | data: 51 | class_path: modelgenerator.data.DMSFitnessPrediction 52 | init_args: 53 | path: genbio-ai/ProteinGYM-DMS 54 | train_split_files: 55 | - singles_substitutions/VRPI_BPT7_Tsuboyama_2023_2WNM.tsv 56 | train_split_name: train 57 | random_seed: 42 58 | batch_size: 32 59 | cv_num_folds: 5 60 | cv_test_fold_id: 0 61 | cv_enable_val_fold: true 62 | cv_fold_id_col: fold_id 63 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/model/layer_norm/torch_ext_compile.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2025 GenBio AI 2 | # Copyright 2024 ByteDance and/or its affiliates. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import os 18 | 19 | from torch.utils.cpp_extension import load 20 | 21 | 22 | def compile(name, sources, extra_include_paths, build_directory): 23 | os.environ["TORCH_CUDA_ARCH_LIST"] = "7.0;8.0" 24 | return load( 25 | name=name, 26 | sources=sources, 27 | extra_include_paths=extra_include_paths, 28 | extra_cflags=[ 29 | "-O3", 30 | "-DVERSION_GE_1_1", 31 | "-DVERSION_GE_1_3", 32 | "-DVERSION_GE_1_5", 33 | ], 34 | extra_cuda_cflags=[ 35 | "-O3", 36 | "--use_fast_math", 37 | "-DVERSION_GE_1_1", 38 | "-DVERSION_GE_1_3", 39 | "-DVERSION_GE_1_5", 40 | "-std=c++17", 41 | "-maxrregcount=50", 42 | "-U__CUDA_NO_HALF_OPERATORS__", 43 | "-U__CUDA_NO_HALF_CONVERSIONS__", 44 | "--expt-relaxed-constexpr", 45 | "--expt-extended-lambda", 46 | "-gencode", 47 | "arch=compute_70,code=sm_70", 48 | "-gencode", 49 | "arch=compute_80,code=sm_80", 50 | "-gencode", 51 | "arch=compute_86,code=sm_86", 52 | "-gencode", 53 | "arch=compute_90,code=sm_90", 54 | ], 55 | verbose=True, 56 | build_directory=build_directory, 57 | ) 58 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/model/modules/head.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2025 GenBio AI 2 | # Copyright 2024 ByteDance and/or its affiliates. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import torch 18 | import torch.nn as nn 19 | from torch.nn import Linear 20 | 21 | 22 | # Adapted From openfold.model.heads 23 | class DistogramHead(nn.Module): 24 | """Implements Algorithm 1 [Line17] in AF3 25 | Computes a distogram probability distribution. 26 | For use in computation of distogram loss, subsection 1.9.8 (AF2) 27 | """ 28 | 29 | def __init__(self, c_z: int = 128, no_bins: int = 64) -> None: 30 | """ 31 | Args: 32 | c_z (int, optional): hidden dim [for pair embedding]. Defaults to 128. 33 | no_bins (int, optional): Number of distogram bins. Defaults to 64. 
34 | """ 35 | super(DistogramHead, self).__init__() 36 | 37 | self.c_z = c_z 38 | self.no_bins = no_bins 39 | 40 | self.linear = Linear(in_features=self.c_z, out_features=self.no_bins) 41 | 42 | def forward(self, z: torch.Tensor) -> torch.Tensor: # [*, N, N, C_z] 43 | """ 44 | Args: 45 | z (torch.Tensor): pair embedding 46 | [*, N_token, N_token, C_z] 47 | 48 | Returns: 49 | torch.Tensor: distogram probability distribution 50 | [*, N_token, N_token, no_bins] 51 | """ 52 | # [*, N, N, no_bins] 53 | logits = self.linear(z) 54 | logits = logits + logits.transpose(-2, -3) 55 | return logits 56 | -------------------------------------------------------------------------------- /modelgenerator/huggingface_models/genbio/modeling_genbio.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from lightning.pytorch import LightningModule 4 | from lightning.pytorch.core.saving import _load_state 5 | from transformers import PreTrainedModel, PretrainedConfig 6 | 7 | 8 | class GenBioConfig(PretrainedConfig): 9 | model_type = "genbio" 10 | 11 | def __init__(self, hparams=None, **kwargs): 12 | self.hparams = hparams 13 | super().__init__(**kwargs) 14 | 15 | 16 | class GenBioModel(PreTrainedModel): 17 | config_class = GenBioConfig 18 | 19 | def __init__(self, config: GenBioConfig, genbio_model=None, **kwargs): 20 | super().__init__(config, **kwargs) 21 | # if genbio_model is provided, we don't need to initialize it 22 | if genbio_model is not None: 23 | self.genbio_model = genbio_model 24 | return 25 | # otherwise, initialize the model from hyperparameters 26 | cls_path = config.hparams["_class_path"] 27 | module_path, name = cls_path.rsplit(".", 1) 28 | genbio_cls = getattr(__import__(module_path, fromlist=[name]), name) 29 | checkpoint = { 30 | LightningModule.CHECKPOINT_HYPER_PARAMS_KEY: config.hparams, 31 | "state_dict": {}, 32 | } 33 | # TODO: _load_state is a private function and it throws a warning for an 34 | # empty 
state_dict. We need a fucntion to intialize our model; this 35 | # is the only choice we have for now. 36 | with warnings.catch_warnings(): 37 | warnings.filterwarnings("ignore", "Found keys that are*") 38 | self.genbio_model = _load_state( 39 | genbio_cls, checkpoint, strict_loading=False 40 | ) 41 | 42 | @classmethod 43 | def from_genbio_model(cls, model: LightningModule): 44 | return cls(GenBioConfig(hparams=model.hparams), genbio_model=model) 45 | 46 | def forward(self, *args, **kwargs): 47 | return self.genbio_model(*args, **kwargs) 48 | 49 | 50 | GenBioModel.register_for_auto_class("AutoModel") 51 | GenBioConfig.register_for_auto_class("AutoConfig") 52 | -------------------------------------------------------------------------------- /modelgenerator/structure_tokenizer/utils/init_params.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | from typing import Union 4 | 5 | 6 | def init_linear_xavier_(linear: Union[nn.Linear, nn.Embedding, None]): 7 | if linear is None: 8 | return linear 9 | nn.init.xavier_uniform_(linear.weight, gain=1) 10 | if hasattr(linear, "bias") and linear.bias is not None: 11 | nn.init.zeros_(linear.bias) 12 | return linear 13 | 14 | 15 | def init_linear_zero_(linear: Union[nn.Linear, nn.Embedding, None], eps: float = 1e-6): 16 | if linear is None: 17 | return linear 18 | nn.init.normal_(linear.weight, 0, eps) 19 | if hasattr(linear, "bias") and linear.bias is not None: 20 | nn.init.zeros_(linear.bias) 21 | return linear 22 | 23 | 24 | def init_linear_( 25 | linear: Union[nn.Linear, nn.Embedding, None], 26 | init_type: str = "xavier", 27 | eps: float = 1e-6, 28 | ): 29 | if init_type == "xavier": 30 | return init_linear_xavier_(linear) 31 | elif init_type == "zero": 32 | return init_linear_zero_(linear, eps=eps) 33 | else: 34 | raise ValueError(f"Unknown init_type {init_type}") 35 | 36 | 37 | def init_layer_norm_(layer_norm: Union[nn.LayerNorm, None]): 38 | if layer_norm 
is None: 39 | return layer_norm 40 | nn.init.ones_(layer_norm.weight) 41 | nn.init.zeros_(layer_norm.bias) 42 | return layer_norm 43 | 44 | 45 | def init_params_recursively_(module: nn.Module): 46 | for name, child in module.named_children(): 47 | if hasattr(child, "reset_parameters"): 48 | child.reset_parameters() 49 | elif isinstance(child, nn.Linear): 50 | init_linear_(child) 51 | elif isinstance(child, nn.LayerNorm): 52 | init_layer_norm_(child) 53 | elif isinstance(child, nn.Embedding): 54 | init_linear_(child) 55 | elif isinstance(child, nn.Dropout): 56 | pass 57 | else: 58 | try: 59 | init_params_recursively_(child) 60 | except Exception as e: 61 | print(f"Failed to init {name} with {e}") 62 | -------------------------------------------------------------------------------- /docs/docs/experiment_design/backbones.md: -------------------------------------------------------------------------------- 1 | # Adding Backbones 2 | 3 | Backbones are pre-trained foundation models. 4 | 5 | Foundation models are essential to modern ML but are often difficult to work with. 6 | Design decisions made during pre-training (tokenization, architecture, io format) cannot be changed. 7 | At best, this results in many reimplementations for benchmarking or finetuning tasks, and a high risk of buggy code. 8 | At worst, these decisions can lock in users and exclude certain tasks and use-cases. 9 | 10 | AIDO.ModelGenerator eliminates the need for reimplementation and makes backbones task-agnostic: wrap your backbone in a standard interface, and reuse it across all inference and finetuning tasks. 11 | It also makes compatibility transparent: if a backbone fits the required interface, it can be used for any data-appropriate task. 12 | 13 | > Note: Backbones for 1D sequence modeling are universally supported. Other types of backbones included in AIDO.ModelGenerator (e.g. structure, image) are not yet universally supported, but will be in the future. 
14 | 15 | Available Backbones: 16 | 17 | - DNA: `aido_dna_7b`, `aido_dna_300m`, `aido_dna_dummy`, `aido_dna_debug`, `dna_onehot` 18 | - RNA: `aido_rna_1b600m`, `aido_rna_1b600m_cds`, `aido_rna_650m`, `aido_rna_650m_cds`, `aido_rna_300m_mars`, `aido_rna_25m_mars`, `aido_rna_1m_mars`, `aido_dna_dummy`, `aido_dna_debug`, `dna_onehot` 19 | - Protein: `aido_protein_16b`, `aido_protein_16b_v1`, `aido_protein2structoken_16b`, `aido_protein_debug`, `protein_onehot`, `aido_protein_rag_16b`, `aido_protein_rag_3b` 20 | - Cell (gene expression): `aido_cell_100m`, `aido_cell_10m`, `aido_cell_3m`, `geneformer` 21 | - OneHot: dummy model, only tokenizes, useful for non-FM baselines and quick tests 22 | 23 | At their core, backbones are PyTorch `nn.Module` objects with a few extra interfaces. 24 | To implement a new backbone, subclass a backbone interface and implement the required methods. 25 | 26 | ::: modelgenerator.backbones.SequenceBackboneInterface 27 | handler: python 28 | options: 29 | filters: 30 | - "!^__" 31 | show_root_heading: true 32 | show_source: true 33 | -------------------------------------------------------------------------------- /experiments/AIDO.RNA/mean_ribosome_load_prediction/README.md: -------------------------------------------------------------------------------- 1 | # Mean Ribosome Load Prediction 2 | Ribosomes are cellular structures responsible for protein synthesis, and the ribosome load on an mRNA molecule can influence the rate and efficiency of protein production, and the success of genetic engineering. Predicting ribosome load can provide valuable insights into gene expression regulation, translation efficiency, and cellular processes. We fully finetune AIDO.RNA-1.6B for mean ribosome load prediction using the dataset by [Sample _et al._](https://www.nature.com/articles/s41587-019-0164-5). We use the same train, test, and validation split used in a previous study [RiNALMo](https://arxiv.org/abs/2403.00043). 
def main():
    """Load a StructureTokenizerModel and write its codebook to disk.

    Command-line arguments:
        --output_path: Required. Destination file handed to ``torch.save``.
        --pretrained_model_name_or_path: Name or local path handed to
            ``StructureTokenizerModel.from_pretrained``. Defaults to
            ``genbio-ai/AIDO.StructureTokenizer``.

    The saved object is ``model.encoder.codebook.data`` moved to the CPU. The
    CLI help describes it as a ``(num_tokens, embedding_dim)`` matrix with one
    row per structure token.
    """
    cli_description = (
        "Extract the codebook of StructureTokenizerModel.\n"
        "The codebook is a matrix of shape (num_tokens, embedding_dim), where each row corresponds "
        "to the embedding of a structure token. The extracted codebook is saved as a PyTorch tensor (.pt) file."
    )
    model_help = (
        "The pretrained model name or local path to load the StructureTokenizerModel.\n"
        "Default: 'genbio-ai/AIDO.StructureTokenizer'."
    )

    cli = argparse.ArgumentParser(description=cli_description)
    cli.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="Path to save the codebook in .pt format.",
    )
    cli.add_argument(
        "--pretrained_model_name_or_path",
        type=str,
        default="genbio-ai/AIDO.StructureTokenizer",
        help=model_help,
    )
    options = cli.parse_args()

    tokenizer_model = StructureTokenizerModel.from_pretrained(
        options.pretrained_model_name_or_path
    )
    # Move the codebook to the CPU before it is serialized as a .pt file.
    torch.save(tokenizer_model.encoder.codebook.data.cpu(), options.output_path)
4 | They are specified with the `--model.adapter` argument in the CLI or in the `model.adapter` section of a configuration file. 5 | 6 | Adapters are the focal point for architecture design on top of backbones, and can be swapped with other adapters of the same type to benchmark different architectures. 7 | 8 | This reference overviews the available no-code adapters. 9 | If you would like to develop new adapters, see [Experiment Design](../../experiment_design). 10 | 11 | ```yaml 12 | # Example Adapter Configuration 13 | model: 14 | class_path: modelgenerator.tasks.SequenceRegression 15 | init_args: 16 | adapter: 17 | class_path: modelgenerator.adapters.MLPPoolAdapter 18 | init_args: 19 | pooling: mean_pooling 20 | hidden_sizes: 21 | - 512 22 | bias: true 23 | dropout: 0.1 24 | dropout_in_middle: false 25 | data: 26 | ... 27 | trainer: 28 | ... 29 | ``` 30 | 31 | ## Sequence Adapters 32 | 33 | These adapters make a single prediction for the entire input. 34 | 35 | ::: modelgenerator.adapters.MLPAdapter 36 | 37 | ::: modelgenerator.adapters.LinearCLSAdapter 38 | 39 | ::: modelgenerator.adapters.LinearMeanPoolAdapter 40 | 41 | ::: modelgenerator.adapters.LinearMaxPoolAdapter 42 | 43 | ::: modelgenerator.adapters.LinearTransformerAdapter 44 | 45 | ::: modelgenerator.adapters.ResNet2DAdapter 46 | 47 | ::: modelgenerator.adapters.ResNet1DAdapter 48 | 49 | ## Token Adapters 50 | 51 | These adapters make one prediction per token. 52 | 53 | ::: modelgenerator.adapters.LinearAdapter 54 | 55 | ::: modelgenerator.adapters.MLPAdapter 56 | 57 | ::: modelgenerator.adapters.MLPAdapterWithoutOutConcat 58 | 59 | ## Conditional Generation Adapters 60 | 61 | These adapters are used for conditional generation tasks. 62 | 63 | ::: modelgenerator.adapters.ConditionalLMAdapter 64 | 65 | ## Fusion Adapters 66 | 67 | These adapters are used for multi-modal fusion to combine multiple backbones. 
68 | 69 | ::: modelgenerator.adapters.MMFusionSeqAdapter 70 | 71 | ::: modelgenerator.adapters.MMFusionTokenAdapter 72 | -------------------------------------------------------------------------------- /experiments/AIDO.Cell/docker_readme.md: -------------------------------------------------------------------------------- 1 | # Data setup 2 | 3 | Data for cell classification tasks can be found in [cell-downstream-tasks](https://huggingface.co/datasets/genbio-ai/cell-downstream-tasks) on Hugging Face. 4 | 5 | To download all cell downstream tasks: 6 | ``` 7 | cd /path/to/ModelGenerator/modelgenerator 8 | git clone git@hf.co:datasets/genbio-ai/cell-downstream-tasks 9 | ``` 10 | 11 | You should only need to do this once. 12 | 13 | # Building the Docker image 14 | 15 | ```bash 16 | cd /path/to/ModelGenerator 17 | docker build -t finetune -f Dockerfile . 18 | ``` 19 | 20 | You should only need to do this once. 21 | 22 | # Hugging Face authentication 23 | 24 | If you need to access private or gated models/data: 25 | 26 | ```bash 27 | huggingface-cli login 28 | ``` 29 | 30 | # Fine-tuning a model 31 | 32 | ```bash 33 | cd /path/to/ModelGenerator 34 | docker run --rm --runtime=nvidia \ 35 | -v /home/user/ModelGenerator/configs:/workspace/configs \ 36 | -v /home/user/ModelGenerator/modelgenerator:/workspace/modelgenerator \ 37 | -v /home/user/ModelGenerator/experiments:/workspace/experiments \ 38 | -v /home/user/.cache/huggingface:/root/.cache/huggingface \ 39 | -v "/home/user/ModelGenerator/logs:/workspace/logs" \ 40 | finetune bash -c "mgen fit --config experiments/AIDO.Cell/cell_type_classification.yaml" 41 | ``` 42 | 43 | # Evaluating a checkpoint 44 | 45 | ```bash 46 | cd /path/to/ModelGenerator 47 | docker run --rm --runtime=nvidia \ 48 | -v /home/user/ModelGenerator/configs:/workspace/configs \ 49 | -v /home/user/ModelGenerator/modelgenerator:/workspace/modelgenerator \ 50 | -v /home/user/ModelGenerator/experiments:/workspace/experiments \ 51 | -v 
/home/user/.cache/huggingface:/root/.cache/huggingface \ 52 | -v "/home/user/ModelGenerator/logs:/workspace/logs" \ 53 | finetune bash -c "mgen test --config experiments/AIDO.Cell/cell_type_classification.yaml --ckpt_path /workspace/lightning_logs/version_X/checkpoints/my.ckpt" 54 | ``` 55 | 56 | # Other usage examples 57 | 58 | The example above fine-tunes and evaluates a model for cell type classification. Other usage examples are described below. 59 | 60 | ## Transcriptomic Clock Task 61 | Simply replace the `config` argument of `mgen fit` with `experiments/AIDO.Cell/transcriptomic_clock.yaml`. 62 | -------------------------------------------------------------------------------- /docs/docs/experiment_design/data.md: -------------------------------------------------------------------------------- 1 | # Adding Data Loaders 2 | 3 | AIDO.ModelGenerator uses [Lightning DataModules](https://lightning.ai/docs/pytorch/stable/data/datamodule.html) for dataset management and loading. 4 | We also provide a few tools to make data management more convenient, and work with common file types out-of-the-box. 5 | 6 | AIDO.ModelGenerator provides a `DataInterface` class that hides boilerplate, along with a `HFDatasetLoaderMixin` that combines Lightning DataModule structure and [HuggingFace Datasets](https://huggingface.co/docs/datasets) convenience together to quickly load data from HuggingFace or common file formats (e.g. tsv, csv, json, etc). 7 | More convenient mixins and example usage are outlined below. 8 | 9 | Many common tasks and data loaders are already implemented in AIDO.ModelGenerator, and only require setting new paths to run with new data. 10 | See the [Data API Reference](../api_reference/data.md) for all types of available data modules. 
11 | 12 | ::: modelgenerator.data.DataInterface 13 | handler: python 14 | options: 15 | filters: 16 | - "!^__" 17 | members: 18 | - setup 19 | - load_and_split_dataset 20 | show_root_heading: true 21 | show_source: true 22 | 23 | ## Useful Mixins 24 | 25 | ::: modelgenerator.data.HFDatasetLoaderMixin 26 | handler: python 27 | options: 28 | filters: 29 | - "!^__" 30 | show_root_heading: true 31 | show_source: true 32 | 33 | ::: modelgenerator.data.KFoldMixin 34 | handler: python 35 | options: 36 | filters: 37 | - "!^__" 38 | show_root_heading: true 39 | show_source: true 40 | 41 | ## Implementing a DataModule 42 | 43 | To transform datasets for task-specific behaviors (e.g. masking for masked language modeling), use `torch.utils.data.Dataset` objects to implement the transformation. 44 | Below is an example. 45 | 46 | ::: modelgenerator.data.MLMDataModule 47 | handler: python 48 | options: 49 | filters: 50 | - "!^__" 51 | members: 52 | - setup 53 | show_root_heading: true 54 | show_source: true 55 | 56 | ::: modelgenerator.data.MLMDataset 57 | handler: python 58 | options: 59 | filters: 60 | - "!^__" 61 | show_root_heading: true 62 | show_source: true 63 | -------------------------------------------------------------------------------- /docs/docs/tutorials/kfold_cross_validation.md: -------------------------------------------------------------------------------- 1 | # K-fold cross validation 2 | 3 | Datasets implementing the `DataInterface` with the `KFoldMixin` support semi-automatic k-fold cross validation for uncertainty estimation. 4 | 5 | We use translation efficiency prediction as an example task to demonstrate how to do a k-fold cross validation in ModelGenerator. The logic is to split the dataset into k folds and use each fold in turn as the test set. 6 | 7 | #### Data configs 8 | For a cross validation task, we input only one dataset named `train` containing a column `fold_id` indicating the fold index for each sample.
def buildjob(input_json, input_msa_dir, output_json):
    """Attach precomputed-MSA locations to every protein chain of a job JSON.

    The MSA directory is scanned for ``<input_msa_dir>/*/*/*.fasta`` files.
    The second line of each FASTA file is taken as the protein sequence and
    indexed against the directory that holds it. Each ``proteinChain`` entity
    in the input job then receives an ``msa`` entry with:

    * ``precomputed_msa_dir``: ``/msa_database/<dir relative to input_msa_dir>``
      (presumably the path at which the MSA database is mounted when the job
      runs -- confirm against the runner);
    * ``non_pairing_msa_names``: sorted names of every other entry found next
      to the FASTA file.

    If several FASTA files carry the same sequence, the one visited last wins.

    Args:
        input_json: Path of the job JSON to read: a list of jobs, each with a
            ``sequences`` list of entities.
        input_msa_dir: Root directory of the precomputed MSA files.
        output_json: Path the augmented job JSON is written to.

    Raises:
        ValueError: If a FASTA file has no second line, or a protein chain's
            sequence has no matching FASTA file under ``input_msa_dir``.
    """
    # Reverse index: protein sequence -> MSA info for that sequence.
    msa_dict = {}
    for fasta_path in glob.glob(os.path.join(input_msa_dir, '*/*/*.fasta')):
        msa_dir = os.path.dirname(fasta_path)
        # relpath instead of str.split(input_msa_dir): splitting on the root
        # string mangles the result whenever that string recurs later in the
        # path (e.g. root "msa" with a sub-directory "msa_1").
        msa_subdir = os.path.relpath(msa_dir, input_msa_dir)
        # Everything stored next to the FASTA file except the file itself.
        # Sorted so the output does not depend on directory listing order.
        sibling_names = sorted(
            os.path.basename(path)
            for path in glob.glob(os.path.join(msa_dir, '*'))
            if path != fasta_path
        )
        with open(fasta_path, 'r') as f:
            fasta_lines = f.readlines()
        if len(fasta_lines) < 2:
            raise ValueError(f"FASTA file {fasta_path} has no sequence line.")
        # Line 0 is the header; line 1 is taken as the sequence.
        sequence = fasta_lines[1].strip()
        msa_dict[sequence] = {
            "precomputed_msa_dir": os.path.join('/msa_database', msa_subdir),
            "non_pairing_msa_names": sibling_names
        }

    with open(input_json, 'r') as f:
        data = json.load(f)

    for job in data:
        for entity in job['sequences']:
            if "proteinChain" not in entity:
                continue
            protein_chain = entity['proteinChain']
            sequence = protein_chain['sequence']
            # Explicit raise instead of assert so the check survives `python -O`.
            if sequence not in msa_dict:
                raise ValueError(f"Sequence {sequence} not found in MSA directory.")
            protein_chain['msa'] = msa_dict[sequence]

    with open(output_json, 'w') as f:
        json.dump(data, f, indent=4)


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Add precomputed MSA information to the protein chains of a job JSON.')
    # All three paths are needed; fail at parse time instead of later on open(None).
    parser.add_argument('--input', type=str, required=True, help='Input job JSON file')
    parser.add_argument('--msa-db', type=str, required=True, help='Input directory for MSA files, built using job2msa.py')
    parser.add_argument('--output', type=str, required=True, help='Output job JSON file')
    args = parser.parse_args()
    buildjob(args.input, args.msa_db, args.output)
# Fixed ordering of atom names; an atom's position in this list is its index
# in any per-residue array that has one slot per atom type.
atom_types = [
    "N", "CA", "C", "CB", "O",
    "CG", "CG1", "CG2",
    "OG", "OG1", "SG",
    "CD", "CD1", "CD2",
    "ND1", "ND2", "OD1", "OD2", "SD",
    "CE", "CE1", "CE2", "CE3",
    "NE", "NE1", "NE2", "OE1", "OE2",
    "CH2", "NH1", "NH2", "OH",
    "CZ", "CZ2", "CZ3", "NZ",
    "OXT",
]
# atom name -> index in `atom_types`
atom_order = dict(zip(atom_types, range(len(atom_types))))
atom_type_num = len(atom_types)  # := 37.

# One-letter -> three-letter residue codes. Insertion order defines the residue
# indices below; "X"/"UNK" is deliberately last.
restype_1to3 = {
    "A": "ALA", "R": "ARG", "N": "ASN", "D": "ASP", "C": "CYS",
    "Q": "GLN", "E": "GLU", "G": "GLY", "H": "HIS", "I": "ILE",
    "L": "LEU", "K": "LYS", "M": "MET", "F": "PHE", "P": "PRO",
    "S": "SER", "T": "THR", "W": "TRP", "Y": "TYR", "V": "VAL",
    "X": "UNK",
}
# three-letter -> one-letter
restype_3to1 = {three: one for one, three in restype_1to3.items()}
# one-letter -> integer index, and back
restype_1toidx = {one: idx for idx, one in enumerate(restype_1to3)}
restype_idxto1 = {idx: one for one, idx in restype_1toidx.items()}
restype_num = len(restype_1to3)
# Index of the trailing "X" (unknown residue) entry.
unknown_restype_idx = restype_num - 1
6 | This task uses the pre-trained models directly, and does not require finetuning. 7 | 8 | To reproduce the dependency mapping results from the AIDO.RNA paper, run the following from the ModelGenerator root directory: 9 | ``` 10 | # Inference 11 | mgen predict --config experiments/AIDO.RNA/dependency_mapping/config.yaml 12 | 13 | # Plotting 14 | python experiments/AIDO.RNA/dependency_mapping/plot_dependency_maps.py \ 15 | -i depmap_predictions \ 16 | -o depmap_plots \ 17 | -v experiments/AIDO.RNA/dependency_mapping/DNA.txt \ 18 | -t modelgenerator/huggingface_models/rnabert/vocab.txt 19 | ``` 20 | 21 | To create new dependency maps, 22 | 23 | 1. Gather your sequences into a .tsv file with an `id` and `sequence` column. 24 | 2. Run `mgen predict --config config.yaml` where 25 | ``` 26 | model: 27 | class_path: Inference 28 | init_args: 29 | backbone: 30 | data: 31 | class_path: DependencyMappingDataModule 32 | init_args: 33 | path: # Note: this errors for ., use ../dependency_mapping if necessary 34 | test_split_files: 35 | - 36 | vocab_file: .txt # E.g. experiments/AIDO.RNA/dependency_mapping/DNA.txt 37 | trainer: 38 | callbacks: 39 | - class_path: modelgenerator.callbacks.PredictionWriter 40 | dict_kwargs: 41 | output_dir: predictions 42 | filetype: pt 43 | ``` 44 | 45 | 3. Run the plotting tool 46 | ``` 47 | python experiments/AIDO.RNA/dependency_mapping/plot_dependency_maps.py \ 48 | -i \ 49 | -o \ 50 | -v \ 51 | -t 52 | ``` 53 | 54 | The output will be files of the name `.png` in the output directory, with heatmaps of dependencies and logos with sequence information content. 
55 | -------------------------------------------------------------------------------- /scripts/wandb_sweep/README.md: -------------------------------------------------------------------------------- 1 | # How to use W&B Sweeps with ModelGenerator for hyperparameter tuning 2 | 3 | ## Caveats 4 | W&B agents cannot launch multi-node training jobs, which causes great difficulties integrating W&B Sweeps with ModelGenerator. This guide is based on a hacky workaround that introduces many limitations. 5 | 6 | ### The workaround 7 | An agent is configured to exit immediately after retrieving the next set of hyperparameters and outputting the complete training command to stdout. This command is then executed on each node without being monitored by an active agent. 8 | 9 | ### Limitations 10 | 1. All agent functionalities are lost. It is not possible to use the agent to start/stop/resume/update training runs. Users must manually terminate training runs or implement early-stopping mechanisms. 11 | 2. Failed runs have to be re-run manually using your own sbatch scripts. The command for that run is available in stdout of the failed run. 12 | 3. Parameter importance plots use the wrong parameters by default; this can be fixed manually by selecting the right parameter names in your mgen config. 13 | 14 | >**NOTE**: Before proceeding, please make sure that your training job uses **WandbLogger**. 15 | ## SLURM 16 | ### Step 1: create a wandb sweep 17 | The default `slurm_sweep.yaml` creates a wandb sweep with the training command `mgen fit --config .local/test.yaml` under the project `autotune-test`. Please modify it to suit your experiments. Key values to change are **project**, **command** and **parameters**. 18 | 19 | Run the following command to create a wandb sweep: 20 | ```bash 21 | wandb sweep scripts/wandb_sweep/slurm_sweep.yaml 22 | ``` 23 | Take a note of your sweep ID for step 2.
It looks like `ENTITY/PROJECT/SWEEP_ID` and is found in the output: `wandb: Run sweep agent with: wandb agent ENTITY/PROJECT/SWEEP_ID` 24 | ### Step 2: submit the next training job to SLURM 25 | Similar to step 1, you need to edit `slurm_agent.sh` for your experiment. The most important changes are **WANDB_PROJECT** and **SWEEP_ID**. 26 | 27 | The following command creates one sweep agent that runs training with the next set of hyperparameters. 28 | ```bash 29 | sbatch scripts/wandb_sweep/slurm_agent.sh 30 | ``` 31 | 32 | >**TIPS**: To queue your other sweep runs, use `sbatch --dependency`. To launch your other sweep runs in parallel, use `sbatch --array=1-X` where `X` is the number of parallel runs. 33 | -------------------------------------------------------------------------------- /docs/docs/tutorials/dependency_mapping.md: -------------------------------------------------------------------------------- 1 | # Dependency Mapping 2 | 3 | Dependency mapping is an _in silico_ mutagenesis technique that identifies co-conserved elements in a sequence. 4 | AIDO.ModelGenerator implements the procedure proposed by [Tomaz da Silva et al.](https://www.biorxiv.org/content/10.1101/2024.07.27.605418v1) 5 | We use this to mine functional genomic elements in the [AIDO.DNA](https://doi.org/10.1101/2024.12.01.625444) paper with the [AIDO.DNA-7B](https://huggingface.co/genbio-ai/AIDO.DNA-7B) and [AIDO.DNA-300M](https://huggingface.co/genbio-ai/AIDO.DNA-300M) models. 6 | This task uses the pre-trained models directly, and does not require finetuning.
7 | 8 | To reproduce the dependency mapping results from the AIDO.DNA paper, run the following from the ModelGenerator root directory: 9 | ``` 10 | # Inference 11 | mgen predict --config experiments/AIDO.DNA/dependency_mapping/config.yaml 12 | 13 | # Plotting 14 | python experiments/AIDO.DNA/dependency_mapping/plot_dependency_maps.py \ 15 | -i depmap_predictions \ 16 | -o depmap_plots \ 17 | -v experiments/AIDO.DNA/dependency_mapping/DNA.txt \ 18 | -t modelgenerator/huggingface_models/rnabert/vocab.txt 19 | ``` 20 | 21 | To create new dependency maps, 22 | 23 | 1. Gather your sequences into a .tsv file with an `id` and `sequence` column. 24 | 2. Run `mgen predict --config config.yaml` where 25 | ``` 26 | model: 27 | class_path: Inference 28 | init_args: 29 | backbone: 30 | data: 31 | class_path: DependencyMappingDataModule 32 | init_args: 33 | path: # Note: this errors for ., use ../dependency_mapping if necessary 34 | test_split_files: 35 | - 36 | vocab_file: .txt # E.g. experiments/AIDO.DNA/dependency_mapping/DNA.txt 37 | trainer: 38 | callbacks: 39 | - class_path: modelgenerator.callbacks.PredictionWriter 40 | dict_kwargs: 41 | output_dir: predictions 42 | filetype: pt 43 | ``` 44 | 45 | 3. Run the plotting tool 46 | ``` 47 | python experiments/AIDO.DNA/dependency_mapping/plot_dependency_maps.py \ 48 | -i \ 49 | -o \ 50 | -v \ 51 | -t 52 | ``` 53 | 54 | The output will be files of the name `.png` in the output directory, with heatmaps of dependencies and logos with sequence information content. 55 | -------------------------------------------------------------------------------- /experiments/AIDO.DNA/dependency_mapping/README.md: -------------------------------------------------------------------------------- 1 | # Dependency Mapping 2 | 3 | Dependency mapping is an _in silico_ mutagenesis technique that identifies co-conserved elements in a sequence. 
4 | AIDO.ModelGenerator implements the procedure proposed by [Tomaz da Silva et al.](https://www.biorxiv.org/content/10.1101/2024.07.27.605418v1) 5 | We use this to mine functional genomic elements in the [AIDO.DNA](https://doi.org/10.1101/2024.12.01.625444) paper with the [AIDO.DNA-7B](https://huggingface.co/genbio-ai/AIDO.DNA-7B) and [AIDO.DNA-300M](https://huggingface.co/genbio-ai/AIDO.DNA-300M) models. 6 | This task uses the pre-trained models directly, and does not require finetuning. 7 | 8 | To reproduce the dependency mapping results from the AIDO.DNA paper, run the following from the ModelGenerator root directory: 9 | ``` 10 | # Inference 11 | mgen predict --config experiments/AIDO.DNA/dependency_mapping/config.yaml 12 | 13 | # Plotting 14 | python experiments/AIDO.DNA/dependency_mapping/plot_dependency_maps.py \ 15 | -i depmap_predictions \ 16 | -o depmap_plots \ 17 | -v experiments/AIDO.DNA/dependency_mapping/DNA.txt \ 18 | -t modelgenerator/huggingface_models/rnabert/vocab.txt 19 | ``` 20 | 21 | To create new dependency maps, 22 | 23 | 1. Gather your sequences into a .tsv file with an `id` and `sequence` column. 24 | 2. Run `mgen predict --config config.yaml` where 25 | ``` 26 | model: 27 | class_path: Inference 28 | init_args: 29 | backbone: 30 | data: 31 | class_path: DependencyMappingDataModule 32 | init_args: 33 | path: # Note: this errors for ., use ../dependency_mapping if necessary 34 | test_split_files: 35 | - 36 | vocab_file: .txt # E.g. experiments/AIDO.DNA/dependency_mapping/DNA.txt 37 | trainer: 38 | callbacks: 39 | - class_path: modelgenerator.callbacks.PredictionWriter 40 | dict_kwargs: 41 | output_dir: predictions 42 | filetype: pt 43 | ``` 44 | 45 | 3. 
Run the plotting tool 46 | ``` 47 | python experiments/AIDO.DNA/dependency_mapping/plot_dependency_maps.py \ 48 | -i \ 49 | -o \ 50 | -v \ 51 | -t 52 | ``` 53 | 54 | The output will be files of the name `.png` in the output directory, with heatmaps of dependencies and logos with sequence information content. 55 | -------------------------------------------------------------------------------- /docs/docs/tutorials/finetuning_scheduler.md: -------------------------------------------------------------------------------- 1 | For some of our experiments, we leverage the [gradual unfreezing finetuning scheduler](https://github.com/genbio-ai/ModelGenerator/blob/main/modelgenerator/callbacks.py#L213), adapted from [RiNALMo](https://arxiv.org/abs/2403.00043)'s [code repository](https://github.com/lbcb-sci/RiNALMo/blob/main/rinalmo/utils/finetune_callback.py). 2 | 3 | #### Creating `schedule` 4 | To use a FT scheduler, we first have to create a schedule and save it as a `.yaml` file. An example schedule is shown below: 5 | ``` 6 | 0: 7 | - adapter.* 8 | 3: 9 | - backbone.encoder.encoder.ln.* 10 | - backbone.encoder.encoder.layer.32.* 11 | ``` 12 | 13 | In this example, when the model is set up, all the layers are first frozen. Then before the `0-th` epoch starts, all the parameters in the `adapter` module are unfrozen, and they remain unfrozen (trainable) for the rest of the training run. Similarly, before the `3-rd` epoch starts, the parameters in the `backbone.encoder.encoder.ln` module (i.e., the last layer norm module of the backbone's encoder) and in the `backbone.encoder.encoder.layer.32` module are unfrozen, and they remain unfrozen until the training ends. Here we can add any other layer or module if we want to unfreeze it before the start of some specific epoch. 14 | 15 | #### Using `schedule` when finetuning with ModelGenerator 16 | In order to use this schedule for finetuning, we can simply pass its path as the CLI argument `--trainer.callbacks.ft_schedule_path` when calling `mgen fit`.
17 | 18 | The following is an example of finetuning the [AIDO.RNA-1.6B](https://huggingface.co/genbio-ai/AIDO.RNA-1.6B) model for RNA secondary structure prediction, with a **scheduler named `layers_0_32.yaml`**. (**NOTE:** Please refer to the [corresponding experiment folder](https://github.com/genbio-ai/ModelGenerator/tree/main/experiments/AIDO.RNA/rna_secondary_structure_prediction) for details of this experiment): 19 | ``` 20 | cd experiments/AIDO.RNA/rna_secondary_structure_prediction 21 | MGEN_DATA_DIR=~/mgen_data 22 | DATASET_NAME=bpRNA 23 | CKPT_SAVE_DIR=logs/rna_ss/${DATASET_NAME} 24 | mgen fit --config rna_ss_prediction.yaml \ 25 | --data.path ${MGEN_DATA_DIR}/modelgenerator/datasets/rna_ss_data/ \ 26 | --data.dataset ${DATASET_NAME} \ 27 | --trainer.default_root_dir ${CKPT_SAVE_DIR} \ 28 | --trainer.callbacks.ft_schedule_path ft_schedules/layers_0_32.yaml \ 29 | --trainer.devices 0,1,2,3 30 | ``` 31 | -------------------------------------------------------------------------------- /experiments/AIDO.StructurePrediction/fold/openfold_local/utils/geometry/quat_rigid.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 GenBio AI. 2 | # Copyright 2024 ByteDance and/or its affiliates. 3 | # Copyright 2021 AlQuraishi Laboratory 4 | # Copyright 2021 DeepMind Technologies Limited 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License.
class QuatRigid(nn.Module):
    """Project activations to a rigid transform (quaternion rotation + translation).

    A single linear layer emits, along the last dimension, the quaternion
    components followed by three translation components. With ``full_quat``
    all four quaternion components are predicted (7 outputs); otherwise only
    ``qx, qy, qz`` are predicted and ``qw`` is fixed to one (6 outputs).
    """

    def __init__(self, c_hidden, full_quat):
        """
        Args:
            c_hidden: Input feature size of the linear projection.
            full_quat: Whether the projection also predicts ``qw``.
        """
        super().__init__()
        self.full_quat = full_quat
        # Quaternion components (4, or 3 when qw is implied) + 3 for translation.
        out_features = 7 if self.full_quat else 6

        self.linear = Linear(c_hidden, out_features, init="final", precision=torch.float32)

    def forward(self, activations: torch.Tensor) -> Rigid3Array:
        """Map ``activations`` to a ``Rigid3Array``.

        The quaternion is passed to ``Rot3Array.from_quaternion`` with
        ``normalize=True``.
        """
        # NOTE: During training, this needs to be run in higher precision
        components = torch.unbind(self.linear(activations), dim=-1)

        n_quat = 4 if self.full_quat else 3
        quat_part, shift_part = components[:n_quat], components[n_quat:]
        if self.full_quat:
            qw, qx, qy, qz = quat_part
        else:
            qx, qy, qz = quat_part
            # Real part is not predicted in this mode; use a constant one.
            qw = torch.ones_like(qx)

        return Rigid3Array(
            Rot3Array.from_quaternion(qw, qx, qy, qz, normalize=True),
            Vec3Array(*shift_part),
        )
class FixedBackboneDesignEncoderDecoder(nn.Module):
    """Base class for fixed-backbone design encoder-decoder models.

    The constructor stores a merged configuration on ``self.cfg``. Every
    modelling hook below raises ``NotImplementedError`` in this class
    (presumably they are overridden by concrete subclasses).
    """

    # Class-level defaults that ``_update_cfg`` merges with the supplied config.
    _default_cfg = {}

    def __init__(self, cfg) -> None:
        super().__init__()
        self._update_cfg(cfg)

    def _update_cfg(self, cfg):
        """Merge ``_default_cfg`` with ``cfg`` via OmegaConf and store it as ``self.cfg``."""
        # Imported lazily, so omegaconf is only needed once a model is instantiated.
        from omegaconf import OmegaConf

        merged = OmegaConf.merge(self._default_cfg, cfg)
        self.cfg = merged

    @classmethod
    def from_config(cls, cfg):
        """Alternate constructor hook; not implemented in the base class."""
        raise NotImplementedError

    def forward_encoder(self, batch):
        """Encoder pass hook; not implemented in the base class."""
        raise NotImplementedError

    def forward_decoder(self, prev_decoder_out, encoder_out):
        """Decoder pass hook; not implemented in the base class."""
        raise NotImplementedError

    def initialize_output_tokens(self, batch, encoder_out):
        """Output-token initialization hook; not implemented in the base class."""
        raise NotImplementedError

    def forward(self, coords, coord_mask, tokens, token_padding_mask=None, **kwargs):
        """Full forward pass hook; not implemented in the base class."""
        raise NotImplementedError

    def sample(self, coords, coord_mask, tokens=None, token_padding_mask=None, **kwargs):
        """Sampling hook; not implemented in the base class."""
        raise NotImplementedError
def select_module(config, sub_config, module_name):
    """Build the backbone module selected by ``module_name``.

    Args:
        config: Top-level model config; ``config['seq_len']`` is read as the
            maximum sequence length.
        sub_config: Encoder- or decoder-specific config providing
            ``hidden_dim``, ``depth`` and ``heads``. For ``'performer'`` it
            must also provide ``dim_head`` and may provide ``ff_dropout`` /
            ``attn_dropout`` (both default to ``0.0``).
        module_name: Either ``'performer'`` or ``'transformer'``.

    Returns:
        A ``PerformerModule`` or ``pytorchTransformerModule`` instance.

    Raises:
        ValueError: If ``module_name`` is not one of the supported names.
    """
    if module_name == 'performer':
        return PerformerModule(
            max_seq_len=config['seq_len'],
            dim=sub_config['hidden_dim'],
            depth=sub_config['depth'],
            heads=sub_config['heads'],
            dim_head=sub_config['dim_head'],
            ff_dropout=sub_config.get('ff_dropout', 0.0),
            attn_dropout=sub_config.get('attn_dropout', 0.0),
        )
    if module_name == 'transformer':
        return pytorchTransformerModule(
            max_seq_len=config['seq_len'],
            dim=sub_config['hidden_dim'],
            depth=sub_config['depth'],
            heads=sub_config['heads'],
        )
    # Previously this printed a message and called exit(0), which killed the
    # interpreter with a *success* status. Raise so the failure is visible and
    # callers can handle it.
    raise ValueError(
        f"module type error: unknown module_name {module_name!r}; "
        "expected 'performer' or 'transformer'"
    )
def get_sub_config(config, target):
    """
    Collect the entries of ``config`` whose key contains ``target``.

    Each selected key has every occurrence of ``target + '_'`` removed before
    it is used as a key of the returned dict; values are copied unchanged.
    If two keys map to the same stripped name, the later one wins.
    """
    marker = target + '_'
    return {
        name.replace(marker, ''): config[name]
        for name in config.keys()
        if target in name
    }