├── tests
├── data
│ └── __init__.py
├── tasks
│ └── __init__.py
├── adapters
│ └── __init__.py
└── backbones
│ └── __init__.py
├── docs
├── .gitignore
├── docs
│ ├── assets
│ │ └── images
│ │ │ ├── icon.png
│ │ │ ├── logo.png
│ │ │ ├── genbio_logo.png
│ │ │ ├── genbio_header.png
│ │ │ └── structure_tokenizer
│ │ │ ├── cameo_0.png
│ │ │ ├── select_files.png
│ │ │ ├── launch_protein_viewer.png
│ │ │ ├── structure_tokenizer.png
│ │ │ ├── structure_prediction_model.png
│ │ │ └── visualize_reconstruction.png
│ ├── api_reference
│ │ ├── callbacks.md
│ │ ├── trainer.md
│ │ └── adapters.md
│ ├── usage
│ │ └── embedding_caching.md
│ ├── experiment_design
│ │ ├── backbones.md
│ │ └── data.md
│ └── tutorials
│ │ ├── kfold_cross_validation.md
│ │ ├── dependency_mapping.md
│ │ └── finetuning_scheduler.md
└── README.md
├── modelgenerator
├── __init__.py
├── distributed
│ ├── __init__.py
│ └── fsdp
│ │ └── __init__.py
├── huggingface_models
│ ├── __init__.py
│ ├── genbio
│ │ ├── __init__.py
│ │ └── modeling_genbio.py
│ ├── borzoi_pytorch
│ │ ├── __init__.py
│ │ ├── pytorch_borzoi_helpers.py
│ │ └── config_borzoi.py
│ ├── scfoundation
│ │ └── pretrainmodels
│ │ │ ├── __init__.py
│ │ │ ├── transformer.py
│ │ │ └── select_model.py
│ ├── geneformer
│ │ ├── token_dictionary_gc95M.pkl
│ │ ├── gene_name_id_dict_gc95M.pkl
│ │ ├── ensembl_mapping_dict_gc95M.pkl
│ │ ├── gene_median_dictionary_gc95M.pkl
│ │ └── __init__.py
│ ├── enformer_pytorch
│ │ ├── precomputed
│ │ │ └── tf_gammas.pt
│ │ ├── __init__.py
│ │ ├── LICENSE
│ │ └── config_enformer.py
│ ├── rnabert
│ │ └── vocab.txt
│ ├── fm4bio
│ │ └── vocab_protein.txt
│ └── scimilarity
│ │ └── model_v1.1
│ │ └── layer_sizes.json
├── structure_tokenizer
│ ├── __init__.py
│ ├── layers
│ │ ├── __init__.py
│ │ ├── esmfold
│ │ │ ├── __init__.py
│ │ │ └── categorical_mixture.py
│ │ └── equivariant
│ │ │ └── __init__.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── constants
│ │ │ ├── __init__.py
│ │ │ ├── structure_tokenizer.py
│ │ │ └── residue_constants.py
│ │ ├── geometry
│ │ │ └── __init__.py
│ │ ├── types.py
│ │ ├── distributed.py
│ │ ├── misc.py
│ │ └── init_params.py
│ ├── callbacks
│ │ └── __init__.py
│ ├── configs
│ │ ├── __init__.py
│ │ └── lightning_configs.py
│ ├── datasets
│ │ └── __init__.py
│ ├── models
│ │ └── __init__.py
│ └── README.md
├── rna_inv_fold
│ └── gRNAde_structure_encoder
│ │ ├── src
│ │ ├── __init__.py
│ │ └── data
│ │ │ └── __init__.py
│ │ └── LICENSE
├── tasks
│ └── __init__.py
├── utils
│ └── __init__.py
├── adapters
│ ├── __init__.py
│ └── base.py
└── prot_inv_fold
│ └── proteinMPNN
│ └── proteinMPNN_model_utils.py
├── experiments
├── AIDO.Cell
│ ├── requirements.txt
│ ├── README.md
│ ├── extract_features.py
│ ├── readme.md
│ ├── sctab_conversion.py
│ └── docker_readme.md
├── AIDO.StructurePrediction
│ ├── fold
│ │ ├── __init__.py
│ │ ├── data
│ │ │ ├── __init__.py
│ │ │ └── ccd_data.py
│ │ ├── metrics
│ │ │ └── __init__.py
│ │ ├── model
│ │ │ ├── __init__.py
│ │ │ ├── modules
│ │ │ │ ├── __init__.py
│ │ │ │ └── head.py
│ │ │ └── layer_norm
│ │ │ │ ├── __init__.py
│ │ │ │ ├── kernel
│ │ │ │ └── compat.h
│ │ │ │ └── torch_ext_compile.py
│ │ ├── utils
│ │ │ ├── __init__.py
│ │ │ ├── data_process.py
│ │ │ ├── hash_encoder.py
│ │ │ ├── logger.py
│ │ │ ├── file_io.py
│ │ │ ├── seed.py
│ │ │ └── geometry.py
│ │ └── openfold_local
│ │ │ ├── data
│ │ │ ├── __init__.py
│ │ │ ├── tools
│ │ │ │ ├── __init__.py
│ │ │ │ └── utils.py
│ │ │ └── errors.py
│ │ │ ├── np
│ │ │ └── __init__.py
│ │ │ ├── model
│ │ │ └── __init__.py
│ │ │ ├── utils
│ │ │ ├── __init__.py
│ │ │ ├── kernel
│ │ │ │ ├── __init__.py
│ │ │ │ └── csrc
│ │ │ │ │ ├── compat.h
│ │ │ │ │ ├── softmax_cuda_stub.cpp
│ │ │ │ │ └── softmax_cuda.cpp
│ │ │ ├── geometry
│ │ │ │ ├── utils.py
│ │ │ │ ├── __init__.py
│ │ │ │ └── quat_rigid.py
│ │ │ └── precision_utils.py
│ │ │ ├── __init__.py
│ │ │ └── README.md
│ ├── runner
│ │ └── __init__.py
│ ├── .gitignore
│ ├── src
│ │ └── genbio
│ │ │ └── aidosp
│ │ │ ├── cli
│ │ │ ├── __init__.py
│ │ │ ├── util
│ │ │ │ └── __init__.py
│ │ │ ├── completions
│ │ │ │ ├── __init__.py
│ │ │ │ ├── genbio-aidosp.fish
│ │ │ │ ├── genbio-aidosp-complete.bash
│ │ │ │ ├── commands.py
│ │ │ │ └── genbio-aidosp-complete.zsh
│ │ │ ├── base.py
│ │ │ └── predict.py
│ │ │ ├── scripts
│ │ │ ├── __init__.py
│ │ │ ├── download_colabfold_envdb.sh
│ │ │ └── download_uniref30.sh
│ │ │ ├── msa_retrieve
│ │ │ ├── __init__.py
│ │ │ ├── configs
│ │ │ │ ├── __init__.py
│ │ │ │ └── mmseqs.yaml
│ │ │ ├── msar
│ │ │ │ ├── __init__.py
│ │ │ │ ├── tools
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── utils.py
│ │ │ │ └── utils
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── general.py
│ │ │ │ │ ├── errors.py
│ │ │ │ │ ├── logger.py
│ │ │ │ │ └── io_utils.py
│ │ │ └── bin
│ │ │ │ └── search_msa.sh
│ │ │ └── __init__.py
│ ├── .gitattributes
│ ├── examples
│ │ ├── T1106s1-D1.fasta
│ │ ├── T1104-D1.fasta
│ │ ├── T1106s2-D1.fasta
│ │ └── T1109-D1.fasta
│ ├── assets
│ │ └── img
│ │ │ ├── ana.png
│ │ │ ├── hln.png
│ │ │ ├── vis.png
│ │ │ ├── figure(gt-yellow vs af3-blue).png
│ │ │ └── figure(gt-yellow vs our-green).png
│ ├── job2fasta.py
│ ├── configs
│ │ └── mmseqs.yaml
│ ├── Dockerfile
│ ├── scripts
│ │ └── run_inference.sh
│ └── buildjob.py
├── AIDO.DNA
│ ├── dependency_mapping
│ │ ├── DNA.txt
│ │ ├── requirements.txt
│ │ ├── config.yaml
│ │ ├── depmap.csv
│ │ └── README.md
│ ├── zeroshot_variant_effect_prediction
│ │ ├── Clinvar_300M_zeroshot_Diff.yaml
│ │ ├── Clinvar_7B_zeroshot_Distance.yaml
│ │ └── Clinvar_300M_zeroshot_Distance.yaml
│ └── sequence_classification
│ │ ├── nt_promoter_all.yaml
│ │ ├── gue_core_promoter_all.yaml
│ │ ├── nt_enhancers.yaml
│ │ └── gue_splice_reconstruction.yaml
├── AIDO.RNA
│ ├── dependency_mapping
│ │ ├── DNA.txt
│ │ ├── requirements.txt
│ │ ├── depmap.csv
│ │ ├── config.yaml
│ │ └── README.md
│ ├── mean_ribosome_load_prediction
│ │ ├── ft_schedules
│ │ │ └── two_step.yaml
│ │ └── README.md
│ ├── modification_site_prediction.sh
│ ├── ncrna_family_classfification.sh
│ ├── splice_site_prediction.sh
│ ├── rna_inverse_folding
│ │ └── rna_inv_fold_test.yaml
│ ├── expression_level_prediction.sh
│ ├── translation_efficiency_prediction.sh
│ ├── protein_abundance_prediction.sh
│ ├── transcript_abundance_prediction.sh
│ ├── rna_secondary_structure_prediction
│ │ ├── rna_secondary_structure_prediction.sh
│ │ └── ft_schedules
│ │ │ └── layers_0_32.yaml
│ ├── multimodal_isoform_expression
│ │ └── isoform_expression_prediction.sh
│ └── demo_mrna_vaccine
│ │ └── get_mean_embeddings.py
├── AIDO.StructureTokenizer
│ ├── protein2structoken_example_input.csv
│ ├── decode_example_input.tsv
│ ├── decode.yaml
│ ├── encode_decode.yaml
│ ├── encode.yaml
│ ├── structure_encoding.sh
│ ├── protein2structoken_16b.yaml
│ └── extract_structure_tokenizer_codebook.py
├── AIDO.Protein-RAG
│ ├── xTrimo_RAG
│ │ ├── configs
│ │ │ ├── wandb.yaml
│ │ │ └── prediction_writer.yaml
│ │ └── init_env.sh
│ └── DMS_RAG
│ │ ├── configs
│ │ ├── wandb.yaml
│ │ └── prediction_writer.yaml
│ │ └── init_env.sh
├── AIDO.Protein
│ ├── DMS
│ │ ├── train_indels_LP.sh
│ │ ├── train_indels_LoRA_DDP.sh
│ │ ├── train_sub_LoRA_DDP.sh
│ │ ├── train_sub_LoRA_FSDP.sh
│ │ └── configs
│ │ │ ├── indels_LP_DDP.yaml
│ │ │ ├── indels_LoRA_DDP.yaml
│ │ │ ├── substitution_LoRA_DDP.yaml
│ │ │ └── substitution_LoRA_FSDP.yaml
│ ├── protein_inverse_folding
│ │ ├── merge_ckpt.py
│ │ ├── protein_inv_fold_test.yaml
│ │ └── end2end_inference.sh
│ └── xTrimo
│ │ ├── ssp_q3.sh
│ │ ├── fold_prediction.sh
│ │ ├── tcr_pmhc_affinity.sh
│ │ ├── fluorescence_prediction.sh
│ │ ├── contact_prediction_binary.sh
│ │ └── peptide_HLA_MHC_affinity.sh
└── AIDO.Tissue
│ └── emb.xenium.yaml
├── configs
├── examples
│ ├── quick_dev_run.yaml
│ ├── save_predictions.yaml
│ ├── wandb.yaml
│ └── lora_backbone.yaml
└── defaults.yaml
├── scripts
└── wandb_sweep
│ ├── slurm_sweep.yaml
│ ├── slurm_agent.sh
│ └── README.md
├── .github
└── workflows
│ ├── publish.yml
│ ├── tests.yml
│ └── docs.yml
├── .pre-commit-config.yaml
├── Dockerfile
└── CONTRIBUTING.md
/tests/data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/tasks/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | site/
2 |
--------------------------------------------------------------------------------
/modelgenerator/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/adapters/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/backbones/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/modelgenerator/distributed/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/modelgenerator/distributed/fsdp/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/modelgenerator/huggingface_models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/modelgenerator/structure_tokenizer/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.Cell/requirements.txt:
--------------------------------------------------------------------------------
1 | scanpy
2 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/modelgenerator/huggingface_models/genbio/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/modelgenerator/structure_tokenizer/layers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/modelgenerator/structure_tokenizer/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/runner/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/.gitignore:
--------------------------------------------------------------------------------
1 | version.py
2 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/metrics/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/model/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/modelgenerator/structure_tokenizer/layers/esmfold/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/modelgenerator/structure_tokenizer/utils/constants/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/modelgenerator/structure_tokenizer/utils/geometry/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/model/modules/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/modelgenerator/rna_inv_fold/gRNAde_structure_encoder/src/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/modelgenerator/structure_tokenizer/layers/equivariant/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.DNA/dependency_mapping/DNA.txt:
--------------------------------------------------------------------------------
1 | A
2 | T
3 | G
4 | C
5 |
--------------------------------------------------------------------------------
/experiments/AIDO.RNA/dependency_mapping/DNA.txt:
--------------------------------------------------------------------------------
1 | A
2 | T
3 | G
4 | C
5 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/openfold_local/data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/openfold_local/np/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/cli/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/modelgenerator/rna_inv_fold/gRNAde_structure_encoder/src/data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/openfold_local/model/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/openfold_local/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/cli/util/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/scripts/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/openfold_local/data/tools/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/openfold_local/utils/kernel/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/cli/completions/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/msa_retrieve/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/msa_retrieve/configs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/msa_retrieve/msar/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/.gitattributes:
--------------------------------------------------------------------------------
1 | *.ipynb filter=strip-notebook-output
2 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/msa_retrieve/msar/tools/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/msa_retrieve/msar/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AIDO.DNA/dependency_mapping/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib
2 | seaborn
3 | pandas
4 | logomaker
5 |
--------------------------------------------------------------------------------
/experiments/AIDO.RNA/dependency_mapping/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib
2 | seaborn
3 | pandas
4 | logomaker
5 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/model/layer_norm/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from .layer_norm import FusedLayerNorm
3 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructureTokenizer/protein2structoken_example_input.csv:
--------------------------------------------------------------------------------
1 | idx,aa_seq
2 | example,KEFWNLDKNLQLRLGIVFLG
3 |
--------------------------------------------------------------------------------
/modelgenerator/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | from modelgenerator.tasks.base import *
2 | from modelgenerator.tasks.tasks import *
3 |
--------------------------------------------------------------------------------
/docs/docs/assets/images/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/docs/docs/assets/images/icon.png
--------------------------------------------------------------------------------
/docs/docs/assets/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/docs/docs/assets/images/logo.png
--------------------------------------------------------------------------------
/experiments/AIDO.RNA/mean_ribosome_load_prediction/ft_schedules/two_step.yaml:
--------------------------------------------------------------------------------
1 | 0:
2 | - adapter.*
3 | 3:
4 | - backbone.*
5 |
--------------------------------------------------------------------------------
/docs/docs/assets/images/genbio_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/docs/docs/assets/images/genbio_logo.png
--------------------------------------------------------------------------------
/docs/docs/assets/images/genbio_header.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/docs/docs/assets/images/genbio_header.png
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/examples/T1106s1-D1.fasta:
--------------------------------------------------------------------------------
1 | >T1106s1-D1
2 | TAQSKRSLWDFASPGYTFHGLHRAQDYRRELDTLQSLLTTSQSSELQAAAALLKCQQDDDRLLQIILNLLH
3 |
--------------------------------------------------------------------------------
/modelgenerator/huggingface_models/borzoi_pytorch/__init__.py:
--------------------------------------------------------------------------------
1 | from .pytorch_borzoi_model import Borzoi, AnnotatedBorzoi
2 | # from .gene_utils import Transcriptome
3 |
--------------------------------------------------------------------------------
/docs/docs/assets/images/structure_tokenizer/cameo_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/docs/docs/assets/images/structure_tokenizer/cameo_0.png
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/assets/img/ana.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/experiments/AIDO.StructurePrediction/assets/img/ana.png
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/assets/img/hln.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/experiments/AIDO.StructurePrediction/assets/img/hln.png
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/assets/img/vis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/experiments/AIDO.StructurePrediction/assets/img/vis.png
--------------------------------------------------------------------------------
/docs/docs/assets/images/structure_tokenizer/select_files.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/docs/docs/assets/images/structure_tokenizer/select_files.png
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/__init__.py:
--------------------------------------------------------------------------------
1 | from .version import version as __version__
2 |
3 |
def get_version() -> str:
    """Return the package version.

    The value is the module-level ``__version__``, which this module
    imports from its sibling ``version`` module.
    """
    current = __version__
    return current
6 |
--------------------------------------------------------------------------------
/configs/examples/quick_dev_run.yaml:
--------------------------------------------------------------------------------
1 | trainer:
2 | fast_dev_run: true
3 | accelerator: auto
4 | devices: 1
5 | precision: 32
6 | detect_anomaly: true
7 | log_every_n_steps: 1
8 |
--------------------------------------------------------------------------------
/modelgenerator/huggingface_models/scfoundation/pretrainmodels/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 BioMap (Beijing) Intelligence Technology Limited
2 |
3 | from .select_model import select_model
4 |
--------------------------------------------------------------------------------
/docs/docs/assets/images/structure_tokenizer/launch_protein_viewer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/docs/docs/assets/images/structure_tokenizer/launch_protein_viewer.png
--------------------------------------------------------------------------------
/docs/docs/assets/images/structure_tokenizer/structure_tokenizer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/docs/docs/assets/images/structure_tokenizer/structure_tokenizer.png
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/examples/T1104-D1.fasta:
--------------------------------------------------------------------------------
1 | >T1104-D1
2 | QLEDSEVEAVAKGLEEMYANGVTEDNFKNYVKNNFAQQEISSVEEELNVNISDSCVANKIKDEFFAMISISAIVKAAQKKAWKELAVTVLRFAKANGLKTNAIIVAGQLALWAVQCG
3 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/examples/T1106s2-D1.fasta:
--------------------------------------------------------------------------------
1 | >T1106s2-D1
2 | NITLTKRQQEFLLLNGWLQLQCGHAERACILLDALLTLNPEHLAGRRCRLVALLNNNQGERAEKEAQWLISHDPLQAGNWLCLSRAQQLNGDLDKARHAYQHYLELKDHNE
3 |
--------------------------------------------------------------------------------
/modelgenerator/huggingface_models/geneformer/token_dictionary_gc95M.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/modelgenerator/huggingface_models/geneformer/token_dictionary_gc95M.pkl
--------------------------------------------------------------------------------
/modelgenerator/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from modelgenerator.utils.kwargs_doc import GoogleKwargsDocstringInheritanceInitMeta
2 |
3 | __all__ = [
4 | "GoogleKwargsDocstringInheritanceInitMeta",
5 | ]
6 |
--------------------------------------------------------------------------------
/configs/examples/save_predictions.yaml:
--------------------------------------------------------------------------------
1 | trainer:
2 | callbacks:
3 | - class_path: modelgenerator.callbacks.PredictionWriter
4 | dict_kwargs:
5 | output_dir: predictions
6 | filetype: pt
7 |
--------------------------------------------------------------------------------
/docs/docs/assets/images/structure_tokenizer/structure_prediction_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/docs/docs/assets/images/structure_tokenizer/structure_prediction_model.png
--------------------------------------------------------------------------------
/docs/docs/assets/images/structure_tokenizer/visualize_reconstruction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/docs/docs/assets/images/structure_tokenizer/visualize_reconstruction.png
--------------------------------------------------------------------------------
/modelgenerator/huggingface_models/geneformer/gene_name_id_dict_gc95M.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/modelgenerator/huggingface_models/geneformer/gene_name_id_dict_gc95M.pkl
--------------------------------------------------------------------------------
/modelgenerator/huggingface_models/enformer_pytorch/precomputed/tf_gammas.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/modelgenerator/huggingface_models/enformer_pytorch/precomputed/tf_gammas.pt
--------------------------------------------------------------------------------
/modelgenerator/huggingface_models/geneformer/ensembl_mapping_dict_gc95M.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/modelgenerator/huggingface_models/geneformer/ensembl_mapping_dict_gc95M.pkl
--------------------------------------------------------------------------------
/modelgenerator/huggingface_models/rnabert/vocab.txt:
--------------------------------------------------------------------------------
1 | [PAD]
2 | [MASK]
3 | [CLS]
4 | [SEP]
5 | [UNK]
6 | A
7 | G
8 | C
9 | T
10 | U
11 | N
12 | [BOS]
13 | [EOS]
14 | [UNUSED1]
15 | [UNUSED2]
16 | [UNUSED3]
--------------------------------------------------------------------------------
/experiments/AIDO.StructureTokenizer/decode_example_input.tsv:
--------------------------------------------------------------------------------
1 | uid sequences predictions
2 | example KEFWNLDKNLQLRLGIVFLG [355, 364, 490, 132, 81, 181, 176, 59, 19, 386, 176, 173, 199, 7, 35, 196, 113, 132, 284, 321]
3 |
--------------------------------------------------------------------------------
/modelgenerator/huggingface_models/geneformer/gene_median_dictionary_gc95M.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/modelgenerator/huggingface_models/geneformer/gene_median_dictionary_gc95M.pkl
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/assets/img/figure(gt-yellow vs af3-blue).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/experiments/AIDO.StructurePrediction/assets/img/figure(gt-yellow vs af3-blue).png
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/assets/img/figure(gt-yellow vs our-green).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genbio-ai/ModelGenerator/HEAD/experiments/AIDO.StructurePrediction/assets/img/figure(gt-yellow vs our-green).png
--------------------------------------------------------------------------------
/modelgenerator/structure_tokenizer/callbacks/__init__.py:
--------------------------------------------------------------------------------
1 | from .writer_pdb_callback import WriterPDBCallback
2 | from .struct_tokens_callback import StructTokensCallback
3 |
4 | __all__ = ["WriterPDBCallback", "StructTokensCallback"]
5 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein-RAG/xTrimo_RAG/configs/wandb.yaml:
--------------------------------------------------------------------------------
1 | trainer:
2 | logger:
3 | class_path: lightning.pytorch.loggers.WandbLogger
4 | init_args:
5 | name: test
6 | save_dir: logs
7 | project: xTrimo_Benchmark
8 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein-RAG/DMS_RAG/configs/wandb.yaml:
--------------------------------------------------------------------------------
1 | trainer:
2 | logger:
3 | class_path: lightning.pytorch.loggers.WandbLogger
4 | init_args:
5 | name: test
6 | save_dir: logs
7 | project: Protein_RAG_no_structure
8 |
--------------------------------------------------------------------------------
/configs/examples/wandb.yaml:
--------------------------------------------------------------------------------
1 | trainer:
2 | logger:
3 | class_path: lightning.pytorch.loggers.wandb.WandbLogger
4 | init_args:
5 | name: "my-experiment-name"
6 | save_dir: "logs"
7 | project: "my-project"
8 | save_code: true
9 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/msa_retrieve/msar/utils/general.py:
--------------------------------------------------------------------------------
1 |
2 | import hashlib
3 |
def seq_encoder(sequence, method="md5"):
    """Return the hex digest of ``sequence`` under the named hashlib algorithm.

    Args:
        sequence: Text to hash; it is UTF-8 encoded before hashing.
        method: Name of a hash constructor exposed by ``hashlib``
            (for example ``"md5"``, ``"sha1"`` or ``"sha256"``).

    Returns:
        The hexadecimal digest as a string.

    Raises:
        AttributeError: If ``hashlib`` has no attribute named ``method``.
    """
    # Resolve the constructor by attribute lookup instead of eval() so that a
    # crafted ``method`` string can never be executed as Python code.
    hasher = getattr(hashlib, method)
    return hasher(sequence.encode(encoding="utf-8")).hexdigest()
7 |
--------------------------------------------------------------------------------
/modelgenerator/structure_tokenizer/configs/__init__.py:
--------------------------------------------------------------------------------
1 | from .data_configs import (
2 | ProteinDatasetConfig,
3 | StructTokensDatasetConfig,
4 | ProteinDataConfig,
5 | )
6 |
7 |
8 | __all__ = ["ProteinDatasetConfig", "StructTokensDatasetConfig", "ProteinDataConfig"]
9 |
--------------------------------------------------------------------------------
/modelgenerator/adapters/__init__.py:
--------------------------------------------------------------------------------
1 | from modelgenerator.adapters.base import *
2 | from modelgenerator.adapters.adapters import *
3 | from modelgenerator.adapters.fusion import (
4 | MMFusionTokenAdapter as MMFusionTokenAdapter,
5 | MMFusionSeqAdapter as MMFusionSeqAdapter,
6 | )
7 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/msa_retrieve/msar/utils/errors.py:
--------------------------------------------------------------------------------
1 |
2 |
class Error(Exception):
    """Root of this module's exception hierarchy."""


class MultipleChainsError(Error):
    """Raised when more than one chain is found for a given ID."""
9 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/examples/T1109-D1.fasta:
--------------------------------------------------------------------------------
1 | >T1109-D1
2 | PRPPFHITIPIYPGVDLLDVAAPVELFSWMADAWKARATTITLAAEHLTPLKTRDGLTLTPQRQFADYADAAAPQPQTHLLWVPGGAPDVLRKLMRGGPYLDFLKAQSAGADHVSSVCEGALLLAAAGLLDGYRATTHWAFIPCLQQFPAIKVAEGFPRYVIDGNRITGGGISSGLAEALAIVARVAGQDIAKHVQMITQYFPDPPFEQTIVPATHCPLQ
3 |
--------------------------------------------------------------------------------
/modelgenerator/structure_tokenizer/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from .protein_lightning_datamodule import ProteinLightningDataModule
2 | from .struct_tokens_lightning_datamodule import StructTokensLightningDataModule
3 |
4 |
5 | __all__ = [
6 | "ProteinLightningDataModule",
7 | "StructTokensLightningDataModule",
8 | ]
9 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/openfold_local/utils/kernel/csrc/compat.h:
--------------------------------------------------------------------------------
1 | // modified from https://github.com/NVIDIA/apex/blob/master/csrc/compat.h
2 |
3 | #ifndef TORCH_CHECK
4 | #define TORCH_CHECK AT_CHECK
5 | #endif
6 |
7 | #ifdef VERSION_GE_1_3
8 | #define DATA_PTR data_ptr
9 | #else
10 | #define DATA_PTR data
11 | #endif
12 |
--------------------------------------------------------------------------------
/experiments/AIDO.RNA/dependency_mapping/depmap.csv:
--------------------------------------------------------------------------------
1 | id,sequence
2 | >NR_002092.1|RNAseP|Drosophila,AGTCAGTTGCAAACTAGCATCTGGGGCCCACACAACGAGTATCTGATTACTCCACAAACCATTGCCCCGGGAAGGTCTGAGAATCGGCCGAGCCAGCTGTTTGTTGCGGCTTCATTTCCCAGCAGGAAACCTGTGTGATTGCAGGGCGAAAGTACCAGAAATCCTGCTACCAGGTGTTGCCGTTGCCCCCGGTGACCGCCGCCTGGTTGGCATTGAAACCTTTCGTGGCCAGCGTTTTTAGTGCGATGTGCTTGCTGCCTCTAAGGCAGAACTCAATTCAGACTAATCTGTGACTGACT
3 |
--------------------------------------------------------------------------------
/modelgenerator/structure_tokenizer/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .equiformer_encoder import EquiformerEncoderLightning
2 | from .esmfold_decoder import ESMFoldDecoderLightning
3 | from .structure_tokenizer_lightning import StructureTokenizerLightning
4 |
5 |
6 | __all__ = [
7 | "StructureTokenizerLightning",
8 | "EquiformerEncoderLightning",
9 | "ESMFoldDecoderLightning",
10 | ]
11 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/cli/base.py:
--------------------------------------------------------------------------------
1 | import click
2 | from genbio.aidosp.cli.completions.commands import completion
3 | from genbio.aidosp.cli.util.commands import util
4 |
5 |
# Root click command group; it has no behaviour of its own and only hosts the
# subcommands registered below.
@click.group(help="GenBio AIDO Structure Prediction CLI")
def cli(): ...


# Attach the imported `completion` and `util` commands to the root group.
cli.add_command(completion)
cli.add_command(util)
# Invoke the group when this module is executed directly as a script.
if __name__ == "__main__":
    cli()
14 |
--------------------------------------------------------------------------------
/modelgenerator/huggingface_models/fm4bio/vocab_protein.txt:
--------------------------------------------------------------------------------
1 | [PAD]
2 | L
3 | A
4 | G
5 | V
6 | S
7 | E
8 | R
9 | T
10 | I
11 | D
12 | P
13 | K
14 | Q
15 | N
16 | F
17 | Y
18 | M
19 | H
20 | W
21 | C
22 | X
23 | B
24 | U
25 | Z
26 | O
27 | .
28 | -
29 | [MASK]
30 | [gMASK]
31 | [sMASK]
32 | [eod]
33 | [sop]
34 | [eop]
35 | [SEP]
36 | [HC]
37 | [LC]
38 | [HUMAN]
39 |
40 |
41 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | # Making Documentation
2 |
3 | We use mkdocs for markdown rendering with mkdocstrings for autodocumentation.
4 |
5 | ## Installation
6 |
7 | To build the website
8 | 1. `pip install -r requirements.txt`
9 | 2. `mkdocs serve`
10 |
11 | ## Useful Links
12 |
13 | [mkdocs for markdown rendering](https://www.mkdocs.org/user-guide/)
14 | [mkdocstrings for automatic documentation](https://mkdocstrings.github.io/usage/)
15 |
--------------------------------------------------------------------------------
/modelgenerator/structure_tokenizer/utils/types.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from dataclasses import dataclass
3 |
4 | import torch
5 |
# A shape given either as a single int or as a torch.Size.
ShapeLike = int | torch.Size
# A filesystem path given either as a string or as a pathlib.Path.
PathLike = str | Path


# for what is irreps, see https://docs.e3nn.org/en/stable/api/o3/o3_irreps.html
# here you can just think of it as a tuple of scalars and vectors
@dataclass
class IrrepShape:
    # NOTE(review): presumably `s` is the shape of the scalar part and `v` the
    # shape of the vector part, per the comment above — confirm at use sites.
    s: ShapeLike
    v: ShapeLike
16 |
--------------------------------------------------------------------------------
/configs/examples/lora_backbone.yaml:
--------------------------------------------------------------------------------
1 | model:
2 | class_path: modelgenerator.tasks.SequenceClassification
3 | init_args:
4 | n_classes: 2
5 | optimizer:
6 | class_path: torch.optim.AdamW
7 | init_args:
8 | lr: 0.001
9 | weight_decay: 0.01
10 | backbone:
11 | class_path: modelgenerator.backbones.aido_dna_dummy
12 | init_args:
13 | use_peft: True
14 | lora_r: 16
15 | lora_alpha: 32
16 | lora_dropout: 0.1
17 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein-RAG/DMS_RAG/configs/prediction_writer.yaml:
--------------------------------------------------------------------------------
1 | # lightning.pytorch==2.4.0
2 | seed_everything: 42
3 | trainer:
4 | callbacks:
5 | - class_path: modelgenerator.callbacks.PredictionWriter
6 | dict_kwargs:
7 | output_dir: DMS_output/
8 | filetype: tsv
9 | write_cols: ['uid', 'sequences', 'predictions', 'labels']
10 | remove_duplicates: true
11 | delete_intermediate_files: true
12 | data:
13 | init_args:
14 | generate_uid: true
15 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein-RAG/xTrimo_RAG/configs/prediction_writer.yaml:
--------------------------------------------------------------------------------
1 | # lightning.pytorch==2.4.0
2 | seed_everything: 42
3 | trainer:
4 | callbacks:
5 | - class_path: modelgenerator.callbacks.PredictionWriter
6 | dict_kwargs:
7 | output_dir: xTrimo_output/
8 | filetype: tsv
9 | write_cols: ['uid', 'sequences', 'predictions', 'labels']
10 | remove_duplicates: true
11 | delete_intermediate_files: true
12 | data:
13 | init_args:
14 | generate_uid: true
15 |
--------------------------------------------------------------------------------
/modelgenerator/structure_tokenizer/utils/distributed.py:
--------------------------------------------------------------------------------
1 | import torch.distributed as dist
2 |
3 |
def get_world_size():
    """Return the distributed world size, or 1 outside a distributed run.

    Falls back to 1 when ``torch.distributed`` is unavailable or when no
    process group has been initialized.
    """
    distributed_ready = dist.is_available() and dist.is_initialized()
    return dist.get_world_size() if distributed_ready else 1
10 |
11 |
def all_reduce(tensor, op=dist.ReduceOp.SUM):
    """Reduce ``tensor`` across processes and return it.

    When the world size is 1 the tensor is returned untouched and no
    collective call is made. Otherwise ``dist.all_reduce`` is invoked on the
    tensor with the given ``op`` and the same tensor object is returned.
    """
    if get_world_size() != 1:
        dist.all_reduce(tensor, op=op)
    return tensor
18 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein-RAG/DMS_RAG/init_env.sh:
--------------------------------------------------------------------------------
1 | SCRIPT_PATH=`dirname "$(realpath ${BASH_SOURCE[0]})"`
2 |
3 | source ~/.bash_profile
4 | # echo "source ~/.bash_profile"
5 | eval init_conda_env
6 | conda activate python3.11
7 | lib_nvjitlink
8 |
9 | # which torchrun
10 | MG_PATH=$(realpath ${SCRIPT_PATH}/../../..)
11 | export PYTHONPATH=${MG_PATH}:${PYTHONPATH}
12 | export OMP_NUM_THREADS=1
13 | # export HF_DATASETS_OFFLINE=1
14 | # export PL_GLOBAL_SEED=0
15 | # export TF_ENABLE_ONEDNN_OPTS=0
16 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein-RAG/xTrimo_RAG/init_env.sh:
--------------------------------------------------------------------------------
1 | SCRIPT_PATH=`dirname "$(realpath ${BASH_SOURCE[0]})"`
2 |
3 | source ~/.bash_profile
4 | # echo "source ~/.bash_profile"
5 | eval init_conda_env
6 | conda activate python3.11
7 | lib_nvjitlink
8 |
9 | # which torchrun
10 | MG_PATH=$(realpath ${SCRIPT_PATH}/../../..)
11 | export PYTHONPATH=${MG_PATH}:${PYTHONPATH}
12 | export OMP_NUM_THREADS=1
13 | # export HF_DATASETS_OFFLINE=1
14 | # export PL_GLOBAL_SEED=0
15 | # export TF_ENABLE_ONEDNN_OPTS=0
16 |
--------------------------------------------------------------------------------
/scripts/wandb_sweep/slurm_sweep.yaml:
--------------------------------------------------------------------------------
1 | program: mgen
2 | project: autotune-test # CHANGE_ME
3 | method: grid
4 | metric:
5 | goal: minimize
6 | name: val_loss
7 | parameters:
8 | model.optimizer.lr:
9 | values: [0.01, 0.001, 0.0001]
10 | command:
11 | - echo # DO NOT CHANGE
12 | - ${program}
13 | - fit
14 | - --config
15 | - .local/test.yaml # CHANGE_ME
16 | # more mgen arguments here e.g.,
17 | # - --trainer.devices
18 | # - 2
19 | - ${args} # parameters set by wandb agents
20 |
--------------------------------------------------------------------------------
/modelgenerator/huggingface_models/borzoi_pytorch/pytorch_borzoi_helpers.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
def predict_tracks(models, sequence_one_hot, slices):
    """Run every model on one sequence and stack the selected outputs.

    Args:
        models: Iterable of callables; each is called with
            ``sequence_one_hot`` after a leading axis has been added and must
            return a tensor.
        sequence_one_hot: Input tensor for a single sequence (no leading
            batch axis; one is added here).
        slices: Index applied to axis 2 of each model output after a model
            axis has been inserted at position 1.

    Returns:
        A NumPy array with the per-model outputs concatenated along axis 1
        and the last two axes (2 and 3) swapped.

    Raises:
        ValueError: From ``np.concatenate`` when ``models`` is empty.
    """
    predicted_tracks = []
    # Inference only: a single no_grad context covers every model call.
    with torch.no_grad():
        for model in models:
            # Add the batch axis on the way in and the model axis on the way out.
            yh = model(sequence_one_hot[None, ...])[:, None, ...].numpy(force=True)
            predicted_tracks.append(yh[:, :, slices])

    return np.concatenate(predicted_tracks, axis=1).swapaxes(3, 2)
--------------------------------------------------------------------------------
/modelgenerator/structure_tokenizer/utils/constants/structure_tokenizer.py:
--------------------------------------------------------------------------------
1 | from modelgenerator.structure_tokenizer.utils.constants import residue_constants
2 |
# NOTE(review): presumably a scale factor applied to coordinates; the
# direction and units are not visible here — confirm at use sites.
SCALE_POSITIONS: float = 10
QUANTIZE_IDX_MASK: int = (
    -100
)  # -100 is a special value that will be ignored in the loss function

# Bin counts. NOTE(review): presumably for the distogram and lDDT heads —
# confirm at use sites.
DISTOGRAM_BINS: int = 64
LDDT_BINS: int = 50

# Tokens to predict residues
# The vocabulary has restype_num + 2 entries: index 0 is padding, index
# restype_num is the unknown token and index restype_num + 1 is the mask.
N_TOKENS = residue_constants.restype_num + 2
PAD_IDX = 0
UNK_IDX = N_TOKENS - 2
MASK_IDX = N_TOKENS - 1
17 |
--------------------------------------------------------------------------------
/modelgenerator/huggingface_models/enformer_pytorch/__init__.py:
--------------------------------------------------------------------------------
1 | # from enformer_pytorch.config_enformer import EnformerConfig
2 | # from enformer_pytorch.modeling_enformer import Enformer, from_pretrained, SEQUENCE_LENGTH, AttentionPool
3 | # from enformer_pytorch.data import seq_indices_to_one_hot, str_to_one_hot, GenomeIntervalDataset, FastaInterval
4 | from .config_enformer import EnformerConfig
5 | from .modeling_enformer import Enformer, from_pretrained, SEQUENCE_LENGTH, AttentionPool
6 | from .data import seq_indices_to_one_hot, str_to_one_hot, GenomeIntervalDataset, FastaInterval
--------------------------------------------------------------------------------
/experiments/AIDO.DNA/dependency_mapping/config.yaml:
--------------------------------------------------------------------------------
1 | # Caleb Ellington
2 | model:
3 | class_path: Inference
4 | init_args:
5 | backbone: aido_dna_7b
6 | data:
7 | class_path: DependencyMappingDataModule
8 | init_args:
9 | path: experiments/AIDO.DNA/dependency_mapping/
10 | test_split_files:
11 | - depmap.csv
12 | vocab_file: experiments/AIDO.DNA/dependency_mapping/DNA.txt
13 | batch_size: 32
14 | trainer:
15 | callbacks:
16 | - class_path: modelgenerator.callbacks.PredictionWriter
17 | dict_kwargs:
18 | output_dir: depmap_predictions
19 | filetype: pt
20 |
--------------------------------------------------------------------------------
/experiments/AIDO.RNA/dependency_mapping/config.yaml:
--------------------------------------------------------------------------------
1 | # Caleb Ellington
2 | model:
3 | class_path: Inference
4 | init_args:
5 | backbone: aido_rna_1b600m
6 | data:
7 | class_path: DependencyMappingDataModule
8 | init_args:
9 | path: experiments/AIDO.RNA/dependency_mapping/
10 | test_split_files:
11 | - depmap.csv
12 | vocab_file: experiments/AIDO.RNA/dependency_mapping/DNA.txt
13 | batch_size: 32
14 | trainer:
15 | callbacks:
16 | - class_path: modelgenerator.callbacks.PredictionWriter
17 | dict_kwargs:
18 | output_dir: depmap_predictions
19 | filetype: pt
20 |
--------------------------------------------------------------------------------
/modelgenerator/huggingface_models/geneformer/__init__.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from pathlib import Path
3 |
4 | warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*")
5 |
6 | GENE_MEDIAN_FILE = Path(__file__).parent / "gene_median_dictionary_gc95M.pkl"
7 | TOKEN_DICTIONARY_FILE = Path(__file__).parent / "token_dictionary_gc95M.pkl"
8 | ENSEMBL_DICTIONARY_FILE = Path(__file__).parent / "gene_name_id_dict_gc95M.pkl"
9 | ENSEMBL_MAPPING_FILE = Path(__file__).parent / "ensembl_mapping_dict_gc95M.pkl"
10 |
11 | from . import (
12 | tokenizer,
13 | )
14 |
15 | from .tokenizer import TranscriptomeTokenizer
16 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein/DMS/train_indels_LP.sh:
--------------------------------------------------------------------------------
1 | # Ning Sun
2 | TASK_NAME='FECA_ECOLI_Tsuboyama_2023_2D1U_indels'
3 | MUTATION_TYPE='indels'
4 | for FOLD in {0..4}
5 | do
6 | RUN_NAME=${TASK_NAME}_fold${FOLD}
7 | srun mgen fit --config experiments/AIDO.Protein/DMS/configs/indels_LP_DDP.yaml \
8 | --data.train_split_files "[\"${MUTATION_TYPE}/${TASK_NAME}.tsv\"]" \
9 | --trainer.logger.name ${RUN_NAME} \
10 | --trainer.logger.id ${RUN_NAME} \
11 | --data.cv_test_fold_id ${FOLD} \
12 | --trainer.num_nodes 1 \
13 | --trainer.devices 1 \
14 | --data.batch_size 8 \
15 | &> output_logs/protein/${RUN_NAME}.log
16 | done
17 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein/DMS/train_indels_LoRA_DDP.sh:
--------------------------------------------------------------------------------
1 | # Ning Sun
2 | TASK_NAME='B1LPA6_ECOSM_Russ_2020_indels'
3 | MUTATION_TYPE='indels'
4 | for FOLD in {0..4}
5 | do
6 | RUN_NAME=${TASK_NAME}_fold${FOLD}
7 | srun mgen fit --config experiments/AIDO.Protein/DMS/configs/indels_LoRA_DDP.yaml \
8 | --data.train_split_files "[\"${MUTATION_TYPE}/${TASK_NAME}.tsv\"]" \
9 | --trainer.logger.name ${RUN_NAME} \
10 | --trainer.logger.id ${RUN_NAME} \
11 | --data.cv_test_fold_id ${FOLD} \
12 | --trainer.num_nodes 2 \
13 | --data.batch_size 1 \
14 | --trainer.callbacks.patience 5 \
15 | &> output_logs/protein/${RUN_NAME}.log
16 | done
17 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein/DMS/train_sub_LoRA_DDP.sh:
--------------------------------------------------------------------------------
1 | # Ning Sun
2 | TASK_NAME='A4GRB6_PSEAI_Chen_2020'
3 | MUTATION_TYPE='singles_substitutions'
4 | for FOLD in {0..4}
5 | do
6 | RUN_NAME=${TASK_NAME}_fold${FOLD}
7 | srun mgen fit --config experiments/AIDO.Protein/DMS/configs/substitution_LoRA_DDP.yaml \
8 | --data.train_split_files "[\"${MUTATION_TYPE}/${TASK_NAME}.tsv\"]" \
9 | --trainer.logger.name ${RUN_NAME} \
10 | --trainer.logger.id ${RUN_NAME} \
11 | --data.cv_test_fold_id ${FOLD} \
12 | --trainer.num_nodes 4 \
13 | --data.batch_size 2 \
14 | --trainer.callbacks.patience 5 \
15 | &> output_logs/protein/${RUN_NAME}.log
16 | done
17 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein/DMS/train_sub_LoRA_FSDP.sh:
--------------------------------------------------------------------------------
1 | # Ning Sun
2 | TASK_NAME='CP2C9_HUMAN_Amorosi_2021_abundance'
3 | MUTATION_TYPE='singles_substitutions'
4 | for FOLD in {0..4}
5 | do
6 | RUN_NAME=${TASK_NAME}_fold${FOLD}
7 | srun mgen fit --config experiments/AIDO.Protein/DMS/configs/substitution_LoRA_FSDP.yaml \
8 | --data.train_split_files "[\"${MUTATION_TYPE}/${TASK_NAME}.tsv\"]" \
9 | --trainer.logger.name ${RUN_NAME} \
10 | --trainer.logger.id ${RUN_NAME} \
11 | --data.cv_test_fold_id ${FOLD} \
12 | --trainer.num_nodes 4 \
13 | --data.batch_size 2 \
14 | --trainer.callbacks.patience 1 \
15 | &> output_logs/protein/${RUN_NAME}.log
16 | done
17 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/msa_retrieve/msar/utils/logger.py:
--------------------------------------------------------------------------------
1 |
2 | # -*-coding:utf-8-*-
3 | import logging.config
4 |
5 |
def singleton(cls):
    """Class decorator that replaces ``cls`` with a single instance of it.

    The class is instantiated immediately (with no arguments) at decoration
    time, and that instance, not a factory, is what the decorated name is
    bound to.
    """
    _cache = {}

    def get_instance():
        # Build the instance on first use, then keep handing back the same one.
        if cls in _cache:
            return _cache[cls]
        created = cls()
        _cache[cls] = created
        return created

    return get_instance()
15 |
16 |
@singleton
class Logger:
    """Holder for a logger configured with this module's log format.

    ``singleton`` in this file returns an instance rather than the class, so
    the module-level name ``Logger`` is bound to an already-constructed object.
    """

    def __init__(self):
        # Configure logging at INFO level with timestamp, level, process id
        # and source location in every record.
        logging.basicConfig(
            format="%(asctime)s %(levelname)s %(process)d [%(filename)s:%(lineno)d] %(message)s",
            level=logging.INFO,
        )
        # The logger obtained under the name "root" is exposed as `.logger`.
        self.logger = logging.getLogger("root")
25 |
--------------------------------------------------------------------------------
/experiments/AIDO.DNA/zeroshot_variant_effect_prediction/Clinvar_300M_zeroshot_Diff.yaml:
--------------------------------------------------------------------------------
1 | trainer:
2 | accelerator: auto
3 | devices: auto
4 | logger: false
5 | callbacks:
6 | - class_path: modelgenerator.callbacks.PredictionWriter
7 | dict_kwargs:
8 | output_dir: output_logs
9 | filetype: tsv
10 | write_cols: ['score','label']
11 | model:
12 | class_path: modelgenerator.tasks.ZeroshotPredictionDiff
13 | init_args:
14 | backbone:
15 | class_path: modelgenerator.backbones.aido_dna_300m
16 | init_args:
17 | frozen: true
18 | data:
19 | class_path: modelgenerator.data.ClinvarRetrieve
20 | init_args:
21 | method: Diff
22 | window: 512
23 | batch_size: 5
24 |
--------------------------------------------------------------------------------
/modelgenerator/adapters/base.py:
--------------------------------------------------------------------------------
class SequenceAdapter:
    """Marker base class for type hints, e.g. ``Callable[[int, int], SequenceAdapter]``.

    Defines no behaviour of its own.
    """
5 |
6 |
class TokenAdapter:
    """Marker base class for type hints, e.g. ``Callable[[int, int], TokenAdapter]``.

    Defines no behaviour of its own.
    """
11 |
12 |
class ConditionalGenerationAdapter:
    """Marker base class for type hints.

    Used for ``Callable[[int, int, int, nn.Module], ConditionalGenerationAdapter]``
    types. Defines no behaviour of its own.
    """
17 |
18 |
class FusionAdapter:
    """Marker base class for type hints, e.g. ``Callable[[int, int, int, int], FusionAdapter]``.

    Defines no behaviour of its own.
    """
23 |
--------------------------------------------------------------------------------
/experiments/AIDO.Cell/README.md:
--------------------------------------------------------------------------------
1 | # AIDO.Cell
2 |
3 | AIDO.Cell-100M is GenBio AI’s SOTA cellular foundation model trained on 50 million cells over a diverse set of human tissues and organs. The AIDO.Cell models are capable of handling the entire human transcriptome as input, thus learning accurate and general representations of the human cell's entire transcriptional context. AIDO.Cell achieves state-of-the-art results in tasks such as zero-shot clustering, cell-type classification, and perturbation modeling.
4 |
5 | ## Resources
6 | - [Quick Start](./quickstart.ipynb)
7 | - [Cell Classification Tutorial](./tutorial_cell_classification.ipynb)
8 | - [AIDO.Cell HuggingFace Collection](https://huggingface.co/collections/genbio-ai/aidocell-6750f409bb20d8cd2cf14a25)
9 |
--------------------------------------------------------------------------------
/experiments/AIDO.Cell/extract_features.py:
--------------------------------------------------------------------------------
1 | import anndata as ad
2 | import numpy as np
3 | import torch
4 | import sys
5 | from modelgenerator.tasks import Embed
6 |
# Example script: embed the first `batch_size` rows of an .h5ad file with the
# aido_cell_3m backbone and print the result.
device = 'cuda'
batch_size = 4

# Build the Embed task from a config dict and switch it to eval mode.
model = Embed.from_config({
    "model.backbone": "aido_cell_3m",
    "model.batch_size": batch_size
}).eval()
# Move the model to the target device and cast it to half precision.
model = model.to(device).to(torch.float16)

# The path is relative, so it depends on the current working directory.
adata = ad.read_h5ad('../../modelgenerator/cell-downstream-tasks/zheng/zheng_train.h5ad')

# NOTE(review): `.toarray()` assumes `adata.X` is a sparse matrix — confirm for
# other input files.
batch_np = adata[:batch_size].X.toarray()
# Match the model's dtype and device before calling it.
batch_tensor = torch.from_numpy(batch_np).to(torch.float16).to(device)
batch_transformed = model.transform({'sequences': batch_tensor})
embs = model(batch_transformed)

print(embs)
24 |
--------------------------------------------------------------------------------
/experiments/AIDO.Cell/readme.md:
--------------------------------------------------------------------------------
1 | # AIDO.Cell
2 |
3 | AIDO.Cell-100M is GenBio AI’s SOTA cellular foundation model trained on 50 million cells over a diverse set of human tissues and organs. The AIDO.Cell models are capable of handling the entire human transcriptome as input, thus learning accurate and general representations of the human cell's entire transcriptional context. AIDO.Cell achieves state-of-the-art results in tasks such as zero-shot clustering, cell-type classification, and perturbation modeling.
4 |
5 | ## Resources
6 | - [Quick Start](./quickstart.ipynb)
7 | - [Cell Classification Tutorial](./tutorial_cell_classification.ipynb)
8 | - [AIDO.Cell HuggingFace Collection](https://huggingface.co/collections/genbio-ai/aidocell-6750f409bb20d8cd2cf14a25)
9 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/cli/completions/genbio-aidosp.fish:
--------------------------------------------------------------------------------
1 | function _genbio_aidosp_completion;
2 | set -l response (env _GENBIO_AIDOSP_COMPLETE=fish_complete COMP_WORDS=(commandline -cp) COMP_CWORD=(commandline -t) genbio-aidosp);
3 |
4 | for completion in $response;
5 | set -l metadata (string split "," $completion);
6 |
7 | if test $metadata[1] = "dir";
8 | __fish_complete_directories $metadata[2];
9 | else if test $metadata[1] = "file";
10 | __fish_complete_path $metadata[2];
11 | else if test $metadata[1] = "plain";
12 | echo $metadata[2];
13 | end;
14 | end;
15 | end;
16 |
17 | complete --no-files --command genbio-aidosp --arguments "(_genbio_aidosp_completion)";
18 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein/protein_inverse_folding/merge_ckpt.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
def reassemble_chunks(output_file, chunks_dir):
    """Concatenate every chunk file in ``chunks_dir`` into ``output_file``.

    Chunks are appended in lexicographic order of their file names.

    Args:
        output_file: Path of the merged file to create (overwritten if it
            already exists).
        chunks_dir: Directory containing the chunk files.

    Raises:
        OSError: If ``chunks_dir`` cannot be listed or a file cannot be
            opened, read or written.
    """
    import shutil

    # Collect the chunk list BEFORE creating the output so that an output path
    # inside ``chunks_dir`` is never treated as one of its own chunks.
    output_real = os.path.realpath(output_file)
    chunk_paths = []
    for chunk_file in sorted(os.listdir(chunks_dir)):
        chunk_path = os.path.join(chunks_dir, chunk_file)
        # Skip sub-directories and the output file itself.
        if not os.path.isfile(chunk_path):
            continue
        if os.path.realpath(chunk_path) == output_real:
            continue
        chunk_paths.append(chunk_path)

    with open(output_file, "wb") as f_out:
        for chunk_path in chunk_paths:
            with open(chunk_path, "rb") as f_in:
                # Stream in blocks so large chunks are never fully in memory.
                shutil.copyfileobj(f_in, f_out)
    print(f"Reassembled model saved to {output_file}")
11 |
12 |
# Both positional arguments are required: the chunk directory, then the output.
if len(sys.argv) < 3:
    print("Usage: python merge_ckpt.py <chunks_dir> <output_file>")
    sys.exit(1)

# Parameters
chunks_dir = sys.argv[1]  # Directory containing downloaded chunks
output_file = sys.argv[2]  # Path the merged file is written to

reassemble_chunks(output_file, chunks_dir)
22 |
--------------------------------------------------------------------------------
/experiments/AIDO.DNA/zeroshot_variant_effect_prediction/Clinvar_7B_zeroshot_Distance.yaml:
--------------------------------------------------------------------------------
1 | trainer:
2 | accelerator: auto
3 | devices: auto
4 | logger: false
5 | callbacks:
6 | - class_path: modelgenerator.callbacks.PredictionWriter
7 | dict_kwargs:
8 | output_dir: output_logs
9 | filetype: tsv
10 | write_cols: ['score','norm_type','labels','num_layer']
11 | model:
12 | class_path: modelgenerator.tasks.ZeroshotPredictionDistance
13 | init_args:
14 | backbone:
15 | class_path: modelgenerator.backbones.aido_dna_7b
16 | init_args:
17 | frozen: true
18 | all_hidden_states: True
19 | shared_ref: False
20 | data:
21 | class_path: modelgenerator.data.ClinvarRetrieve
22 | init_args:
23 | method: Distance
24 | window: 512
25 | batch_size: 5
26 |
--------------------------------------------------------------------------------
/experiments/AIDO.DNA/zeroshot_variant_effect_prediction/Clinvar_300M_zeroshot_Distance.yaml:
--------------------------------------------------------------------------------
1 | trainer:
2 | accelerator: auto
3 | devices: auto
4 | logger: false
5 | callbacks:
6 | - class_path: modelgenerator.callbacks.PredictionWriter
7 | dict_kwargs:
8 | output_dir: output_logs
9 | filetype: tsv
10 | write_cols: ['score','norm_type','labels','num_layer']
11 | model:
12 | class_path: modelgenerator.tasks.ZeroshotPredictionDistance
13 | init_args:
14 | backbone:
15 | class_path: modelgenerator.backbones.aido_dna_300m
16 | init_args:
17 | frozen: true
18 | all_hidden_states: True
19 | shared_ref: False
20 | data:
21 | class_path: modelgenerator.data.ClinvarRetrieve
22 | init_args:
23 | method: Distance
24 | window: 512
25 | batch_size: 5
26 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein/xTrimo/ssp_q3.sh:
--------------------------------------------------------------------------------
1 | # Shuxian Zou
2 | MODE=train
3 |
4 | RUN_NAME=ssp_AIDO.Protein_16B
5 | PROJECT=xtrimo_benchmark
6 | CONFIG_FILE=experiments/AIDO.Protein/xTrimo/configs/ssp_q3.yaml
7 | CKPT_SAVE_DIR=logs/${PROJECT}/${RUN_NAME}
8 |
9 | if [ $MODE == "train" ]; then
10 | # using slurm script with 2 nodes (8 gpus in total) for training
11 | srun mgen fit --config $CONFIG_FILE \
12 | --trainer.logger.name $RUN_NAME \
13 | --trainer.logger.project $PROJECT \
14 | --trainer.callbacks.dirpath $CKPT_SAVE_DIR \
15 | --trainer.num_nodes 2
16 | else
17 | CKPT_PATH=${CKPT_SAVE_DIR}/best_val*
18 | mgen test --config $CONFIG_FILE \
19 | --data.batch_size 4 \
20 | --model.strict_loading False \
21 | --model.reset_optimizer_states True \
22 | --trainer.logger null \
23 | --ckpt_path $CKPT_PATH
24 | fi
25 |
--------------------------------------------------------------------------------
/experiments/AIDO.RNA/modification_site_prediction.sh:
--------------------------------------------------------------------------------
1 | # Shuxian Zou
2 | MODE=train
3 |
4 | RUN_NAME=msp_aido_rna_1b600m
5 | PROJECT=rna_tasks
6 | CKPT_SAVE_DIR=logs/${PROJECT}/${RUN_NAME}
7 | CONFIG_FILE=experiments/AIDO.RNA/configs/modification_site_prediction.yaml
8 |
9 | if [ $MODE == "train" ]; then
10 | # using slurm script with 4 nodes (16 gpus in total) for training
11 | srun mgen fit --config $CONFIG_FILE \
12 | --trainer.logger.name $RUN_NAME \
13 | --trainer.logger.project $PROJECT \
14 | --trainer.callbacks.dirpath $CKPT_SAVE_DIR \
15 | --trainer.num_nodes 4
16 | else
17 | CKPT_PATH=${CKPT_SAVE_DIR}/best_val*
18 | mgen test --config $CONFIG_FILE \
19 | --data.batch_size 16 \
20 | --model.strict_loading False \
21 | --model.reset_optimizer_states True \
22 | --trainer.logger null \
23 | --ckpt_path $CKPT_PATH
24 | fi
25 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein/xTrimo/fold_prediction.sh:
--------------------------------------------------------------------------------
1 | # Shuxian Zou
2 | MODE=train
3 |
4 | RUN_NAME=fold_AIDO.Protein_16B
5 | PROJECT=xtrimo_benchmark
6 | CKPT_SAVE_DIR=logs/${PROJECT}/${RUN_NAME}
7 | CONFIG_FILE=experiments/AIDO.Protein/xTrimo/configs/fold_prediction.yaml
8 |
9 | if [ $MODE == "train" ]; then
10 | # using slurm script with 4 nodes (16 gpus in total) for training
11 | srun mgen fit --config $CONFIG_FILE \
12 | --trainer.logger.name $RUN_NAME \
13 | --trainer.logger.project $PROJECT \
14 | --trainer.callbacks.dirpath $CKPT_SAVE_DIR \
15 | --trainer.num_nodes 4
16 | else
17 | CKPT_PATH=${CKPT_SAVE_DIR}/best_val*
18 | mgen test --config $CONFIG_FILE \
19 | --data.batch_size 16 \
20 | --model.strict_loading False \
21 | --model.reset_optimizer_states True \
22 | --trainer.logger null \
23 | --ckpt_path $CKPT_PATH
24 | fi
25 |
--------------------------------------------------------------------------------
/modelgenerator/huggingface_models/scimilarity/model_v1.1/layer_sizes.json:
--------------------------------------------------------------------------------
1 | {"network.0.1.weight": [1024, 28231], "network.0.1.bias": [1024], "network.0.2.weight": [1024], "network.0.2.bias": [1024], "network.0.2.running_mean": [1024], "network.0.2.running_var": [1024], "network.0.2.num_batches_tracked": [], "network.0.3.weight": [1], "network.1.1.weight": [1024, 1024], "network.1.1.bias": [1024], "network.1.2.weight": [1024], "network.1.2.bias": [1024], "network.1.2.running_mean": [1024], "network.1.2.running_var": [1024], "network.1.2.num_batches_tracked": [], "network.1.3.weight": [1], "network.2.1.weight": [1024, 1024], "network.2.1.bias": [1024], "network.2.2.weight": [1024], "network.2.2.bias": [1024], "network.2.2.running_mean": [1024], "network.2.2.running_var": [1024], "network.2.2.num_batches_tracked": [], "network.2.3.weight": [1], "network.3.weight": [128, 1024], "network.3.bias": [128]}
--------------------------------------------------------------------------------
/experiments/AIDO.Protein/xTrimo/tcr_pmhc_affinity.sh:
--------------------------------------------------------------------------------
# Shuxian Zou
# Runs `mgen fit` under `srun` when MODE is "train"; for any other MODE value
# runs `mgen test` on the checkpoint matching best_val* in CKPT_SAVE_DIR.
MODE=train

RUN_NAME=tcr_pmhc_affinity_AIDO.Protein_16B
PROJECT=xtrimo_benchmark
# Checkpoints are written to (train) and read from (test) this directory.
CKPT_SAVE_DIR=logs/${PROJECT}/${RUN_NAME}
CONFIG_FILE=experiments/AIDO.Protein/xTrimo/configs/tcr_pmhc_affinity.yaml

if [ $MODE == "train" ]; then
    # using slurm script with 2 nodes (8 gpus in total) for training
    srun mgen fit --config $CONFIG_FILE \
        --trainer.logger.name $RUN_NAME \
        --trainer.logger.project $PROJECT \
        --trainer.callbacks.dirpath $CKPT_SAVE_DIR \
        --trainer.num_nodes 2
else
    # The glob is stored literally here; it is expanded only where $CKPT_PATH
    # is used unquoted below, so do not quote that expansion.
    CKPT_PATH=${CKPT_SAVE_DIR}/best_val*
    mgen test --config $CONFIG_FILE \
        --data.batch_size 16 \
        --model.strict_loading False \
        --model.reset_optimizer_states True \
        --trainer.logger null \
        --ckpt_path $CKPT_PATH
fi
25 |
--------------------------------------------------------------------------------
/experiments/AIDO.RNA/ncrna_family_classfification.sh:
--------------------------------------------------------------------------------
# Shuxian Zou
# Runs `mgen fit` on GPUs 0 and 1 when MODE is "train"; for any other MODE
# value runs `mgen test` on the checkpoint matching best_val* for this run.
MODE=train
# Selects the dataset config `ncrna_family_${BOUNDARY_NOISE}` and is embedded
# in RUN_NAME; the trailing comment lists the values used by the author.
BOUNDARY_NOISE=bnoise0 #'bnoise0' 'bnoise200'

RUN_NAME=nfc_${BOUNDARY_NOISE}_aido_rna_1b600m
if [ $MODE == "train" ]; then
    CKPT_SAVE_DIR=logs/rna_tasks/${RUN_NAME}
    CUDA_VISIBLE_DEVICES=0,1 mgen fit --config experiments/AIDO.RNA/configs/ncrna_family_classification.yaml \
        --data.config_name ncrna_family_${BOUNDARY_NOISE} \
        --trainer.callbacks.dirpath $CKPT_SAVE_DIR
else
    # The glob is stored literally here; it is expanded only where $CKPT_PATH
    # is used unquoted below, so do not quote that expansion.
    CKPT_PATH=logs/rna_tasks/${RUN_NAME}/best_val*
    mgen test --config experiments/AIDO.RNA/configs/ncrna_family_classification.yaml \
        --data.config_name ncrna_family_${BOUNDARY_NOISE} \
        --data.batch_size 256 \
        --model.strict_loading False \
        --model.reset_optimizer_states True \
        --trainer.logger null \
        --ckpt_path $CKPT_PATH
fi
21 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Upload to PyPI on release
2 |
3 | on:
4 | release:
5 | types: [created]
6 |
7 | jobs:
8 | pypi-publish:
9 | name: Publish release to PyPI
10 | runs-on: ubuntu-latest
11 | environment:
12 | name: pypi
13 | url: https://pypi.org/p/modelgenerator
14 | permissions:
15 | id-token: write
16 | steps:
17 | - uses: actions/checkout@v4
18 | - name: Set up Python
19 | uses: actions/setup-python@v4
20 | with:
21 | python-version: "3.x"
22 | - name: Install dependencies
23 | run: |
24 | python -m pip install --upgrade pip
25 | pip install build hatchling twine
26 | - name: Build package
27 | run: |
28 | python -m build
29 | - name: Publish package distributions to PyPI
30 | uses: pypa/gh-action-pypi-publish@release/v1
31 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein/xTrimo/fluorescence_prediction.sh:
--------------------------------------------------------------------------------
# Shuxian Zou
# Runs `mgen fit` under `srun` when MODE is "train"; for any other MODE value
# runs `mgen test` on the checkpoint matching best_val* in CKPT_SAVE_DIR.
MODE=train

RUN_NAME=fluorescence_AIDO.Protein_16B
PROJECT=xtrimo_benchmark
CONFIG_FILE=experiments/AIDO.Protein/xTrimo/configs/fluorescence_prediction.yaml
# Checkpoints are written to (train) and read from (test) this directory.
CKPT_SAVE_DIR=logs/${PROJECT}/${RUN_NAME}

if [ $MODE == "train" ]; then
    # using slurm script with 2 nodes (8 gpus in total) for training
    srun mgen fit --config $CONFIG_FILE \
        --trainer.logger.name $RUN_NAME \
        --trainer.logger.project $PROJECT \
        --trainer.callbacks.dirpath $CKPT_SAVE_DIR \
        --trainer.num_nodes 2
else
    # The glob is stored literally here; it is expanded only where $CKPT_PATH
    # is used unquoted below, so do not quote that expansion.
    CKPT_PATH=${CKPT_SAVE_DIR}/best_val*
    mgen test --config $CONFIG_FILE \
        --data.batch_size 16 \
        --model.strict_loading False \
        --model.reset_optimizer_states True \
        --trainer.logger null \
        --ckpt_path $CKPT_PATH
fi
25 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein/xTrimo/contact_prediction_binary.sh:
--------------------------------------------------------------------------------
# Shuxian Zou
# Runs `mgen fit` under `srun` when MODE is "train"; for any other MODE value
# runs `mgen test` on the checkpoint matching best_val*.ckpt in CKPT_SAVE_DIR.
MODE=train

RUN_NAME=AIDO.Protein_16B_fsdp_bs4
PROJECT=xtrimo_benchmark
CONFIG_FILE=experiments/AIDO.Protein/xTrimo/configs/contact_prediction_binary.yaml
# Checkpoints are written to (train) and read from (test) this directory.
CKPT_SAVE_DIR=logs/${PROJECT}/${RUN_NAME}

if [ $MODE == "train" ]; then
    # using slurm script with 1 node (4 gpus in total) for training
    srun mgen fit --config $CONFIG_FILE \
        --trainer.logger.name $RUN_NAME \
        --trainer.logger.project $PROJECT \
        --trainer.callbacks.dirpath $CKPT_SAVE_DIR \
        --trainer.num_nodes 1
else
    # The glob is stored literally here; it is expanded only where $CKPT_PATH
    # is used unquoted below, so do not quote that expansion.
    CKPT_PATH=${CKPT_SAVE_DIR}/best_val*.ckpt
    mgen test --config $CONFIG_FILE \
        --data.batch_size 1 \
        --model.strict_loading False \
        --model.reset_optimizer_states True \
        --trainer.logger null \
        --ckpt_path $CKPT_PATH
fi
25 |
--------------------------------------------------------------------------------
/docs/docs/api_reference/callbacks.md:
--------------------------------------------------------------------------------
1 | # Callbacks
2 |
3 | Callbacks can be used with the LightningCLI trainer to inject custom behavior into the training process.
4 | Callbacks are configured in the `trainer` section of the YAML configuration file.
5 |
6 | We provide a few custom callbacks for common use cases, but many more are available in the Lightning ecosystem.
7 | Check the [Trainer documentation](../trainer) for more details.
8 |
9 | ```yaml
10 | # Example Callback Configuration
11 | trainer:
12 | callbacks:
13 | - class_path: modelgenerator.callbacks.PredictionWriter
14 | dict_kwargs:
15 | output_dir: my_predictions
16 | filetype: tsv
17 | write_cols:
18 | - id
19 | - prediction
20 | - label
21 | model:
22 | ...
23 | data:
24 | ...
25 | ```
26 |
27 | ::: modelgenerator.callbacks.PredictionWriter
28 |
29 | ::: modelgenerator.callbacks.FTScheduler
30 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein/xTrimo/peptide_HLA_MHC_affinity.sh:
--------------------------------------------------------------------------------
# Shuxian Zou
# Runs `mgen fit` under `srun` when MODE is "train"; for any other MODE value
# runs `mgen test` on the checkpoint matching best_val* in CKPT_SAVE_DIR.
MODE=train

RUN_NAME=peptide_HLA_MHC_affinity_AIDO.Protein_16B
PROJECT=xtrimo_benchmark
# Checkpoints are written to (train) and read from (test) this directory.
CKPT_SAVE_DIR=logs/${PROJECT}/${RUN_NAME}
CONFIG_FILE=experiments/AIDO.Protein/xTrimo/configs/peptide_HLA_MHC_affinity.yaml

if [ $MODE == "train" ]; then
    # using slurm script with 2 nodes (8 gpus in total) for training
    srun mgen fit --config $CONFIG_FILE \
        --trainer.logger.name $RUN_NAME \
        --trainer.logger.project $PROJECT \
        --trainer.callbacks.dirpath $CKPT_SAVE_DIR \
        --trainer.num_nodes 2
else
    # The glob is stored literally here; it is expanded only where $CKPT_PATH
    # is used unquoted below, so do not quote that expansion.
    CKPT_PATH=${CKPT_SAVE_DIR}/best_val*
    mgen test --config $CONFIG_FILE \
        --data.batch_size 16 \
        --model.strict_loading False \
        --model.reset_optimizer_states True \
        --trainer.logger null \
        --ckpt_path $CKPT_PATH
fi
25 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies and run tests with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Pytest
5 |
6 | on: [push]
7 |
8 | jobs:
9 | build:
10 |
11 | runs-on: ubuntu-latest
12 | strategy:
13 | fail-fast: false
14 | matrix:
15 | python-version: ["3.10", "3.11", "3.12"]
16 |
17 | steps:
18 | - uses: actions/checkout@v3
19 | - name: Set up Python ${{ matrix.python-version }}
20 | uses: actions/setup-python@v3
21 | with:
22 | python-version: ${{ matrix.python-version }}
23 | - name: Install dependencies
24 | run: |
25 | python -m pip install --upgrade pip
26 | python -m pip install .
27 | - name: Test with pytest
28 | run: |
29 | pytest tests/
30 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/cli/completions/genbio-aidosp-complete.bash:
--------------------------------------------------------------------------------
# Bash completion function for `genbio-aidosp`.
# Runs the command being completed ($1) with COMP_WORDS, COMP_CWORD and
# _GENBIO_AIDOSP_COMPLETE=bash_complete in its environment, then reads its
# output as one "type,value" record per line and fills COMPREPLY from it.
# NOTE(review): the layout looks like a Click-generated completion script;
# confirm before hand-editing.
_genbio_aidosp_completion() {
    # Newline-only IFS: "${COMP_WORDS[*]}" is joined with newlines and the
    # response below is split into one record per line.
    local IFS=$'\n'
    local response

    response=$(env COMP_WORDS="${COMP_WORDS[*]}" COMP_CWORD=$COMP_CWORD _GENBIO_AIDOSP_COMPLETE=bash_complete $1)

    for completion in $response; do
        # Split the record at the first comma into its type and value.
        IFS=',' read type value <<< "$completion"

        if [[ $type == 'dir' ]]; then
            # Drop collected candidates and enable directory-name completion.
            COMPREPLY=()
            compopt -o dirnames
        elif [[ $type == 'file' ]]; then
            # Drop collected candidates and fall back to default completion.
            COMPREPLY=()
            compopt -o default
        elif [[ $type == 'plain' ]]; then
            # Literal candidate; $value is intentionally unquoted here.
            COMPREPLY+=($value)
        fi
    done

    return 0
}
23 |
# Register _genbio_aidosp_completion as the completion function for the
# `genbio-aidosp` command; `-o nosort` keeps candidates in the order returned.
_genbio_aidosp_completion_setup() {
    complete -o nosort -F _genbio_aidosp_completion genbio-aidosp
}

# Perform the registration as soon as this file is sourced.
_genbio_aidosp_completion_setup;
29 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/utils/data_process.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 GenBio AI
2 | # Copyright 2024 ByteDance and/or its affiliates.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
def make_them_on_same_device(*args):
    """Move every argument to the CPU when at least one is already there.

    Each argument must expose ``.device.type`` and ``.cpu()`` (presumably
    ``torch.Tensor`` objects; confirm against callers).

    Returns:
        A list with ``arg.cpu()`` for each argument, in order, when any
        argument reports ``device.type == "cpu"``; otherwise the ``args``
        tuple itself, unchanged.
    """
    device_kinds = {item.device.type for item in args}
    if "cpu" not in device_kinds:
        return args
    return [item.cpu() for item in args]
22 |
--------------------------------------------------------------------------------
/.github/workflows/docs.yml:
--------------------------------------------------------------------------------
1 | name: docs
2 | on:
3 | push:
4 | branches:
5 | - main
6 | permissions:
7 | contents: write
8 | jobs:
9 | deploy:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: actions/checkout@v4
13 | - name: Configure Git Credentials
14 | run: |
15 | git config user.name github-actions[bot]
16 | git config user.email 41898282+github-actions[bot]@users.noreply.github.com
17 | - uses: actions/setup-python@v5
18 | with:
19 | python-version: 3.10.15
20 | - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
21 | - uses: actions/cache@v4
22 | with:
23 | key: mkdocs-material-${{ env.cache_id }}
24 | path: .cache
25 | restore-keys: |
26 | mkdocs-material-
27 | - run: PIP_NO_CACHE_DIR=1 pip install ".[dev]"
28 | - run: mkdocs gh-deploy --config-file docs/mkdocs.yml --force
29 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/openfold_local/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | # Copyright 2025 GenBio AI.
3 | # Copyright 2024 ByteDance and/or its affiliates.
4 | # Copyright 2021 AlQuraishi Laboratory
5 | # Copyright 2021 DeepMind Technologies Limited
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 |
20 | from . import data, model, np, utils
21 |
22 | __all__ = ["model", "utils", "np", "data"]
23 |
--------------------------------------------------------------------------------
/modelgenerator/structure_tokenizer/README.md:
--------------------------------------------------------------------------------
1 | # Protein Structure Tokenizer
2 | This is the implementation of [genbio-ai/AIDO.StructureTokenizer](https://huggingface.co/genbio-ai/AIDO.StructureTokenizer). Because of the properties of protein data, it has its own standalone data pipeline. This folder is organized as follows:
3 |
4 | - `callbacks`: Contains the callbacks used in saving structure tokens and PDB files
5 | - `configs`: Contains the configuration files for the model and data
6 | - `datasets`: Contains the dataset classes for handling PDB data and the data module
7 | - `layers`: Contains the custom layers used in the model
8 | - `models`: Contains the encoder (`equiformer_encoder.py`), decoder (`esmfold_decoder.py`), the full model (`structure_tokenizer.py`), and its lightning module (`structure_tokenizer_lightning.py`)
9 | - `utils`: Miscellaneous utility functions
10 |
11 | For usage instructions, please refer to `experiments/AIDO.StructureTokenizer/README.md`.
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/model/layer_norm/kernel/compat.h:
--------------------------------------------------------------------------------
1 | // modified from https://github.com/NVIDIA/apex/blob/master/csrc/compat.h
2 | // Copyright 2021- HPC-AI Technology Inc.
3 | //
4 | // Licensed under the Apache License, Version 2.0 (the "License");
5 | // you may not use this file except in compliance with the License.
6 | // You may obtain a copy of the License at
7 | //
8 | // http://www.apache.org/licenses/LICENSE-2.0
9 | //
10 | // Unless required by applicable law or agreed to in writing, software
11 | // distributed under the License is distributed on an "AS IS" BASIS,
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | // See the License for the specific language governing permissions and
14 | // limitations under the License.
15 |
// Fall back to AT_CHECK when TORCH_CHECK is not already defined
// (presumably for older PyTorch headers that predate TORCH_CHECK; confirm).
#ifndef TORCH_CHECK
#define TORCH_CHECK AT_CHECK
#endif
19 |
// DATA_PTR expands to the identifier `data_ptr` when the build defines
// VERSION_GE_1_3 and to `data` otherwise (presumably selecting the tensor
// data accessor by PyTorch version; confirm against the build flags).
#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
25 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/msa_retrieve/msar/tools/utils.py:
--------------------------------------------------------------------------------
1 |
2 | """Common utilities for data pipeline tools."""
3 | import contextlib
4 | import datetime
5 | import logging
6 | import shutil
7 | import tempfile
8 | import time
9 | from typing import Optional
10 |
11 |
@contextlib.contextmanager
def tmpdir_manager(base_dir: Optional[str] = None):
    """Yield a freshly created temporary directory and remove it afterwards.

    Args:
        base_dir: Parent directory passed to ``tempfile.mkdtemp``; ``None``
            lets ``tempfile`` pick its default location.

    Yields:
        The path of the new directory. The directory tree is deleted when the
        ``with`` block exits, including on exceptions; removal errors are
        ignored.
    """
    scratch_path = tempfile.mkdtemp(dir=base_dir)
    try:
        yield scratch_path
    finally:
        # Best-effort cleanup: never mask the caller's exception.
        shutil.rmtree(scratch_path, ignore_errors=True)
20 |
21 |
@contextlib.contextmanager
def timing(msg: str):
    """Log how long the enclosed ``with`` block takes.

    Emits ``"Started <msg>"`` on entry and ``"Finished <msg> in <t> seconds"``
    on normal exit, both at INFO level on the root logger. If the block
    raises, the "Finished" message is not logged.

    Args:
        msg: Label inserted into both log messages.
    """
    logging.info("Started %s", msg)
    started_at = time.perf_counter()
    yield
    elapsed = time.perf_counter() - started_at
    logging.info("Finished %s in %.3f seconds", msg, elapsed)
29 |
30 |
def to_date(s: str):
    """Build a ``datetime.datetime`` from the leading date of *s*.

    Reads the year from ``s[:4]``, the month from ``s[5:7]`` and the day from
    ``s[8:10]``; the characters at positions 4 and 7 (the separators) and
    anything after position 10 are ignored.

    Raises:
        ValueError: If a field is not an integer or the date is invalid.
    """
    year_text, month_text, day_text = s[:4], s[5:7], s[8:10]
    return datetime.datetime(
        year=int(year_text), month=int(month_text), day=int(day_text)
    )
33 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/job2fasta.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 |
def json2fasta(input_json, output_fasta):
    """Write the protein chains found in a job JSON file to a FASTA file.

    The JSON document must be a list of jobs, each with a ``'sequences'``
    list. Every entry containing a ``'proteinChain'`` key contributes one
    record named ``job_<i>_entity_<j>``, where ``i`` and ``j`` are the
    zero-based positions of the job and of the entry within the job. Entries
    without ``'proteinChain'`` are skipped but still consume an index.

    Args:
        input_json: Path of the job JSON file to read.
        output_fasta: Path of the FASTA file to write (overwritten).
    """
    # Pin UTF-8 so the result does not depend on the platform locale.
    with open(input_json, 'r', encoding='utf-8') as f:
        jobs = json.load(f)

    records = []
    for i, job in enumerate(jobs):
        for j, entity in enumerate(job['sequences']):
            if "proteinChain" not in entity:
                continue
            sequence = entity['proteinChain']['sequence']
            records.append(f">job_{i}_entity_{j}\n{sequence}")

    with open(output_fasta, 'w', encoding='utf-8') as f:
        f.write("\n".join(records))
17 |
18 |
if __name__ == '__main__':
    # Command-line entry point: convert one job JSON file into one FASTA file.
    import argparse
    parser = argparse.ArgumentParser(description='Construct FASTA from job JSON for protein MSA retrieval.')
    # Both paths are mandatory; without `required=True` a missing flag would
    # reach open() as None and fail with an opaque TypeError.
    parser.add_argument('--input', type=str, required=True, help='Input JSON file')
    parser.add_argument('--output', type=str, required=True, help='Output FASTA file')
    args = parser.parse_args()
    json2fasta(args.input, args.output)
26 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/msa_retrieve/bin/search_msa.sh:
--------------------------------------------------------------------------------
# Copyright 2025 GenBio AI

# Runs search_msa.py on the bundled example FASTA and writes the results to
# ./local_msa_database/. Edit the variables below to change inputs.

# Trace every command and stop at the first failure.
set -xe

source activate msa

# Parent of the directory holding this script; quoted so that paths
# containing spaces still resolve.
PROJECT_DIR=$(cd "$(dirname "$0")" && pwd)/..

export PATH=/workspace/env/mmseqs/bin/:$PATH

#input=${PROJECT_DIR}/data/example/ # input a folder, could include multiple .fasta files
input=${PROJECT_DIR}/data/example/T1104-D1.fasta # input a fasta, could include multiple sequences
output_dir=local_msa_database/
mkdir -p "${output_dir}"

#config_yaml_path=${PROJECT_DIR}/yamls/mmseqs.yaml
config_yaml_path=${PROJECT_DIR}/yamls/mmseqs_api.yaml

cpus_per_task=4
no_tasks=60 # tune this number according to the machine.

# The last argument has no trailing backslash, so nothing that is later
# appended after it can be swallowed into this command.
python "${PROJECT_DIR}/search_msa.py" \
    --input="${input}" \
    --output_dir="${output_dir}" \
    --cpus_per_task="${cpus_per_task}" \
    --no_tasks="${no_tasks}" \
    --config_yaml_path="${config_yaml_path}" \
    --shuffle_file_list
29 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructureTokenizer/decode.yaml:
--------------------------------------------------------------------------------
1 | trainer:
2 | accelerator: auto
3 | devices: auto
4 | max_steps: -1
5 | max_epochs: -1
6 | gradient_clip_val: 1
7 | default_root_dir: "logs/protstruct_decode/"
8 | logger: false
9 | callbacks:
10 | - class_path: modelgenerator.structure_tokenizer.callbacks.WriterPDBCallback
11 | dict_kwargs:
12 | dirpath: "logs/protstruct_decode/"
13 |
14 | model:
15 | class_path: modelgenerator.structure_tokenizer.models.ESMFoldDecoderLightning
16 | init_args:
17 | pretrained_model_name_or_path: "genbio-ai/AIDO.StructureDecoder"
18 |
19 | data:
20 | class_path: modelgenerator.structure_tokenizer.datasets.StructTokensLightningDataModule
21 | init_args:
22 | config:
23 | num_workers: 0
24 | struct_tokens_datasets_configs:
25 | - name: "casp15"
26 | struct_tokens_path: "logs/protstruct_encode/casp15_struct_tokens.pt"
27 | codebook_path: "logs/protstruct_encode/codebook.pt"
28 | batch_size: 2
29 |
--------------------------------------------------------------------------------
/experiments/AIDO.RNA/splice_site_prediction.sh:
--------------------------------------------------------------------------------
# Shuxian Zou
# Runs `mgen fit` on GPUs 0 and 1 when MODE is "train"; for any other MODE
# value runs `mgen test` once per test split listed in the loop below.
MODE=train
# Selects the dataset config `splice_site_${SPLICE_SITE}` and is embedded in
# RUN_NAME; the trailing comment lists the values used by the author.
SPLICE_SITE=acceptor #'acceptor' 'donor'

RUN_NAME=csp_${SPLICE_SITE}_aido_rna_1b600m
if [ $MODE == "train" ]; then
    CKPT_SAVE_DIR=logs/rna_tasks/${RUN_NAME}
    CUDA_VISIBLE_DEVICES=0,1 mgen fit --config experiments/AIDO.RNA/configs/splice_site_prediction.yaml \
        --data.config_name splice_site_${SPLICE_SITE} \
        --trainer.logger.name $RUN_NAME \
        --trainer.callbacks.dirpath $CKPT_SAVE_DIR
else
    # The glob is stored literally here; it is expanded only where $CKPT_PATH
    # is used unquoted below, so do not quote that expansion.
    CKPT_PATH=logs/rna_tasks/${RUN_NAME}/best_val*
    # Each iteration evaluates the same checkpoint on split test_<TEST_TYPE>.
    for TEST_TYPE in danio fly worm thaliana
    do
        echo $TEST_TYPE
        mgen test --config experiments/AIDO.RNA/configs/splice_site_prediction.yaml \
            --data.config_name splice_site_${SPLICE_SITE} \
            --data.test_split_name test_$TEST_TYPE \
            --data.batch_size 256 \
            --model.strict_loading False \
            --model.reset_optimizer_states True \
            --trainer.logger null \
            --ckpt_path $CKPT_PATH
    done
fi
27 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/openfold_local/utils/geometry/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 GenBio AI.
2 | # Copyright 2024 ByteDance and/or its affiliates.
3 | # Copyright 2021 AlQuraishi Laboratory
4 | # Copyright 2021 DeepMind Technologies Limited
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | """Utils for geometry library."""
19 |
20 | import dataclasses
21 |
22 |
def get_field_names(cls):
    """Return the field names of a dataclass, in declaration order.

    Args:
        cls: A dataclass type or instance, as accepted by
            ``dataclasses.fields``.

    Returns:
        A list with the ``name`` of every field.
    """
    return [field.name for field in dataclasses.fields(cls)]
27 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/utils/hash_encoder.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 GenBio AI
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | import hashlib
17 |
18 |
def hash_seq(seq, method='md5'):
    """Return the hexadecimal digest of a string sequence.

    :param seq: string to hash; it is encoded as UTF-8 before hashing
    :param method: name of the hash algorithm; only ``"md5"`` is supported
    :return: the hex digest as a string
    :raises NotImplementedError: if ``method`` is anything other than "md5"
    """
    if method != "md5":
        raise NotImplementedError
    encoded = seq.encode(encoding='utf-8')
    return hashlib.md5(encoded).hexdigest()
33 |
--------------------------------------------------------------------------------
/experiments/AIDO.RNA/rna_inverse_folding/rna_inv_fold_test.yaml:
--------------------------------------------------------------------------------
1 | ckpt_path: null
2 |
3 | # Data Loading
4 | data:
5 | class_path: modelgenerator.rna_inv_fold.data_inverse_folding.datamodule.RNAInverseFoldingDataModule
6 | init_args:
7 | # path: /mgen_data/modelgenerator/datasets/rna_inv_fold/structure_encoding/
8 | path: null
9 |
10 | # Model Arguments
11 | model:
12 | class_path: modelgenerator.rna_inv_fold.rif_task.RNAInvFold
13 | init_args:
14 | backbone:
15 | class_path: modelgenerator.backbones.aido_rna_1b600m
16 | custom_invfold_config:
17 | ## diffusion
18 | num_denoise_steps: 3
19 | diffusion_verbose: 1
20 |
21 | # Training Configuration
22 | trainer:
23 | accelerator: auto
24 | devices: 1
25 | max_steps: -1
26 | max_epochs: -1
27 | gradient_clip_val: null
28 | precision: 32
29 | default_root_dir: "/mgen_data/modelgenerator/logs/rna_inv_fold/"
30 | detect_anomaly: true
31 |
32 | # DDP strategy
33 | strategy:
34 | class_path: lightning.pytorch.strategies.DDPStrategy
35 | dict_kwargs:
36 | find_unused_parameters: true
37 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructureTokenizer/encode_decode.yaml:
--------------------------------------------------------------------------------
1 | trainer:
2 | accelerator: auto
3 | devices: auto
4 | max_steps: -1
5 | max_epochs: -1
6 | gradient_clip_val: 1
7 | default_root_dir: "logs/protstruct_model/"
8 | logger: false
9 | callbacks:
10 | - class_path: modelgenerator.structure_tokenizer.callbacks.WriterPDBCallback
11 | dict_kwargs:
12 | dirpath: "logs/protstruct_model/"
13 |
14 | model:
15 | class_path: modelgenerator.structure_tokenizer.models.StructureTokenizerLightning
16 | init_args:
17 | pretrained_model_name_or_path: "genbio-ai/AIDO.StructureTokenizer"
18 |
19 | data:
20 | class_path: modelgenerator.structure_tokenizer.datasets.ProteinLightningDataModule
21 | init_args:
22 | config:
23 | num_workers: 0
24 | seed: 0
25 | proteins_datasets_configs:
26 | - name: "casp15"
27 | registry_path: "data/protstruct_sample_data/registries/casp15_merged.csv"
28 | folder_path: "data/protstruct_sample_data/CASP15_merged/"
29 | max_nb_res: 1024
30 | batch_size: 2
31 | seed: 0
32 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/openfold_local/utils/precision_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 GenBio AI.
2 | # Copyright 2024 ByteDance and/or its affiliates.
3 | # Copyright 2021 AlQuraishi Laboratory
4 | # Copyright 2021 DeepMind Technologies Limited
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 |
19 | import torch
20 |
21 |
def is_fp16_enabled():
    """Tell whether autocast is currently active with float16 as GPU dtype.

    Returns:
        ``True`` only when ``torch.get_autocast_gpu_dtype()`` is
        ``torch.float16`` and ``torch.is_autocast_enabled()`` is true.
    """
    # Autocast world
    autocast_is_half = torch.get_autocast_gpu_dtype() == torch.float16
    return autocast_is_half and torch.is_autocast_enabled()
28 |
--------------------------------------------------------------------------------
/experiments/AIDO.RNA/expression_level_prediction.sh:
--------------------------------------------------------------------------------
# Shuxian Zou
# Runs `mgen fit` (MODE "train") or `mgen test` (any other MODE) once per
# fold 0-9, passing the fold as --data.cv_test_fold_id.
# NOTE: the {0..9} brace expansion requires bash.
MODE=train
# Selects the dataset config `expression_${CELL_LINE}` and is embedded in the
# run name; the trailing comment lists the values used by the author.
CELL_LINE=pc3 #'Muscle' 'HEK' 'pc3'

if [ $MODE == "train" ]; then
    for FOLD in {0..9}
    do
        # Each fold gets its own run name and checkpoint directory.
        RUN_NAME=el_${CELL_LINE}_aido_rna_1b600m_fold${FOLD}
        CKPT_SAVE_DIR=logs/rna_tasks/${RUN_NAME}
        CUDA_VISIBLE_DEVICES=0 mgen fit --config experiments/AIDO.RNA/configs/expression_level.yaml \
            --data.config_name expression_${CELL_LINE} \
            --data.cv_test_fold_id $FOLD \
            --trainer.logger.name $RUN_NAME \
            --trainer.callbacks.dirpath $CKPT_SAVE_DIR
    done
else
    for FOLD in {0..9}
    do
        # The glob is stored literally here; it is expanded only where
        # $CKPT_PATH is used unquoted below, so do not quote that expansion.
        CKPT_PATH=logs/rna_tasks/el_${CELL_LINE}_aido_rna_1b600m_fold${FOLD}/best_val*
        echo ">>> Fold ${FOLD}"
        mgen test --config experiments/AIDO.RNA/configs/expression_level.yaml \
            --data.config_name expression_${CELL_LINE} \
            --data.cv_test_fold_id $FOLD \
            --model.strict_loading True \
            --model.reset_optimizer_states True \
            --trainer.logger null \
            --ckpt_path $CKPT_PATH
    done
fi
30 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/openfold_local/data/errors.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 GenBio AI.
2 | # Copyright 2024 ByteDance and/or its affiliates.
3 | # Copyright 2021 AlQuraishi Laboratory
4 | # Copyright 2021 DeepMind Technologies Limited
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 |
19 | """General-purpose errors used throughout the data pipeline"""
20 |
21 |
class Error(Exception):
    """Root of the data-pipeline exception hierarchy."""


class MultipleChainsError(Error):
    """Raised when more than one chain is found for a given ID."""
28 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructureTokenizer/encode.yaml:
--------------------------------------------------------------------------------
1 | trainer:
2 | accelerator: auto
3 | devices: auto
4 | max_steps: -1
5 | max_epochs: -1
6 | gradient_clip_val: 1
7 | default_root_dir: "logs/protstruct_encode/"
8 | logger: false
9 | callbacks:
10 | - class_path: modelgenerator.structure_tokenizer.callbacks.StructTokensCallback
11 | dict_kwargs:
12 | output_dir: "logs/protstruct_encode/"
13 | write_interval: "epoch"
14 |
15 | model:
16 | class_path: modelgenerator.structure_tokenizer.models.EquiformerEncoderLightning
17 | init_args:
18 | pretrained_model_name_or_path: "genbio-ai/AIDO.StructureEncoder"
19 |
20 | data:
21 | class_path: modelgenerator.structure_tokenizer.datasets.ProteinLightningDataModule
22 | init_args:
23 | config:
24 | num_workers: 0
25 | seed: 0
26 | proteins_datasets_configs:
27 | - name: "casp15"
28 | registry_path: "data/protstruct_sample_data/registries/casp15_merged.csv"
29 | folder_path: "data/protstruct_sample_data/CASP15_merged/"
30 | max_nb_res: 1024
31 | batch_size: 2
32 | seed: 0
33 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructureTokenizer/structure_encoding.sh:
--------------------------------------------------------------------------------
1 | # 1. download the sample dataset
2 | huggingface-cli download genbio-ai/sample-structure-dataset --repo-type dataset --local-dir ./data/protstruct_sample_data/
3 |
4 | set -ex
5 |
6 | # 2. run encoding and then decoding
7 | # check logs/protstruct_model/casp15_pdb_files
8 | # *_input.pdb are the original pdb files
9 | # *_output.pdb are the reconstructed pdb files
10 | echo "run encoding and then decoding"
11 | CUDA_VISIBLE_DEVICES=0 mgen predict --config=experiments/AIDO.StructureTokenizer/encode_decode.yaml
12 |
13 |
14 | # 3. run encoding only
15 | # check logs/protstruct_encode/casp15_struct_tokens.pt for the output tokens
16 | # logs/protstruct_encode/codebook.pt for the codebook
17 | echo "run encoding only"
18 | CUDA_VISIBLE_DEVICES=0 mgen predict --config=experiments/AIDO.StructureTokenizer/encode.yaml
19 |
20 | # 4. decode the tokens from step 3
21 | # check logs/protstruct_decode/casp15_pdb_files for the output structures
22 | echo "decode the tokens from the encoding step"
23 | CUDA_VISIBLE_DEVICES=0 mgen predict --config=experiments/AIDO.StructureTokenizer/decode.yaml
24 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein/protein_inverse_folding/protein_inv_fold_test.yaml:
--------------------------------------------------------------------------------
1 | ckpt_path: null
2 |
3 | # Data Loading
4 | data:
5 | class_path: modelgenerator.prot_inv_fold.data_inverse_folding.datamodule.ProteinInverseFoldingDataModule
6 | init_args:
7 | # path: /mgen_data/modelgenerator/datasets/protein_inv_fold/cath_4.2/
8 | path: null
9 |
10 | # Model Arguments
11 | model:
12 | class_path: modelgenerator.prot_inv_fold.pif_task.ProteinInvFold
13 | init_args:
14 | backbone:
15 | class_path: modelgenerator.backbones.aido_protein_16b
16 | custom_invfold_config:
17 | ## custom setting for diffusion
18 | num_denoise_steps: 3
19 | diffusion_verbose: 1
20 |
21 | # Training Configuration
22 | trainer:
23 | accelerator: auto
24 | devices: 3,
25 | max_steps: -1
26 | max_epochs: -1
27 | gradient_clip_val: null
28 | precision: 32
29 | default_root_dir: "/mgen_data/modelgenerator/logs/protein_inv_fold/"
30 | detect_anomaly: true
31 |
32 | # DDP strategy
33 | strategy:
34 | class_path: lightning.pytorch.strategies.DDPStrategy
35 | dict_kwargs:
36 | find_unused_parameters: true
37 |
--------------------------------------------------------------------------------
/experiments/AIDO.RNA/translation_efficiency_prediction.sh:
--------------------------------------------------------------------------------
# Shuxian Zou
#
# Runs the translation-efficiency task once per cross-validation fold (0..9).
# MODE=train fits a model for every fold; any other value runs `mgen test`
# on the checkpoint matching best_val* in each fold's log directory.
# Requires bash (uses {0..9} brace expansion).
MODE=train
CELL_LINE=Muscle #'Muscle' 'HEK' 'pc3'

# Quote "$MODE" so the test stays well-formed even if the variable is empty.
if [ "$MODE" = "train" ]; then
  for FOLD in {0..9}
  do
    RUN_NAME=te_${CELL_LINE}_aido_rna_1b600m_fold${FOLD}
    CKPT_SAVE_DIR=logs/rna_tasks/${RUN_NAME}
    CUDA_VISIBLE_DEVICES=0 mgen fit --config experiments/AIDO.RNA/configs/translation_efficiency.yaml \
      --data.config_name "translation_efficiency_${CELL_LINE}" \
      --data.cv_test_fold_id "$FOLD" \
      --trainer.logger.name "$RUN_NAME" \
      --trainer.callbacks.dirpath "$CKPT_SAVE_DIR"
  done
else
  for FOLD in {0..9}
  do
    CKPT_PATH=logs/rna_tasks/te_${CELL_LINE}_aido_rna_1b600m_fold${FOLD}/best_val*
    echo ">>> Fold ${FOLD}"
    # $CKPT_PATH is deliberately left unquoted: the shell must expand the
    # best_val* glob into the actual checkpoint file name here.
    mgen test --config experiments/AIDO.RNA/configs/translation_efficiency.yaml \
      --data.config_name "translation_efficiency_${CELL_LINE}" \
      --data.cv_test_fold_id "$FOLD" \
      --model.strict_loading True \
      --model.reset_optimizer_states True \
      --trainer.logger null \
      --ckpt_path $CKPT_PATH
  done
fi
30 |
--------------------------------------------------------------------------------
/modelgenerator/huggingface_models/enformer_pytorch/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Phil Wang
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/modelgenerator/rna_inv_fold/gRNAde_structure_encoder/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Chaitanya K. Joshi
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/experiments/AIDO.RNA/protein_abundance_prediction.sh:
--------------------------------------------------------------------------------
# Shuxian Zou
#
# Runs the protein-abundance task once per cross-validation fold (0..4).
# MODE=train fits a model for every fold; any other value runs `mgen test`
# on the checkpoint matching best_val* in each fold's log directory.
# Requires bash (uses {0..4} brace expansion).
MODE=train
ORGANISM=hsapiens #'athaliana' 'dmelanogaster' 'ecoli' 'hsapiens' 'scerevisiae'

PROJECT=rna_tasks
# Quote "$MODE" so the test stays well-formed even if the variable is empty.
if [ "$MODE" = "train" ]; then
  for FOLD in {0..4}
  do
    # NOTE: the separator before "cds" used to be U+0640 (Arabic tatweel), which
    # only looks like an underscore. It is now a real "_". Checkpoints written
    # under the old directory name must be renamed to be picked up by test mode.
    RUN_NAME=pa_${ORGANISM}_aido_rna_1b600m_cds_fold${FOLD}
    CKPT_SAVE_DIR=logs/${PROJECT}/${RUN_NAME}
    mgen fit --config experiments/AIDO.RNA/configs/protein_abundance.yaml \
      --data.config_name "protein_abundance_${ORGANISM}" \
      --data.cv_test_fold_id "$FOLD" \
      --trainer.logger.name "$RUN_NAME" \
      --trainer.callbacks.dirpath "$CKPT_SAVE_DIR"
  done
else
  for FOLD in {0..4}
  do
    # Must match the RUN_NAME used by the training branch above.
    RUN_NAME=pa_${ORGANISM}_aido_rna_1b600m_cds_fold${FOLD}
    CKPT_PATH=logs/${PROJECT}/${RUN_NAME}/best_val*
    echo ">>> Fold ${FOLD}"
    # $CKPT_PATH is deliberately left unquoted: the shell must expand the
    # best_val* glob into the actual checkpoint file name here.
    mgen test --config experiments/AIDO.RNA/configs/protein_abundance.yaml \
      --data.config_name "protein_abundance_${ORGANISM}" \
      --data.cv_test_fold_id "$FOLD" \
      --model.strict_loading False \
      --model.reset_optimizer_states True \
      --trainer.logger null \
      --ckpt_path $CKPT_PATH
  done
fi
32 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/astral-sh/ruff-pre-commit
3 | rev: v0.11.7
4 | hooks:
5 | - id: ruff
6 | args: [--config, pyproject.toml, --fix]
7 | - id: ruff-format
8 | args: [--config, pyproject.toml]
9 | - repo: https://github.com/pre-commit/pre-commit-hooks
10 | rev: v5.0.0
11 | hooks:
12 | - id: trailing-whitespace
13 | exclude: ^modelgenerator/(huggingface_models|prot_inv_fold|rna_inv_fold|rna_ss|structure_tokenizer)
14 | - id: end-of-file-fixer
15 | exclude: ^modelgenerator/(huggingface_models|prot_inv_fold|rna_inv_fold|rna_ss|structure_tokenizer)
16 | - id: check-yaml
17 | exclude: ^modelgenerator/(huggingface_models|prot_inv_fold|rna_inv_fold|rna_ss|structure_tokenizer)
18 | - id: debug-statements
19 | exclude: ^modelgenerator/(huggingface_models|prot_inv_fold|rna_inv_fold|rna_ss|structure_tokenizer)
20 | - id: check-added-large-files
21 | exclude: ^modelgenerator/(huggingface_models|prot_inv_fold|rna_inv_fold|rna_ss|structure_tokenizer)
22 | - repo: https://github.com/python-poetry/poetry
23 | rev: 2.1.2
24 | hooks:
25 | - id: poetry-check
26 | - id: poetry-lock
27 |
--------------------------------------------------------------------------------
/experiments/AIDO.RNA/transcript_abundance_prediction.sh:
--------------------------------------------------------------------------------
# Shuxian Zou
#
# Runs the transcript-abundance task once per cross-validation fold (0..4).
# MODE=train fits a model for every fold; any other value runs `mgen test`
# on the checkpoint matching best_val* in each fold's log directory.
# Requires bash (uses {0..4} brace expansion).
MODE=train
ORGANISM=ecoli #'athaliana' 'dmelanogaster' 'ecoli' 'hsapiens' 'scerevisiae' 'ppastoris' 'hvolcanii'

PROJECT=rna_tasks
# Quote "$MODE" so the test stays well-formed even if the variable is empty.
if [ "$MODE" = "train" ]; then
  for FOLD in {0..4}
  do
    RUN_NAME=ta_${ORGANISM}_aido_rna_1b600m_fold${FOLD}
    CKPT_SAVE_DIR=logs/${PROJECT}/${RUN_NAME}
    mgen fit --config experiments/AIDO.RNA/configs/transcript_abundance.yaml \
      --data.config_name "transcript_abundance_${ORGANISM}" \
      --data.cv_test_fold_id "$FOLD" \
      --trainer.logger.name "$RUN_NAME" \
      --trainer.callbacks.dirpath "$CKPT_SAVE_DIR"
  done
else
  for FOLD in {0..4}
  do
    RUN_NAME=ta_${ORGANISM}_aido_rna_1b600m_fold${FOLD}
    CKPT_PATH=logs/${PROJECT}/${RUN_NAME}/best_val*
    echo ">>> Fold ${FOLD}"
    # $CKPT_PATH is deliberately left unquoted: the shell must expand the
    # best_val* glob into the actual checkpoint file name here.
    mgen test --config experiments/AIDO.RNA/configs/transcript_abundance.yaml \
      --data.config_name "transcript_abundance_${ORGANISM}" \
      --data.cv_test_fold_id "$FOLD" \
      --model.strict_loading False \
      --model.reset_optimizer_states True \
      --trainer.logger null \
      --ckpt_path $CKPT_PATH
  done
fi
32 |
--------------------------------------------------------------------------------
/modelgenerator/huggingface_models/borzoi_pytorch/config_borzoi.py:
--------------------------------------------------------------------------------
1 | from transformers import PretrainedConfig
2 |
class BorzoiConfig(PretrainedConfig):
    """Configuration object registered under the model type ``"borzoi"``.

    Every named constructor argument is stored unchanged as an attribute of
    the same name. Any additional keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.
    """

    model_type = "borzoi"

    def __init__(
        self,
        dim=1536,
        depth=8,
        heads=8,
        # output_heads = dict(human = 5313, mouse= 1643),
        return_center_bins_only=True,
        attn_dim_key=64,
        attn_dim_value=192,
        dropout_rate=0.2,
        attn_dropout=0.05,
        pos_dropout=0.01,
        enable_mouse_head=False,
        bins_to_return=6144,
        **kwargs,
    ):
        # Own fields are set first; the base class initialiser runs last and
        # consumes whatever is left in ``kwargs``.
        self.dim = dim
        self.depth = depth
        self.heads = heads
        # self.output_heads = output_heads
        self.attn_dim_key = attn_dim_key
        self.attn_dim_value = attn_dim_value
        self.dropout_rate = dropout_rate
        self.attn_dropout = attn_dropout
        self.pos_dropout = pos_dropout
        self.return_center_bins_only = return_center_bins_only
        self.enable_mouse_head = enable_mouse_head
        self.bins_to_return = bins_to_return
        super().__init__(**kwargs)
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/data/ccd_data.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 GenBio AI
2 | # Copyright 2024 ByteDance and/or its affiliates.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | from pathlib import Path
17 |
# Module-level registry for two externally supplied paths. Both start unset
# (None) and are filled in through the setter functions below.
COMPONENTS_FILE = None
RKDIT_MOL_PKL = None


def set_components_file(components_file):
    """Remember ``components_file`` exactly as given (no conversion)."""
    global COMPONENTS_FILE
    COMPONENTS_FILE = components_file


def set_rkdit_mol_pkl(rkdit_mol_pkl):
    """Remember ``rkdit_mol_pkl``, converted to a ``pathlib.Path``."""
    global RKDIT_MOL_PKL
    RKDIT_MOL_PKL = Path(rkdit_mol_pkl)


def get_components_file():
    """Return the stored components file, or ``None`` if never set."""
    # Reading a module global needs no ``global`` declaration.
    return COMPONENTS_FILE


def get_rkdit_mol_pkl():
    """Return the stored pickle path (a ``Path``), or ``None`` if never set."""
    return RKDIT_MOL_PKL
37 |
--------------------------------------------------------------------------------
/modelgenerator/structure_tokenizer/utils/misc.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from omegaconf import OmegaConf
4 |
5 |
def get_config_from_dict(config_dict: dict, config: type):
    """Build an instance of the structured config ``config`` from a plain dict.

    Args:
        config_dict: Values to load.
        config: The structured-config class handed to ``OmegaConf.structured``
            to act as the schema.

    Returns:
        The result of ``OmegaConf.to_object`` applied to the schema merged
        with ``config_dict``.
    """
    overrides = OmegaConf.create(config_dict)
    # Merging onto the structured schema lets OmegaConf apply the schema to
    # the incoming values.
    combined = OmegaConf.merge(OmegaConf.structured(config), overrides)
    return OmegaConf.to_object(combined)
11 |
12 |
13 | def cdist(
14 | x: torch.Tensor,
15 | y: torch.Tensor,
16 | mask_x: torch.Tensor | None = None,
17 | mask_y: torch.Tensor | None = None,
18 | zero_diag: bool = False,
19 | ) -> torch.Tensor:
20 | # where mask is False, the distance is set to inf
21 | cdist = torch.cdist(x, y)
22 | if zero_diag:
23 | assert (
24 | cdist.shape[-1] == cdist.shape[-2]
25 | ), f"Zeroing diagonal is only supported for square matrix, got {cdist.shape}"
26 | N = cdist.shape[-1]
27 | device = cdist.device
28 | eye = torch.eye(N, dtype=torch.bool, device=device)
29 | cdist = torch.where(eye, 0, cdist)
30 | if mask_x is not None:
31 | cdist = torch.where(mask_x[..., :, None].bool(), cdist, np.inf)
32 | if mask_y is not None:
33 | cdist = torch.where(mask_y[..., None, :].bool(), cdist, np.inf)
34 | return cdist
35 |
--------------------------------------------------------------------------------
/experiments/AIDO.DNA/dependency_mapping/depmap.csv:
--------------------------------------------------------------------------------
1 | id,sequence
2 | >KIF26B_1(+)|TATA|1,GCGCCGCCACATAAAATGGATCCCGGCCGGCGCGGCGAGGGCGGCAGGTTCCCGAGGCTCCTCCGCGCTGCGCCCGGGCGCACACGCGCGCTGTGACCGCCGACCGCTCCCGGGCCACGCGGAGCCGCCCCTCTCCCGGCCCTCGCGCAACTGTCAGGCGAAACGGGCCGGCGGATATTGGCTCGGCGACACGCCGAGGCTCCTCCCCGAGTCTGGATCTTTATATTTTGGGAGAATTTCTTTGAACTCAGTTACCAAGCTCGGTGAAGGAGACAAGTTCCCACAGCTGACTCGGCTCGG
3 | >M3ZUZ2_XIPMA|1,GATAAAACATTAAGTTGTCCTGAAGCGGTTTGACGTTACGTTTCACTGTTTAAGGACAAGGAGGCCGCGTCACGATGGTCCCCATCTTCACACTGAAGCTAAACCACAAGATTAACCCCCGCATGGTGGCTGTTGGAAAGTTTGATGGAGTGCACCCATGTCTAACAGCAGCAACACAAGCAGGAAAGGTGAGGGGAATATGTAGCATAACTGCTCAGCCTGCAGGAGGCTTCAAAGTTGCTGAAGGAACATACAGTATATCAAATATTTTCATTTTAACCTTAACTGTTCTTCATTTACAGGTTTTCATTCACAACCCTCATGATCGTGGTCAGAGACCTGCGACCCATCGACTGAGCCAGAGCACCCAGGACTCTGATATCTCTCTTCTCAACATCAACCAGGCCGTAACATGTTTGACTGCAGGGACACTGGGACCAAACACCACAGGAGACACGCTTCTGGTGGGATCTCAGACCAATCTGTTGGCCTATGATGTTCACGACAATACAGATGTTTTTTACAGAGACGTAAGTGGAAGAACTATCTTTGGGGTCACTGGATGTAGAGCAGACTCCTTTTTTTGTGATGTTTGTTCAT
4 | >SV2C_1(+)|no_TATA|1,CCCAGTCCCACACCGCAGCAGCGCCTCAGCACCGCGACTTGCCGGAGCACCGCGAGTGGCGCGCGGGTCCCGCCTCCCCCGCGCGCCGTGACTCCCTGCGCACCGCTGGTACTCTCGCCACGCCGCCGCCCGGCACTGCAGCACCAGGGGGAGGAGGCAGGCGGAGGAGAGGAGGAGGACCGCAGCGTGCAAGCCGGGAGCCACTTTCCCGCCCCTCCTCTCGCCGCTGACACGCTCAGAGGAGTCACCACTCCGCGCGCTGCAGGCGAGAGTGGCAGACGGAGGCAGCCCGGGGAAGCG
5 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/utils/logger.py:
--------------------------------------------------------------------------------
1 |
2 | # Copyright 2025 GenBio AI
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
17 | # -*-coding:utf-8-*-
18 |
19 | import logging.config
20 |
21 |
def singleton(cls):
    """Class decorator that replaces the class with one instance of it.

    The decorated name is bound to the *instance* (built with ``cls()`` and no
    arguments), not to a factory: the inner accessor is called once, right
    here, and its result is what gets returned.
    """
    cache = {}

    def get_instance():
        try:
            return cache[cls]
        except KeyError:
            cache[cls] = cls()
            return cache[cls]

    return get_instance()
31 |
32 |
@singleton
class Logger:
    """Holder for the shared logger; exposes it as the ``logger`` attribute.

    Because of ``@singleton`` the name ``Logger`` refers to an already-built
    instance, so callers use ``Logger.logger`` directly.
    """

    def __init__(self):
        # Timestamp, level, process id, source location, then the message.
        line_format = "%(asctime)s %(levelname)s %(process)d [%(filename)s:%(lineno)d] %(message)s"
        logging.basicConfig(format=line_format, level=logging.INFO)
        self.logger = logging.getLogger("root")
41 |
--------------------------------------------------------------------------------
/modelgenerator/structure_tokenizer/configs/lightning_configs.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from enum import Enum
3 | from pathlib import Path
4 | from typing import Any
5 |
6 |
class Device(str, Enum):
    """Device choices; members are also plain strings ("cpu" / "cuda")."""

    CPU = "cpu"
    CUDA = "cuda"
10 |
11 |
@dataclass
class ValidationExperimentConfig:
    """Settings bundle for a validation experiment.

    Fields without a default must be supplied; ``devices`` and ``seed`` are
    optional.
    """

    name: str  # name of the project
    wandb_project: str
    output_dir: str | Path  # runs directory
    path_ckpt: str | Path
    device: Device
    log_every_n_steps: int
    loggers: list[Any]
    devices: Any = "auto"  # list[int] | str | int
    seed: int = 0  # not necessary if no cropping
23 |
24 |
@dataclass
class EncodingConfig:
    """Settings bundle for an encoding run.

    Fields without a default must be supplied; ``devices`` and ``seed`` are
    optional.
    """

    folder_name: str  # folder name
    output_dir: str | Path  # output directory
    path_ckpt: str | Path
    device: Device
    return_predictions: bool
    devices: Any = "auto"  # list[int] | str | int
    seed: int = 0  # not necessary if no cropping
34 |
35 |
@dataclass
class DecodingConfig:
    """Settings bundle for a decoding run.

    Same fields as ``EncodingConfig`` except that there is no
    ``return_predictions`` flag.
    """

    folder_name: str  # folder name
    output_dir: str | Path  # output directory
    path_ckpt: str | Path
    device: Device
    devices: Any = "auto"  # list[int] | str | int
    seed: int = 0  # not necessary ?
44 |
--------------------------------------------------------------------------------
/experiments/AIDO.RNA/rna_secondary_structure_prediction/rna_secondary_structure_prediction.sh:
--------------------------------------------------------------------------------
# Sazan Mahbub
#
# Usage: <script> MODE DATASET_NAME [CKPT_PATH]
#   MODE          "train" to fine-tune; anything else runs `mgen test`
#   DATASET_NAME  name of the dataset
#   CKPT_PATH     checkpoint file to evaluate (test mode only)
MODE=$1 ## set it to "train" for finetuning the RNA-FM for RNA secondary structure prediction

RUN_NAME=rna_ss
DATASET_NAME=$2 ## set the name of the dataset
CKPT_SAVE_DIR=logs/${RUN_NAME}/${DATASET_NAME}

# "$MODE" is quoted: with no first argument the unquoted form expands to
# `[ == "train" ]`, which is a syntax error inside `[` and silently drops
# into the test branch.
if [ "$MODE" = "train" ]; then
  mgen fit --config rna_ss_prediction.yaml \
    --data.path "${MGEN_DATA_DIR}/modelgenerator/datasets/rna_ss_data/" \
    --data.dataset "${DATASET_NAME}" \
    --trainer.default_root_dir "${CKPT_SAVE_DIR}" \
    --trainer.callbacks.ft_schedule_path ft_schedules/layers_0_32.yaml \
    --trainer.devices 0,1,2,3

else
  # CKPT_PATH=${MGEN_DATA_DIR}/modelgenerator/huggingface_models/rna_ss/AIDO.RNA-1.6B-${DATASET_NAME}_secondary_structure_prediction/model.ckpt
  CKPT_PATH=$3 ## set the path to the checkpoint file (example shown in the commented line above)
  mgen test --config rna_ss_prediction.yaml \
    --data.path "${MGEN_DATA_DIR}/modelgenerator/datasets/rna_ss_data/" \
    --data.dataset "${DATASET_NAME}" \
    --trainer.default_root_dir "${CKPT_SAVE_DIR}" \
    --trainer.callbacks.ft_schedule_path ft_schedules/layers_0_32.yaml \
    --ckpt_path "${CKPT_PATH}" \
    --trainer.devices 0,
fi
27 |
--------------------------------------------------------------------------------
/experiments/AIDO.RNA/multimodal_isoform_expression/isoform_expression_prediction.sh:
--------------------------------------------------------------------------------
# Fits (MODE=train) or tests (any other MODE) the isoform-expression model
# described by CONFIG_FILE. Alternative RUN_NAME / CONFIG_FILE pairs are kept
# below as commented-out presets.
MODE=train

RUN_NAME=enformer_rnafm1.6b-cds_esm2_concat_fusion
CONFIG_FILE=experiments/AIDO.MM/configs/isoform_expression_dna_rna_prot_concat.yaml

# RUN_NAME=enformer_rnafm1.6b-cds_concat_fusion
# CONFIG_FILE=experiments/AIDO.MM/configs/isoform_expression_dna_rna_concat.yaml

# RUN_NAME=enformer_aidorna650m_esm2_attention_fusion
# CONFIG_FILE=experiments/AIDO.MM/configs/isoform_expression_dna_rna_prot_attention.yaml

# RUN_NAME=enformer_aidorna650m_attention_fusion
# CONFIG_FILE=experiments/AIDO.MM/configs/isoform_expression_dna_rna_attention.yaml

PROJECT=isoform_tasks
# Built from the GENBIO_DATA_DIR environment variable, so every use below is
# quoted in case that path contains whitespace.
CKPT_SAVE_DIR=${GENBIO_DATA_DIR}/genbio_finetune/logs/${PROJECT}/${RUN_NAME}

if [ "$MODE" = "train" ]; then
  mgen fit --config "$CONFIG_FILE" \
    --trainer.logger.name "$RUN_NAME" \
    --trainer.logger.project "$PROJECT" \
    --trainer.callbacks.dirpath "$CKPT_SAVE_DIR" \
    --model.optimizer.lr 1e-4 \
    --data.batch_size 1
else
  # The glob must stay outside the quotes so the shell expands best_val*
  # into the actual checkpoint file name.
  mgen test --config "$CONFIG_FILE" \
    --data.batch_size 16 \
    --trainer.logger null \
    --model.strict_loading False \
    --model.reset_optimizer_states True \
    --ckpt_path "${CKPT_SAVE_DIR}"/best_val*
fi
34 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/openfold_local/utils/kernel/csrc/softmax_cuda_stub.cpp:
--------------------------------------------------------------------------------
1 | // Copyright 2021 AlQuraishi Laboratory
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | // modified from fastfold/model/fastnn/kernel/cuda_native/csrc/softmax_cuda.cpp
16 |
17 | #include
18 |
19 | void attn_softmax_inplace_forward_(
20 | at::Tensor input,
21 | long long rows, int cols
22 | )
23 | {
24 | throw std::runtime_error("attn_softmax_inplace_forward_ not implemented on CPU");
25 | };
26 | void attn_softmax_inplace_backward_(
27 | at::Tensor output,
28 | at::Tensor d_ov,
29 | at::Tensor values,
30 | long long rows,
31 | int cols_output,
32 | int cols_values
33 | )
34 | {
35 | throw std::runtime_error("attn_softmax_inplace_backward_ not implemented on CPU");
36 | };
37 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/openfold_local/utils/geometry/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 GenBio AI.
2 | # Copyright 2024 ByteDance and/or its affiliates.
3 | # Copyright 2021 AlQuraishi Laboratory
4 | # Copyright 2021 DeepMind Technologies Limited
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 |
19 | from fold.openfold_local.utils.geometry import rigid_matrix_vector
20 | from fold.openfold_local.utils.geometry import rotation_matrix
21 | from fold.openfold_local.utils.geometry import vector
22 |
# Package-level aliases so callers can write e.g. `geometry.Rot3Array`
# instead of reaching into the individual submodules.
Rot3Array = rotation_matrix.Rot3Array
Rigid3Array = rigid_matrix_vector.Rigid3Array

# Names re-exported from the `vector` submodule.
Vec3Array = vector.Vec3Array
square_euclidean_distance = vector.square_euclidean_distance
euclidean_distance = vector.euclidean_distance
dihedral_angle = vector.dihedral_angle
dot = vector.dot
cross = vector.cross
32 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/cli/completions/commands.py:
--------------------------------------------------------------------------------
1 | from importlib.resources import files
2 |
3 | import click
4 |
5 |
@click.command()
@click.option(
    "-s",
    "--shell",
    type=click.Choice(["bash", "zsh", "fish"]),
    default="bash",
    help="Shell to generate completion script for",
)
def completion(shell: str) -> None:
    """Generate shell completion script and exit

    This command generates a shell completion script for the GenBio AIDO Structure Prediction CLI.
    The generated script should be added to your shell configuration file:

    \b
    - bash: ~/.bashrc
    - zsh: ~/.zshrc
    - fish: ~/.config/fish/completions/genbio-aidosp.fish

    After modifying the shell config, you need to start a new shell in order for the changes to be loaded.
    """
    # The docstring above is user-facing help text, so it is kept verbatim.
    # File name of the bundled completion script for each supported shell.
    script_by_shell = {
        "bash": "genbio-aidosp-complete.bash",
        "zsh": "genbio-aidosp-complete.zsh",
        "fish": "genbio-aidosp.fish",
    }

    completions = files("genbio.aidosp.cli.completions")
    script_name = script_by_shell.get(shell)
    if script_name is None:
        # Not reachable through the CLI (click.Choice rejects other values),
        # but guards direct invocation of the callback.
        raise click.BadParameter(f"Unsupported shell: {shell}")

    click.echo(completions.joinpath(script_name).read_text())
38 |
--------------------------------------------------------------------------------
/modelgenerator/huggingface_models/enformer_pytorch/config_enformer.py:
--------------------------------------------------------------------------------
1 | from transformers import PretrainedConfig
2 |
class EnformerConfig(PretrainedConfig):
    """Configuration object registered under the model type ``"enformer"``.

    Named constructor arguments are stored as attributes of the same name,
    with one exception: ``use_convnext`` is accepted but not stored. Any
    additional keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.
    """

    model_type = "enformer"

    def __init__(
        self,
        dim = 1536,
        depth = 11,
        heads = 8,
        output_heads = dict(human = 5313, mouse= 1643),
        target_length = 896,
        attn_dim_key = 64,
        dropout_rate = 0.4,
        attn_dropout = 0.05,
        pos_dropout = 0.01,
        use_checkpointing = False,
        use_convnext = False,
        num_downsamples = 7,    # genetic sequence is downsampled 2 ** 7 == 128x in default Enformer - can be changed for higher resolution
        dim_divisible_by = 128,
        use_tf_gamma = False,
        **kwargs,
    ):
        self.dim = dim
        self.depth = depth
        self.heads = heads
        # Store a shallow copy. The default dict in the signature is created
        # once and shared by every call, so keeping a reference to it would
        # let a mutation on one config leak into all later default configs.
        self.output_heads = dict(output_heads)
        self.target_length = target_length
        self.attn_dim_key = attn_dim_key
        self.dropout_rate = dropout_rate
        self.attn_dropout = attn_dropout
        self.pos_dropout = pos_dropout
        self.use_checkpointing = use_checkpointing
        self.num_downsamples = num_downsamples
        self.dim_divisible_by = dim_divisible_by
        self.use_tf_gamma = use_tf_gamma

        super().__init__(**kwargs)
--------------------------------------------------------------------------------
/experiments/AIDO.Cell/sctab_conversion.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import pandas as pd
4 | import anndata as ad
5 | import tiledbsoma.io
6 | from tqdm import tqdm
7 |
# Converts the scTab parquet splits into one TileDB-SOMA experiment per split.

# Change the root path to your downloaded location
sctab_root_path = "merlin_cxg_2023_05_15_sf-log1p/"
# Output path
soma_exp_output_path = "soma-exp-scTab/"

# exist_ok avoids the check-then-create race of isdir() followed by makedirs().
os.makedirs(soma_exp_output_path, exist_ok=True)

for split in ["train", "val", "test"]:
    split_dir = os.path.join(sctab_root_path, split)

    # Collect every parquet shard first and concatenate once: growing a
    # DataFrame with pd.concat inside the loop re-copies all previously loaded
    # rows on every iteration.
    frames = []
    for fname in tqdm(os.listdir(split_dir), desc=f"Loading {split} data files"):
        if not fname.endswith('.parquet'):
            continue
        # Read the parquet file into a pandas DataFrame
        frames.append(pd.read_parquet(os.path.join(split_dir, fname)))
    if not frames:
        # Fail with a clear message instead of a KeyError on df['X'] below.
        raise FileNotFoundError(f"No .parquet files found in {split_dir}")
    df = pd.concat(frames)

    print("Converting ...")
    # Create AnnData object with the data
    adata = ad.AnnData(np.array(list(df['X'])))
    adata.obs = df[['cell_type']]
    adata.var = pd.read_parquet(os.path.join(sctab_root_path, "var.parquet"))
    # Save the data object into a TileDB experiment folder
    tiledbsoma.io.from_anndata(
        experiment_uri=os.path.join(soma_exp_output_path, split),
        measurement_name="RNA",
        anndata=adata
    )
    print(f"Data conversion for split '{split}' is done!")
38 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/msa_retrieve/msar/utils/io_utils.py:
--------------------------------------------------------------------------------
1 |
2 | import ml_collections
3 | import yaml
4 | from genbio.aidosp.msa_retrieve.msar.utils.logger import Logger
5 |
6 | logger = Logger.logger
7 |
8 |
def load_yaml(yaml_path):
    """Parse a YAML file and wrap the result in an ``ml_collections.ConfigDict``.

    Args:
        yaml_path: Path of the YAML file to open.

    Returns:
        A ``ConfigDict`` built from the parsed document. Parsing uses the safe
        loader, so only plain YAML types are constructed.
    """
    with open(yaml_path) as stream:
        parsed = yaml.safe_load(stream)
    return ml_collections.ConfigDict(parsed)
15 |
16 |
def read_fasta(fasta_string: str):
    """Split the text of a FASTA file into sequences and their descriptions.

    A line starting with ``>`` opens a new record; its text after the ``>``
    becomes the description. Blank lines and lines starting with ``#`` are
    skipped. Every other line is appended to the current record's sequence,
    so multi-line sequences are joined.

    Arguments:
        fasta_string: The string contents of a FASTA file.

    Returns:
        A tuple ``(sequences, descriptions)`` of two equally long lists, in
        the order the records appear in the input.

    Raises:
        IndexError: If sequence data appears before the first ``>`` line.
    """
    sequences = []
    descriptions = []

    for raw_line in fasta_string.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue  # Nothing to record for blank or '#' lines.
        if line.startswith(">"):
            descriptions.append(line[1:])  # Drop the leading '>'.
            sequences.append("")
        else:
            # Extend the most recently opened record.
            sequences[-1] += line

    return sequences, descriptions
46 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/cli/completions/genbio-aidosp-complete.zsh:
--------------------------------------------------------------------------------
#compdef genbio-aidosp

# zsh completion for the genbio-aidosp command. The command itself is asked
# for candidates (via the _GENBIO_AIDOSP_COMPLETE environment variable) and
# its answer is handed to the zsh completion system.
_genbio_aidosp_completion() {
    local -a completions
    local -a completions_with_descriptions
    local -a response
    # Nothing to complete if zsh does not know a genbio-aidosp command.
    (( ! $+commands[genbio-aidosp] )) && return 1

    # Run the CLI with the current words and cursor index; its output is split
    # on newlines into the response array.
    response=("${(@f)$(env COMP_WORDS="${words[*]}" COMP_CWORD=$((CURRENT-1)) _GENBIO_AIDOSP_COMPLETE=zsh_complete genbio-aidosp)}")

    # The response is consumed three entries at a time: type, key, description.
    for type key descr in ${response}; do
        if [[ "$type" == "plain" ]]; then
            # A description of "_" means "no description".
            if [[ "$descr" == "_" ]]; then
                completions+=("$key")
            else
                completions_with_descriptions+=("$key":"$descr")
            fi
        elif [[ "$type" == "dir" ]]; then
            _path_files -/
        elif [[ "$type" == "file" ]]; then
            _path_files -f
        fi
    done

    if [ -n "$completions_with_descriptions" ]; then
        _describe -V unsorted completions_with_descriptions -U
    fi

    if [ -n "$completions" ]; then
        compadd -U -V unsorted -a completions
    fi
}

if [[ $zsh_eval_context[-1] == loadautofunc ]]; then
    # autoload from fpath, call function directly
    _genbio_aidosp_completion "$@"
else
    # eval/source/. command, register function for later
    compdef _genbio_aidosp_completion genbio-aidosp
fi
41 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Stage 1 ("build"): CUDA devel image used to create the conda environment
# and install every Python dependency.
FROM nvcr.io/nvidia/cuda:12.4.0-devel-ubuntu22.04 AS build
WORKDIR /workspace
# TODO: using conda just to get a Python binary is probably overkill
RUN apt update && apt install -y wget git
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda3.sh && \
    bash miniconda3.sh -b -u -p /opt/conda
RUN /opt/conda/bin/conda create -y -n finetune python=3.10
ENV PATH=/opt/conda/envs/finetune/bin:$PATH

# TODO: change to git clone when repos are public
COPY modelgenerator modelgenerator
COPY pyproject.toml .
COPY README.md .

RUN pip install --upgrade pip
RUN pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0
RUN pip install flash_attn==2.7.4.post1

## RNA and Protein inverse folding requirements
RUN pip install torch_geometric==2.6.1
# This line previously had no RUN instruction, so the build would try to
# parse "pip" as a Dockerfile instruction and fail.
RUN pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.6.0+cu124.html
RUN pip install biopython==1.84
RUN pip install MDAnalysis==2.8.0
RUN pip install biotite==1.0.1
RUN pip install OmegaConf

WORKDIR /workspace
RUN pip install -e .

# Stage 2: CUDA runtime image that receives only the finished conda
# environments and the package sources.
FROM nvcr.io/nvidia/cuda:12.4.0-runtime-ubuntu22.04
WORKDIR /workspace
COPY --from=build /opt/conda/envs /opt/conda/envs
ENV PATH=/opt/conda/envs/finetune/bin:$PATH
COPY modelgenerator modelgenerator
ENV MGEN_DATA_DIR=/mgen_data
RUN mkdir ${MGEN_DATA_DIR}
37 |
--------------------------------------------------------------------------------
/scripts/wandb_sweep/slurm_agent.sh:
--------------------------------------------------------------------------------
#!/bin/bash
################################
#        SLURM options         #
################################
# uncomment to run multiple agents in parallel.
# --array=1-X where X is number agents.
##SBATCH --array=1-X
#SBATCH --ntasks-per-node=1 # same as trainer.devices
#SBATCH --nodes=1 # same as trainer.num_nodes
#SBATCH --output=logs/R-%x.%j.out
#SBATCH --error=logs/R-%x.%j.err

################################
#   Python environment setup   #
################################
eval "$(~/miniconda3/bin/conda shell.bash hook)"
conda activate finetune

################################
#Required wandb sweep settings #
################################
export WANDB_PROJECT="autotune-test"
SWEEP_ID=""

################################
#   No change required below   #
################################
# Fail fast when the required sweep ID was left empty; otherwise the agent is
# started without a sweep and the failure is misreported further down.
if [[ -z "$SWEEP_ID" ]]; then
    echo "SWEEP_ID is empty: set it to the wandb sweep ID before submitting." >&2
    exit 1
fi

# Run a single wandb agent step (30 s timeout) and capture its two output
# streams separately: the command's stderr arrives first and is read into
# AGENT_DETAILS, its stdout is wrapped in NUL bytes by printf and read into
# AGENT_COMMAND.
{
    IFS=$'\n' read -r -d '' AGENT_DETAILS;
    IFS=$'\n' read -r -d '' AGENT_COMMAND;
} < <((printf '\0%s\0' "$(timeout 30 srun --ntasks=1 wandb agent --count 1 "$SWEEP_ID")" 1>&2) 2>&1)
# Extract the text between the last pair of square brackets and strip single
# quotes. $AGENT_DETAILS is intentionally left unquoted so that multi-line
# output is collapsed onto one line before sed sees it.
RUN_ID=$(echo $AGENT_DETAILS | sed -e "s/.*\[\([^]]*\)\].*/\1/g" -e "s/[\'\']//g")
if [[ -z "$RUN_ID" ]]; then
    echo wandb agent timed out. >&2
    exit 1
fi
# Reuse the wandb run ID as the logger version of the training run.
AGENT_COMMAND="${AGENT_COMMAND} --trainer.logger.version ${RUN_ID}"
echo Training command: $AGENT_COMMAND

wait
# Unquoted on purpose: the command string must be word-split into arguments.
srun $AGENT_COMMAND
42 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/openfold_local/utils/kernel/csrc/softmax_cuda.cpp:
--------------------------------------------------------------------------------
1 | // Copyright 2021 AlQuraishi Laboratory
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | // modified from fastfold/model/fastnn/kernel/cuda_native/csrc/softmax_cuda.cpp
16 |
17 | #include
18 |
19 | void attn_softmax_inplace_forward_(
20 | at::Tensor input,
21 | long long rows, int cols
22 | );
23 | void attn_softmax_inplace_backward_(
24 | at::Tensor output,
25 | at::Tensor d_ov,
26 | at::Tensor values,
27 | long long rows,
28 | int cols_output,
29 | int cols_values
30 | );
31 |
32 |
33 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
34 | m.def(
35 | "forward_",
36 | &attn_softmax_inplace_forward_,
37 | "Softmax forward (CUDA)"
38 | );
39 | m.def(
40 | "backward_",
41 | &attn_softmax_inplace_backward_,
42 | "Softmax backward (CUDA)"
43 | );
44 | }
45 |
--------------------------------------------------------------------------------
/modelgenerator/structure_tokenizer/layers/esmfold/categorical_mixture.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 | import torch
6 |
7 |
class CategoricalMixture:
    """Categorical distribution over evenly spaced bins on ``[start, end]``.

    The last dimension of ``param`` holds one unnormalized logit per bin; each
    bin is represented by its midpoint (``v_bins``).
    """

    def __init__(self, param, bins=50, start=0, end=1):
        """Store the logits and precompute the bin midpoints.

        Args:
            param: Logits tensor of shape ``(..., bins)``.
            bins: Number of bins the interval is divided into.
            start: Lower edge of the first bin.
            end: Upper edge of the last bin.
        """
        # All tensors are of shape ..., bins.
        self.logits = param
        edges = torch.linspace(
            start, end, bins + 1, device=self.logits.device, dtype=self.logits.dtype
        )
        self.v_bins = (edges[:-1] + edges[1:]) / 2

    def log_prob(self, true):
        """Return the log-probability of the bin nearest to each value in ``true``.

        Args:
            true: Tensor of shape ``(...)`` matching the leading dimensions of
                the logits.

        Returns:
            Tensor of shape ``(...)`` with the log-softmax of the logits taken
            at the bin whose midpoint is closest to ``true``.
        """
        # Shapes are:
        #     self.logits: ... x bins
        #     true       : ...
        # Prepend one singleton axis per dimension of `true` so the midpoints
        # broadcast against `true.unsqueeze(-1)`. The index must be a tuple:
        # indexing with a *list* of None is deprecated in PyTorch and is slated
        # to be interpreted as a tensor index instead.
        expand = (None,) * true.ndim
        distance = (true.unsqueeze(-1) - self.v_bins[expand]).abs()
        true_index = distance.argmin(-1)
        nll = self.logits.log_softmax(-1)
        return torch.take_along_dim(nll, true_index.unsqueeze(-1), dim=-1).squeeze(-1)

    def mean(self):
        """Return the expected bin midpoint under the softmax of the logits."""
        return (self.logits.softmax(-1) @ self.v_bins.unsqueeze(1)).squeeze(-1)
39 |
40 |
def categorical_lddt(logits, bins=50):
    """Return the mean of a ``CategoricalMixture`` built from ``logits``.

    Args:
        logits: Logits tensor whose last dimension has ``bins`` entries.
        bins: Number of bins of the mixture.
    """
    # Logits are ..., 37, bins.
    mixture = CategoricalMixture(logits, bins=bins)
    return mixture.mean()
44 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/utils/file_io.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 GenBio AI
2 | # Copyright 2024 ByteDance and/or its affiliates.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
17 | import json
18 | from pathlib import Path
19 | from typing import Any, Union
20 |
21 | from fold.utils.torch_utils import map_values_to_list
22 |
23 |
def save_json(data: dict, output_fpath: Union[str, Path], indent: int = 4):
    """
    Write a dictionary to a JSON file.

    A shallow copy of ``data`` is passed through ``map_values_to_list``
    (imported from ``fold.utils.torch_utils``) before it is serialized.

    Args:
        data (dict): The dictionary to be saved.
        output_fpath (Union[str, Path]): The output file path.
        indent (int, optional): The indentation level for the JSON file.
            ``None`` writes the JSON without an explicit indent. Defaults to 4.
    """
    serializable = map_values_to_list(data.copy())
    # Only forward `indent` when one was requested.
    dump_options = {} if indent is None else {"indent": indent}
    with open(output_fpath, "w") as f:
        json.dump(serializable, f, **dump_options)
40 |
--------------------------------------------------------------------------------
/experiments/AIDO.Tissue/emb.xenium.yaml:
--------------------------------------------------------------------------------
1 | seed_everything: 42
2 | data:
3 | class_path: modelgenerator.data.CellWithNeighborDataModule
4 | init_args:
5 | path: './downloads/'
6 | batch_size: 1
7 | train_split_files:
8 | - 'processed_fetal_lung_visium_xenium.xenium.convert.h5ad'
9 | valid_split_files:
10 | - 'processed_fetal_lung_visium_xenium.xenium.convert.h5ad'
11 | test_split_files:
12 | - 'processed_fetal_lung_visium_xenium.xenium.convert.h5ad'
13 | filter_columns:
14 | - 'cell_type'
15 | - 'x'
16 | - 'y'
17 | rename_columns:
18 | - 'labels'
19 | - 'x'
20 | - 'y'
21 | neighbor_num: 10
22 | num_workers: 4
23 | persistent_workers: True
24 | generate_uid: True
25 | model:
26 | class_path: modelgenerator.tasks.Embed
27 | init_args:
28 | backbone:
29 | class_path: modelgenerator.backbones.aido_tissue_3m
30 | init_args:
31 | from_scratch: False
32 | trainer:
33 | log_every_n_steps: 10
34 | precision: bf16
35 | devices: 1
36 | max_epochs: 10
37 | gradient_clip_val: 0
38 | profiler: null
39 | default_root_dir: './logs/emb.xenium'
40 | strategy:
41 | class_path: lightning.pytorch.strategies.DDPStrategy
42 | callbacks:
43 | class_path: modelgenerator.callbacks.PredictionWriter
44 | init_args:
45 | output_dir: './logs/emb.xenium/lightning_logs/pred_output'
46 | filetype: 'pt'
47 | write_cols:
48 | - 'predictions'
49 | - 'uid'
50 | return_predictions: True
51 | # TODO: Clean up parameter dependencies.
52 |
--------------------------------------------------------------------------------
/docs/docs/api_reference/trainer.md:
--------------------------------------------------------------------------------
1 | # Trainer
2 |
3 | AIDO.ModelGenerator uses the LightningCLI for configuring runs with the PyTorch Lightning Trainer.
4 | The entrypoint for the CLI is `mgen`, which can be used with the `fit`, `test`, `validate`, and `predict` commands and the `--model`, `--data`, and `--trainer` arguments and their sub-arguments.
5 | ```bash
6 | mgen fit --model ConditionalDiffusion --model.backbone aido_dna_300m \
7 | --data ConditionalDiffusionDataModule --data.path "genbio-ai/100m-random-promoters" \
8 | --trainer.max_epochs 1 --trainer.accelerator auto --trainer.devices auto
9 | ```
10 |
11 | For detailed information about the LightningCLI, see the [LightningCLI documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_advanced.html).
12 |
13 | ```yaml
14 | # Example Trainer Configuration
15 | trainer:
16 | accelerator: auto
17 | strategy: lightning.pytorch.strategies.DDPStrategy
18 | devices: auto
19 | num_nodes: 1
20 | precision: bf16-mixed
21 | logger: null
22 | callbacks:
23 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint
24 | init_args:
25 | filename: best_val:{step}-{val_loss:.3f}-{train_loss:.3f}
26 | monitor: val_loss
27 | save_top_k: 1
28 | fast_dev_run: false
29 | max_epochs: 100
30 | limit_val_batches: null
31 | val_check_interval: null
32 | check_val_every_n_epoch: 1
33 | log_every_n_steps: 50
34 | accumulate_grad_batches: 1
35 | gradient_clip_val: 1
36 | gradient_clip_algorithm: null
37 | detect_anomaly: false
38 | default_root_dir: logs
39 | model:
40 | ...
41 | data:
42 | ...
43 | ```
44 |
--------------------------------------------------------------------------------
/experiments/AIDO.DNA/sequence_classification/nt_promoter_all.yaml:
--------------------------------------------------------------------------------
1 | trainer:
2 | accelerator: auto
3 | devices: auto
4 | max_epochs: 30
5 | gradient_clip_val: 1
6 | default_root_dir: logs
7 | logger: false
8 | callbacks:
9 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint # save ckpt at the end of each epoch, and save the best val_mcc ckpt
10 | init_args:
11 | filename: epoch_{epoch}-val_mcc:{val_mcc:.3f}
12 | monitor: val_mcc
13 | mode: max
14 | - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping
15 | dict_kwargs:
16 | monitor: val_mcc
17 | mode: max
18 | patience: 30
19 | model:
20 | class_path: modelgenerator.tasks.SequenceClassification
21 | init_args:
22 | backbone:
23 | class_path: modelgenerator.backbones.aido_dna_7b
24 | init_args:
25 | use_peft: true
26 | lora_r: 16
27 | lora_alpha: 32
28 | lora_dropout: 0.1
29 | lora_target_modules:
30 | - query
31 | - value
32 | adapter: modelgenerator.adapters.LinearCLSAdapter
33 | n_classes: 2
34 | optimizer:
35 | class_path: torch.optim.AdamW
36 | init_args:
37 | lr: 0.0005
38 | weight_decay: 0.1
39 | lr_scheduler:
40 | class_path: modelgenerator.lr_schedulers.CosineWithWarmup
41 | init_args:
42 | warmup_ratio: 0.1
43 | data:
44 | class_path: modelgenerator.data.NTClassification
45 | init_args:
46 | config_name: promoter_all
47 | train_split_name: train
48 | test_split_name: test
49 | valid_split_size: 0.1
50 | batch_size: 4
51 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/scripts/download_colabfold_envdb.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Copyright 2021 AlQuraishi Laboratory
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | # Downloads and extracts the ColabFold environmental database (colabfold_envdb_202108).
18 | #
19 | # Usage: bash download_colabfold_envdb.sh /path/to/download/directory [max_connections]
set -e

# The download directory is a mandatory first argument.
if (( $# == 0 )); then
    echo "Error: download directory must be provided as an input argument."
    exit 1
fi

# aria2c performs the download; stop early if it is not available.
if ! command -v aria2c > /dev/null 2>&1; then
    echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
    exit 1
fi

SOURCE_URL="http://wwwuser.gwdg.de/~compbiol/colabfold/colabfold_envdb_202108.tar.gz"
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}"
# Optional second argument: connections per server for aria2c (default 4).
MAX_CONNECTIONS="${2:-4}"
# Archive file name, i.e. the last path component of the URL.
BASENAME="${SOURCE_URL##*/}"

mkdir -p "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" -x "${MAX_CONNECTIONS}" --check-certificate=false
# Unpack the archive next to where it was downloaded.
tar -x -v -f "${ROOT_DIR}/${BASENAME}" -C "${ROOT_DIR}"
42 |
--------------------------------------------------------------------------------
/experiments/AIDO.DNA/sequence_classification/gue_core_promoter_all.yaml:
--------------------------------------------------------------------------------
1 | trainer:
2 | accelerator: auto
3 | devices: auto
4 | max_epochs: 20
5 | gradient_clip_val: 1
6 | default_root_dir: logs
7 | logger: false
8 | callbacks:
9 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint # save ckpt at the end of each epoch, and save the best val_mcc ckpt
10 | init_args:
11 | dirpath: null
12 | filename: epoch_{epoch}-val_mcc:{val_mcc:.3f}
13 | monitor: val_mcc
14 | mode: max
15 | every_n_epochs: 1
16 | - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping
17 | dict_kwargs:
18 | monitor: val_mcc
19 | mode: max
20 | patience: 10
21 | model:
22 | class_path: modelgenerator.tasks.SequenceClassification
23 | init_args:
24 | backbone:
25 | class_path: modelgenerator.backbones.aido_dna_7b
26 | init_args:
27 | use_peft: true
28 | lora_r: 16
29 | lora_alpha: 32
30 | lora_dropout: 0.1
31 | lora_target_modules:
32 | - query
33 | - value
34 | n_classes: 2
35 | optimizer:
36 | class_path: torch.optim.AdamW
37 | init_args:
38 | lr: 0.0005
39 | weight_decay: 0.1
40 | adapter: modelgenerator.adapters.LinearCLSAdapter
41 | lr_scheduler:
42 | class_path: modelgenerator.lr_schedulers.CosineWithWarmup
43 | init_args:
44 | warmup_ratio: 0.1
45 | data:
46 | class_path: modelgenerator.data.GUEClassification
47 | init_args:
48 | config_name: prom_core_all
49 | train_split_name: train
50 | test_split_name: test
51 | batch_size: 4
52 |
--------------------------------------------------------------------------------
/experiments/AIDO.DNA/sequence_classification/nt_enhancers.yaml:
--------------------------------------------------------------------------------
1 | trainer:
2 | accelerator: auto
3 | devices: auto
4 | max_epochs: 30
5 | gradient_clip_val: 1
6 | default_root_dir: logs
7 | logger: false
8 | callbacks:
9 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint # save ckpt at the end of each epoch, and save the best val_mcc ckpt
10 | init_args:
11 | dirpath: null
12 | filename: epoch_{epoch}-val_mcc:{val_mcc:.3f}
13 | monitor: val_mcc
14 | mode: max
15 | every_n_epochs: 1
16 | - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping
17 | dict_kwargs:
18 | monitor: val_mcc
19 | mode: max
20 | patience: 30
21 | model:
22 | class_path: modelgenerator.tasks.SequenceClassification
23 | init_args:
24 | backbone:
25 | class_path: modelgenerator.backbones.aido_dna_7b
26 | init_args:
27 | use_peft: true
28 | lora_r: 16
29 | lora_alpha: 32
30 | lora_dropout: 0.1
31 | lora_target_modules:
32 | - query
33 | - value
34 | adapter: modelgenerator.adapters.LinearCLSAdapter
35 | n_classes: 2
36 | optimizer:
37 | class_path: torch.optim.AdamW
38 | init_args:
39 | lr: 0.001
40 | weight_decay: 0.1
41 | lr_scheduler:
42 | class_path: modelgenerator.lr_schedulers.CosineWithWarmup
43 | init_args:
44 | warmup_ratio: 0.1
45 | data:
46 | class_path: modelgenerator.data.NTClassification
47 | init_args:
48 | config_name: enhancers
49 | train_split_name: train
50 | test_split_name: test
51 | valid_split_size: 0.1
52 | batch_size: 8
53 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/utils/seed.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 GenBio AI
2 | # Copyright 2024 ByteDance and/or its affiliates.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | import os
17 | import random
18 |
19 | import numpy as np
20 | import torch
21 |
22 |
def seed_everything(seed, deterministic):
    """Seed the Python, NumPy and PyTorch RNGs, optionally forcing determinism.

    Args:
        seed: Seed handed to every random number generator.
        deterministic: When truthy, additionally configure cuDNN and PyTorch
            for deterministic execution and set ``CUBLAS_WORKSPACE_CONFIG``.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.random.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    if not deterministic:
        return

    torch.backends.cudnn.benchmark = False
    # torch.backends.cudnn.deterministic=True applies to CUDA convolution operations, and nothing else.
    torch.backends.cudnn.deterministic = True
    # torch.use_deterministic_algorithms(True) affects all the normally-nondeterministic operations listed here https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html?highlight=use_deterministic#torch.use_deterministic_algorithms
    torch.use_deterministic_algorithms(True)
    # https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
36 |
--------------------------------------------------------------------------------
/docs/docs/usage/embedding_caching.md:
--------------------------------------------------------------------------------
1 | # Embedding Caching (Experimental)
2 |
3 | AIDO.ModelGenerator provides seamless file-based persistent embedding caching for all freezable backbones (e.g. backbones with the option `frozen=True`). This feature aims to boost training speed and reduce overall resource consumption by skipping backbone forwarding and redundant data loading.
4 |
5 | ## Create and resume from cache
6 |
7 | Embedding caching is enabled by setting `--model.backbone.enable_cache true`. It works for all mgen subcommands including fit, validate, test and predict.
8 |
9 | ### Examples
10 | **Train a model and save cache at the same time**
11 | ```bash
12 | mgen fit --config my_config.yaml --model.backbone.enable_cache true --model.backbone.file_cache_dir my/cache/folder
13 | ```
14 | As training progresses, cached backbone output will be saved to disk and automatically used in future steps. For example, if your first epoch iterates through all the training data, cached embeddings will be utilized starting from the second epoch automatically.
15 |
16 | **Resume training from an existing cache**
17 | ```bash
18 | mgen fit --config my_config.yaml --model.backbone.enable_cache true --model.backbone.file_cache_dir my/cache/folder
19 | ```
20 | No change to the command is required; just make sure `--model.backbone.file_cache_dir` points to the right folder. Cached embeddings will be used from the first step.
21 |
22 | **Create cache without training the model**
23 | ```bash
24 | mgen predict --config my_config.yaml --model.backbone.enable_cache true --model.backbone.file_cache_dir my/cache/folder
25 | ```
26 | The best practice in this case is to use the `Embed` task, which is minimal and contains the backbone only.
27 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein/protein_inverse_folding/end2end_inference.sh:
--------------------------------------------------------------------------------
set -e

# MGEN_DATA_DIR is used below for the checkpoint and log paths. Abort early
# with a clear message if it is unset or empty, instead of silently resolving
# those paths relative to the filesystem root.
: "${MGEN_DATA_DIR:?MGEN_DATA_DIR must be set to the ModelGenerator data directory}"

DATA_DIR=data  ## Set the path of the directory where you want to keep/download the PDB/CIF file.

PDB_ID=5YH2
CHAIN_ID=A

# ### Download and merge the Protein-IF checkpoint
# mkdir -p ${MGEN_DATA_DIR}/modelgenerator/huggingface_models/protein_inv_fold/AIDO.ProteinIF-16B/
# ## Download chunks
# huggingface-cli download genbio-ai/AIDO.ProteinIF-16B \
#   --repo-type model \
#   --local-dir ${MGEN_DATA_DIR}/modelgenerator/huggingface_models/protein_inv_fold/AIDO.ProteinIF-16B/
# ## Merge chunks
# python merge_ckpt.py ${MGEN_DATA_DIR}/modelgenerator/huggingface_models/protein_inv_fold/AIDO.ProteinIF-16B/model_chunks ${MGEN_DATA_DIR}/modelgenerator/huggingface_models/protein_inv_fold/AIDO.ProteinIF-16B/model.ckpt

### Download a single structure from somewhere like PDB
mkdir -p "${DATA_DIR}/"
wget "https://files.rcsb.org/download/${PDB_ID}.cif" -P "${DATA_DIR}/"

### Put it into our format
python preprocess_PDB.py "${DATA_DIR}/${PDB_ID}.cif" "${CHAIN_ID}" "${DATA_DIR}/"

### Run inference to generate sequence
# export CUDA_VISIBLE_DEVICES=6,
mgen test \
  --config protein_inv_fold_test.yaml \
  --trainer.default_root_dir "${MGEN_DATA_DIR}/modelgenerator/logs/protein_inv_fold/" \
  --ckpt_path "${MGEN_DATA_DIR}/modelgenerator/huggingface_models/protein_inv_fold/AIDO.ProteinIF-16B/model.ckpt" \
  --trainer.devices 0, \
  --data.path "${DATA_DIR}/"

### The results will be saved under the folder "/experiments/AIDO.Protein/protein_inverse_folding/proteinIF_outputs" in a file named "results_acc_{recovery_accuracy}.txt".
34 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein/DMS/configs/indels_LP_DDP.yaml:
--------------------------------------------------------------------------------
1 | trainer:
2 | accelerator: auto
3 | devices: auto
4 | max_steps: 1000
5 | gradient_clip_val: 0.1
6 | default_root_dir: logs
7 | logger: false
8 | callbacks:
9 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint # save a ckpt every 500 training steps (every_n_train_steps) and always keep the last one (save_last)
10 | init_args:
11 | filename: step_{step}_valloss_{val_loss}
12 | save_last: true
13 | save_top_k: 1
14 | save_weights_only: false
15 | mode: min
16 | every_n_train_steps: 500
17 | model:
18 | class_path: modelgenerator.tasks.SequenceRegression
19 | init_args:
20 | backbone:
21 | class_path: modelgenerator.backbones.aido_protein_16b
22 | init_args:
23 | frozen: true
24 | max_length: 2048
25 | adapter:
26 | class_path: modelgenerator.adapters.MLPPoolAdapter
27 | init_args:
28 | hidden_sizes:
29 | - 128
30 | dropout: 0.1
31 | dropout_in_middle: false
32 | optimizer:
33 | class_path: torch.optim.AdamW
34 | init_args:
35 | lr: 0.001
36 | weight_decay: 0.01
37 | lr_scheduler:
38 | class_path: modelgenerator.lr_schedulers.ConstantWithWarmup
39 | init_args:
40 | warmup_ratio: 0.05
41 | data:
42 | class_path: modelgenerator.data.DMSFitnessPrediction
43 | init_args:
44 | path: genbio-ai/ProteinGYM-DMS
45 | train_split_files:
46 | - indels/B1LPA6_ECOSM_Russ_2020_indels.tsv
47 | train_split_name: train
48 | random_seed: 42
49 | batch_size: 32
50 | cv_num_folds: 5
51 | cv_test_fold_id: 0
52 | cv_enable_val_fold: false
53 | cv_fold_id_col: fold_id
54 | ckpt_path: null
55 |
--------------------------------------------------------------------------------
/experiments/AIDO.RNA/demo_mrna_vaccine/get_mean_embeddings.py:
--------------------------------------------------------------------------------
1 | # Takes a directory with *.pt files as argument
2 | # For each file, loads the embeddings and computes the mean along the sequence dimension
3 | # Compiles all mean embeddings as well as sequences into a single pt file
4 |
5 | import sys
6 | import os
7 | import torch
8 | import pandas as pd
9 | import argparse
10 | from tqdm import tqdm
11 |
def compile_mean_embeddings(directory):
    """Average per-token embeddings of every sequence found in ``directory``.

    Each ``*.pt`` file is expected to hold a dict with the keys
    ``predictions``, ``sequences``, ``attention_mask`` and
    ``special_tokens_mask``. For every sequence the embeddings of tokens that
    are attended to (mask == 1) and are not special tokens (mask == 0) are
    averaged over the token dimension. The result is written to
    ``<directory>/mean_embeddings.pt`` as a dict with the keys ``sequences``
    and ``mean_embeddings``.

    Args:
        directory: Path to the directory containing the ``*.pt`` files.
    """
    output_name = 'mean_embeddings.pt'
    all_sequences = []
    mean_embeddings = []
    # Sort for a reproducible order, and skip this function's own output file
    # so the script can be re-run on the same directory.
    files = sorted(
        name for name in os.listdir(directory)
        if name.endswith('.pt') and name != output_name
    )
    for file in tqdm(files):
        # map_location keeps loading working on machines without the device
        # the tensors were saved from.
        vals = torch.load(os.path.join(directory, file), map_location='cpu')
        embeddings = torch.as_tensor(vals['predictions']).cpu()
        sequences = vals['sequences']
        attention_masks = torch.as_tensor(vals['attention_mask']).cpu()
        special_tokens_mask = torch.as_tensor(vals['special_tokens_mask']).cpu()
        for i in range(len(embeddings)):
            # Explicit comparisons on both masks: `a == 1 & ~s` parses as
            # `a == (1 & ~s)`, which also selects padding positions that are
            # flagged as special tokens.
            keep = (attention_masks[i] == 1) & (special_tokens_mask[i] == 0)
            mean_embeddings.append(embeddings[i][keep].mean(dim=0))
            all_sequences.append(sequences[i])
    mean_embeddings = torch.stack(mean_embeddings)
    torch.save(
        {'sequences': all_sequences, 'mean_embeddings': mean_embeddings},
        os.path.join(directory, output_name),
    )
29 |
30 |
31 |
if __name__ == '__main__':
    # Command-line entry point: compile mean embeddings for one directory.
    parser = argparse.ArgumentParser(description='Compile mean embeddings')
    # Required: without it `args.directory` would be None and be passed on
    # as the directory to process.
    parser.add_argument(
        '--directory',
        type=str,
        required=True,
        help='Path to directory with *.pt files',
    )
    args = parser.parse_args()
    compile_mean_embeddings(args.directory)
37 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/openfold_local/data/tools/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 GenBio AI.
2 | # Copyright 2024 ByteDance and/or its affiliates.
3 | # Copyright 2021 AlQuraishi Laboratory
4 | # Copyright 2021 DeepMind Technologies Limited
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 |
19 | """Common utilities for data pipeline tools."""
20 | import contextlib
21 | import datetime
22 | import logging
23 | import shutil
24 | import tempfile
25 | import time
26 | from typing import Optional
27 |
28 |
29 | @contextlib.contextmanager
30 | def tmpdir_manager(base_dir: Optional[str] = None):
31 | """Context manager that deletes a temporary directory on exit."""
32 | tmpdir = tempfile.mkdtemp(dir=base_dir)
33 | try:
34 | yield tmpdir
35 | finally:
36 | shutil.rmtree(tmpdir, ignore_errors=True)
37 |
38 |
39 | @contextlib.contextmanager
40 | def timing(msg: str):
41 | logging.info("Started %s", msg)
42 | tic = time.perf_counter()
43 | yield
44 | toc = time.perf_counter()
45 | logging.info("Finished %s in %.3f seconds", msg, toc - tic)
46 |
47 |
def to_date(s: str):
    """Build a ``datetime`` from a string laid out as ``YYYY?MM?DD``.

    Only characters 0-3 (year), 5-6 (month) and 8-9 (day) are read; the
    separator characters and anything after position 10 are ignored.
    """
    year, month, day = (int(s[lo:hi]) for lo, hi in ((0, 4), (5, 7), (8, 10)))
    return datetime.datetime(year=year, month=month, day=day)
50 |
--------------------------------------------------------------------------------
/experiments/AIDO.RNA/rna_secondary_structure_prediction/ft_schedules/layers_0_32.yaml:
--------------------------------------------------------------------------------
1 | 0:
2 | - adapter.*
3 | 3:
4 | - backbone.encoder.encoder.ln.*
5 | - backbone.encoder.encoder.layer.32.*
6 | - backbone.encoder.encoder.layer.31.*
7 | - backbone.encoder.encoder.layer.30.*
8 | 6:
9 | - backbone.encoder.encoder.layer.29.*
10 | - backbone.encoder.encoder.layer.28.*
11 | - backbone.encoder.encoder.layer.27.*
12 | 9:
13 | - backbone.encoder.encoder.layer.26.*
14 | - backbone.encoder.encoder.layer.25.*
15 | - backbone.encoder.encoder.layer.24.*
16 | 12:
17 | - backbone.encoder.encoder.layer.23.*
18 | - backbone.encoder.encoder.layer.22.*
19 | - backbone.encoder.encoder.layer.21.*
20 | 15:
21 | - backbone.encoder.encoder.layer.20.*
22 | - backbone.encoder.encoder.layer.19.*
23 | - backbone.encoder.encoder.layer.18.*
24 | 18:
25 | - backbone.encoder.encoder.layer.17.*
26 | - backbone.encoder.encoder.layer.16.*
27 | - backbone.encoder.encoder.layer.15.*
28 | 21:
29 | - backbone.encoder.encoder.layer.14.*
30 | - backbone.encoder.encoder.layer.13.*
31 | - backbone.encoder.encoder.layer.12.*
32 | 24:
33 | - backbone.encoder.encoder.layer.11.*
34 | - backbone.encoder.encoder.layer.10.*
35 | - backbone.encoder.encoder.layer.9.*
36 | 27:
37 | - backbone.encoder.encoder.layer.8.*
38 | - backbone.encoder.encoder.layer.7.*
39 | - backbone.encoder.encoder.layer.6.*
40 | 30:
41 | - backbone.encoder.encoder.layer.5.*
42 | - backbone.encoder.encoder.layer.4.*
43 | - backbone.encoder.encoder.layer.3.*
44 | 33:
45 | - backbone.encoder.encoder.layer.2.*
46 | - backbone.encoder.encoder.layer.1.*
47 | - backbone.encoder.encoder.layer.0.*
48 | 36:
49 | - backbone.encoder.*
50 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/utils/geometry.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 GenBio AI
2 | # Copyright 2024 ByteDance and/or its affiliates.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
17 | import numpy as np
18 | from scipy.spatial.transform import Rotation
19 |
20 |
def angle_3p(a, b, c):
    """
    Calculate the angle defined by three points.

    The points may have any dimensionality, as long as all three agree.

    Args:
        a (list or array-like): The coordinates of the first point.
        b (list or array-like): The coordinates of the second point.
        c (list or array-like): The coordinates of the third point.

    Returns:
        float: The angle in degrees, within [0, 180], between the vector
            from point a to point b and the vector from point b to point c.
    """
    p_a, p_b, p_c = (np.asarray(point) for point in (a, b, c))

    first_leg = p_b - p_a
    second_leg = p_c - p_b

    # The 1e-4 added to the denominator avoids a division by zero when either
    # vector has zero length; clipping keeps the value inside arccos' domain.
    denominator = np.linalg.norm(first_leg) * np.linalg.norm(second_leg) + 1e-4
    cos_theta = np.clip(np.dot(first_leg, second_leg) / denominator, -1, 1)
    return np.degrees(np.arccos(cos_theta))
50 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein/DMS/configs/indels_LoRA_DDP.yaml:
--------------------------------------------------------------------------------
1 | trainer:
2 | accelerator: auto
3 | devices: auto
4 | max_steps: 10000
5 | gradient_clip_val: 0.1
6 | default_root_dir: logs
7 | logger: false
8 | callbacks:
9 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint # save ckpt at the end of each epoch, and save the best val_spearman ckpt
10 | init_args:
11 | filename: epoch_{epoch}-val_mcc:{val_spearman:.3f}
12 | monitor: val_spearman
13 | mode: max
14 | - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping
15 | dict_kwargs:
16 | monitor: val_spearman
17 | mode: max
18 | patience: 10
19 | model:
20 | class_path: modelgenerator.tasks.SequenceRegression
21 | init_args:
22 | backbone:
23 | class_path: modelgenerator.backbones.aido_protein_16b
24 | init_args:
25 | use_peft: true
26 | lora_dropout: 0.05
27 | max_length: 2048
28 | adapter:
29 | class_path: modelgenerator.adapters.MLPPoolAdapter
30 | init_args:
31 | hidden_sizes:
32 | - 128
33 | dropout: 0.1
34 | dropout_in_middle: false
35 | optimizer:
36 | class_path: torch.optim.AdamW
37 | init_args:
38 | lr: 0.0001
39 | weight_decay: 0.01
40 | lr_scheduler:
41 | class_path: modelgenerator.lr_schedulers.CosineWithWarmup
42 | init_args:
43 | warmup_ratio: 0.05
44 | data:
45 | class_path: modelgenerator.data.DMSFitnessPrediction
46 | init_args:
47 | path: genbio-ai/ProteinGYM-DMS
48 | train_split_files:
49 | - indels/B1LPA6_ECOSM_Russ_2020_indels.tsv
50 | train_split_name: train
51 | random_seed: 42
52 | batch_size: 32
53 | cv_num_folds: 5
54 | cv_test_fold_id: 0
55 | cv_enable_val_fold: true
56 | cv_fold_id_col: fold_id
57 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein/DMS/configs/substitution_LoRA_DDP.yaml:
--------------------------------------------------------------------------------
1 | trainer:
2 | accelerator: auto
3 | devices: auto
4 | logger: false
5 | callbacks:
6 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint # save ckpt at the end of each epoch, and save the best val_spearman ckpt
7 | init_args:
8 | filename: epoch_{epoch}-val_mcc:{val_spearman:.3f}
9 | monitor: val_spearman
10 | mode: max
11 | - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping
12 | dict_kwargs:
13 | monitor: val_spearman
14 | mode: max
15 | patience: 10
16 | max_steps: 10000
17 | gradient_clip_val: 0.1
18 | default_root_dir: logs
19 | model:
20 | class_path: modelgenerator.tasks.SequenceRegression
21 | init_args:
22 | backbone:
23 | class_path: modelgenerator.backbones.aido_protein_16b_v1
24 | init_args:
25 | use_peft: true
26 | max_length: 2048
27 | adapter:
28 | class_path: modelgenerator.adapters.MLPPoolAdapter
29 | init_args:
30 | hidden_sizes:
31 | - 128
32 | dropout: 0.1
33 | dropout_in_middle: false
34 | optimizer:
35 | class_path: torch.optim.AdamW
36 | init_args:
37 | lr: 0.0001
38 | weight_decay: 0.01
39 | lr_scheduler:
40 | class_path: modelgenerator.lr_schedulers.CosineWithWarmup
41 | init_args:
42 | warmup_ratio: 0.05
43 | data:
44 | class_path: modelgenerator.data.DMSFitnessPrediction
45 | init_args:
46 | path: genbio-ai/ProteinGYM-DMS
47 | train_split_files:
48 | - singles_substitutions/VRPI_BPT7_Tsuboyama_2023_2WNM.tsv
49 | train_split_name: 'train'
50 | random_seed: 42
51 | batch_size: 32
52 | cv_num_folds: 5
53 | cv_test_fold_id: 0
54 | cv_enable_val_fold: true
55 | cv_fold_id_col: fold_id
56 | ckpt_path: null
57 |
--------------------------------------------------------------------------------
/experiments/AIDO.DNA/sequence_classification/gue_splice_reconstruction.yaml:
--------------------------------------------------------------------------------
1 | trainer:
2 | accelerator: auto
3 | devices: auto
4 | max_epochs: 20
5 | gradient_clip_val: 1
6 | default_root_dir: logs
7 | logger: false
8 | callbacks:
9 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint # save ckpt at the end of each epoch, and save the best val_mcc ckpt
10 | init_args:
11 | dirpath: null
12 | filename: epoch_{epoch}-val_mcc:{val_mcc:.3f}
13 | monitor: val_mcc
14 | mode: max
15 | every_n_epochs: 1
16 | - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping
17 | dict_kwargs:
18 | monitor: val_mcc
19 | mode: max
20 | patience: 10
21 | model:
22 | class_path: modelgenerator.tasks.SequenceClassification
23 | init_args:
24 | backbone:
25 | class_path: modelgenerator.backbones.aido_dna_7b
26 | init_args:
27 | use_peft: true
28 | lora_r: 16
29 | lora_alpha: 32
30 | lora_dropout: 0.1
31 | lora_target_modules:
32 | - query
33 | - value
34 | adapter:
35 | class_path: modelgenerator.adapters.MLPPoolAdapter
36 | init_args:
37 | pooling: mean_pooling
38 | hidden_sizes:
39 | - 128
40 | bias: true
41 | dropout: 0.1
42 | n_classes: 3
43 | optimizer:
44 | class_path: torch.optim.AdamW
45 | init_args:
46 | lr: 0.0005
47 | weight_decay: 0.1
48 | lr_scheduler:
49 | class_path: modelgenerator.lr_schedulers.CosineWithWarmup
50 | init_args:
51 | warmup_ratio: 0.1
52 | data:
53 | class_path: modelgenerator.data.GUEClassification
54 | init_args:
55 | config_name: splice_reconstructed
56 | train_split_name: train
57 | test_split_name: test
58 | valid_split_size: 0.1
59 | batch_size: 4
60 |
--------------------------------------------------------------------------------
/modelgenerator/huggingface_models/scfoundation/pretrainmodels/transformer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 BioMap (Beijing) Intelligence Technology Limited
2 |
3 |
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 |
class pytorchTransformerModule(nn.Module):
    """Stack of ``nn.TransformerEncoderLayer`` blocks followed by a LayerNorm.

    Args:
        max_seq_len: Longest sequence length ``forward`` will accept.
        dim: Model width (``d_model`` of every encoder layer).
        depth: Number of encoder layers.
        heads: Number of attention heads per layer.
        ff_mult: The feed-forward width is ``dim * ff_mult``.
        norm_first: Forwarded to every ``nn.TransformerEncoderLayer``.
    """

    def __init__(self, max_seq_len, dim, depth, heads, ff_mult=4, norm_first=False):
        super().__init__()

        self.max_seq_len = max_seq_len
        self.depth = depth

        # Layers are kept in a ModuleList (not nn.TransformerEncoder) and
        # applied one by one in forward().
        self.transformer_encoder = nn.ModuleList(
            [
                nn.TransformerEncoderLayer(
                    d_model=dim,
                    nhead=heads,
                    dim_feedforward=dim * ff_mult,
                    batch_first=True,
                    norm_first=norm_first,
                )
                for _ in range(depth)
            ]
        )
        self.norm = nn.LayerNorm(dim)

    def forward(self, x, padding_mask):
        """Encode ``x`` with every layer in turn, then apply the final norm.

        Args:
            x: Input of shape [B, N, D] (the layers use ``batch_first=True``).
            padding_mask: Passed to each layer as ``src_key_padding_mask``.

        Returns:
            Tensor with the same shape as ``x``.

        Raises:
            AssertionError: If N exceeds ``max_seq_len``.
        """
        # Unpacking also enforces that x has exactly three dimensions.
        _, n, _ = x.shape
        assert n <= self.max_seq_len, f'sequence length {n} must be less than the max sequence length {self.max_seq_len}'

        hidden = x
        for layer in self.transformer_encoder:
            hidden = layer(hidden, src_key_padding_mask=padding_mask)

        return self.norm(hidden)
44 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/openfold_local/README.md:
--------------------------------------------------------------------------------
1 | ## openfold_local
2 |
3 | This is a copy of [openfold](https://github.com/aqlaboratory/openfold), commit id: [bb3f51](https://github.com/aqlaboratory/openfold/commit/bb3f51e5a2cf2d5e3b709fe8f7d7a083c870222e)
4 |
5 | OpenFold is a great piece of work, and Protenix reuses it when building models. However, a few modifications have been made for the Protenix project.
6 | We reuse Protenix's code.
7 |
8 | * In [protenix/openfold_local/model/primitives.py](model/primitives.py), we add a custom [`Layernorm`](../model/layer_norm/) implementation, which accelerates Protenix by about 30%-50% during different training stages
9 |
10 | If you use our work, please also cite Openfold:
11 |
12 | ```bibtex
13 | @article {Ahdritz2022.11.20.517210,
14 | author = {Ahdritz, Gustaf and Bouatta, Nazim and Floristean, Christina and Kadyan, Sachin and Xia, Qinghui and Gerecke, William and O{\textquoteright}Donnell, Timothy J and Berenberg, Daniel and Fisk, Ian and Zanichelli, Niccolò and Zhang, Bo and Nowaczynski, Arkadiusz and Wang, Bei and Stepniewska-Dziubinska, Marta M and Zhang, Shang and Ojewole, Adegoke and Guney, Murat Efe and Biderman, Stella and Watkins, Andrew M and Ra, Stephen and Lorenzo, Pablo Ribalta and Nivon, Lucas and Weitzner, Brian and Ban, Yih-En Andrew and Sorger, Peter K and Mostaque, Emad and Zhang, Zhao and Bonneau, Richard and AlQuraishi, Mohammed},
15 | title = {{O}pen{F}old: {R}etraining {A}lpha{F}old2 yields new insights into its learning mechanisms and capacity for generalization},
16 | elocation-id = {2022.11.20.517210},
17 | year = {2022},
18 | doi = {10.1101/2022.11.20.517210},
19 | publisher = {Cold Spring Harbor Laboratory},
20 | URL = {https://www.biorxiv.org/content/10.1101/2022.11.20.517210},
21 | eprint = {https://www.biorxiv.org/content/early/2022/11/22/2022.11.20.517210.full.pdf},
22 | journal = {bioRxiv}
23 | }
24 | ```
25 |
--------------------------------------------------------------------------------
/configs/defaults.yaml:
--------------------------------------------------------------------------------
1 | # Default fit config
2 | # If no other --config is specified with mgen fit, this is used
3 | # If another --config is specified, this is ignored
4 | # To continue using this with another --config, specify this as another config
5 | # e.g. mgen fit --config defaults.yaml --config my_custom_config.yaml
6 | # LightningCLI will override the defaults with the custom config while keeping the remaining defaults
7 | trainer:
8 | accelerator: auto
9 | devices: auto
10 | strategy: auto
11 | max_steps: -1
12 | max_epochs: -1
13 | precision: 32-true
14 | log_every_n_steps: 50
15 | default_root_dir: logs
16 | profiler:
17 | class_path: lightning.pytorch.profilers.PyTorchProfiler
18 | dict_kwargs:
19 | profile_memory: true
20 | callbacks:
21 | - class_path: lightning.pytorch.callbacks.LearningRateMonitor
22 | dict_kwargs:
23 | logging_interval: "step"
24 | # Save a checkpoint for min val loss
25 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint
26 | dict_kwargs:
27 | monitor: val_loss
28 | mode: min
29 | save_top_k: 1
30 | filename: "best_val:{step}-{val_loss:.3f}-{train_loss:.3f}"
31 | # Save a checkpoint for min train loss
32 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint
33 | dict_kwargs:
34 | monitor: train_loss
35 | mode: min
36 | save_top_k: 1
37 | filename: "best_train:{step}-{val_loss:.3f}-train:{train_loss:.3f}"
38 | # Save the latest step
39 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint
40 | dict_kwargs:
41 | filename: "last:{step}-{val_loss:.3f}-{train_loss:.3f}"
42 | # Save a checkpoint every 1000 steps
43 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint
44 | dict_kwargs:
45 | every_n_train_steps: 1000
46 | filename: "step:{step}-{val_loss:.3f}-{train_loss:.3f}"
47 | save_top_k: -1
48 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructureTokenizer/protein2structoken_16b.yaml:
--------------------------------------------------------------------------------
1 | # Jiayou Zhang
2 | # Usage: mgen predict --config experiments/AIDO.StructureTokenizer/protein2structoken_16b.yaml
3 | # The input amino acid sequences are specified in the test_split_files.
4 | # The model will predict the structure tokens of the input sequences.
5 | # The predictions will be saved in `callbacks.init_args.output_dir` using tsv format.
6 | seed_everything: 42
7 | trainer:
8 | accelerator: auto
9 | strategy:
10 | class_path: lightning.pytorch.strategies.DDPStrategy
11 | devices: auto
12 | num_nodes: 1
13 | precision: 32
14 | logger:
15 | class_path: lightning.pytorch.loggers.WandbLogger
16 | init_args:
17 | name: protein2structoken_16b
18 | save_dir: logs
19 | project: MGEN_AIDO.StructureTokenizer
20 | callbacks:
21 | - class_path: modelgenerator.callbacks.PredictionWriter
22 | init_args:
23 | output_dir: logs/protein2structoken_16b
24 | filetype: tsv
25 | write_cols: [uid, sequences, labels, predictions]
26 | drop_special_tokens: true
27 | argmax_predictions: true
28 | remove_duplicates: true
29 | model:
30 | class_path: modelgenerator.tasks.Inference
31 | init_args:
32 | backbone:
33 | class_path: modelgenerator.backbones.aido_protein2structoken_16b
34 | init_args:
35 | from_scratch: false
36 | max_length: 2048 # 512 is too short for some proteins. The first stage training of the model is done with 2048. The second stage is 1024.
37 | config_overwrites:
38 | hidden_dropout_prob: 0
39 | attention_probs_dropout_prob: 0
40 | use_legacy_adapter: true
41 | strict_loading: true
42 | data:
43 | class_path: modelgenerator.data.StructureTokenDataModule
44 | init_args:
45 | path: genbio-ai/casp14-casp15-cameo-test-proteins
46 | test_split_files: [casp14_csv/test.csv, casp15_csv/test.csv, cameo_csv/test.csv]
47 | batch_size: 1
48 | ckpt_path: null
49 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/scripts/download_uniref30.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Copyright 2021 AlQuraishi Laboratory
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | # Downloads and unzips the UniRef30 database.
18 | #
19 | # Usage: bash download_uniref30.sh /path/to/download/directory [max_connections]
20 | # Note that the downloaded uniref30 cannot be used by mmseqs2 directly. After downloading uniref30 from
21 | # http://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2103.tar.gz and unpacking the archive, use the following
22 | # commands to convert the data format:
23 | # mmseqs tsv2exprofiledb uniref30_2103 uniref30_2103_db
24 | # mmseqs createindex uniref30_2103_db tmp
# Stop at the first command that fails.
set -e

# The download directory (first positional argument) is mandatory.
if [[ $# -eq 0 ]]; then
    echo "Error: download directory must be provided as an input argument."
    exit 1
fi

# aria2c performs the download, so bail out early when it is not installed.
if ! command -v aria2c &> /dev/null ; then
    echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
    exit 1
fi

DOWNLOAD_DIR="$1"
# Everything is placed in a "uniref30" sub-directory of the download directory.
ROOT_DIR="${DOWNLOAD_DIR}/uniref30"
SOURCE_URL="http://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2103.tar.gz"
BASENAME=$(basename "${SOURCE_URL}")
# Optional second positional argument; falls back to 4 when omitted.
MAX_CONNECTIONS="${2:-4}"

mkdir -p "${ROOT_DIR}"
# MAX_CONNECTIONS is handed to aria2c's -x option.
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" -x "${MAX_CONNECTIONS}" --check-certificate=false
# Unpack next to the archive, then delete the archive itself.
tar --extract --verbose --file="${ROOT_DIR}/${BASENAME}" \
    --directory="${ROOT_DIR}"
rm "${ROOT_DIR}/${BASENAME}"
48 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/cli/predict.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import click
4 | from loguru import logger
5 |
6 |
# Every path option sets ``path_type=Path`` so the values click passes in are
# ``pathlib.Path`` objects, as the annotations on ``predict`` promise; without
# it click.Path yields plain strings.
@click.command()
@click.option(
    "-i",
    "--input-json",
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
    required=True,
    help="Input JSON file containing protein sequences",
)
@click.option(
    "-o",
    "--output-dir",
    type=click.Path(file_okay=False, dir_okay=True, path_type=Path),
    required=True,
    help="Directory to save the prediction results",
)
@click.option(
    "-m",
    "--model-path",
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
    required=True,
    help="Path to the model checkpoint",
)
@click.option(
    "--ccd-components",
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
    required=True,
    help="Path to the CCD components file",
)
@click.option(
    "--ccd-components-rdkit",
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
    required=True,
    help="Path to the CCD components RDKit molecules file",
)
@click.option(
    "--seed",
    type=int,
    default=1234,
    help="Random seed for reproducibility",
)
@click.option(
    "--device-ids",
    type=str,
    default="0",
    help="Comma-separated list of GPU device IDs to use",
)
@click.option(
    "--master-port",
    type=int,
    default=8803,
    help="Master port for distributed training",
)
def predict(
    input_json: Path,
    output_dir: Path,
    model_path: Path,
    ccd_components: Path,
    ccd_components_rdkit: Path,
    seed: int,
    device_ids: str,
    master_port: int,
) -> None:
    """Run protein structure prediction"""
    # The docstring above doubles as the command's --help text, so it is kept
    # short. For now the command only logs its input and output locations;
    # the remaining options are accepted but not yet consumed.
    logger.info(f"Running prediction with input {input_json}")
    logger.info(f"Output will be saved to {output_dir}")
    # TODO: Implement
73 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/configs/mmseqs.yaml:
--------------------------------------------------------------------------------
1 | setting:
2 | description: msa retrieve
3 | tools:
4 | mmseqs2:
5 | enable: false
6 | binary_path: mmseqs # after add binary path into $PATH.
7 | dbs: uniref30,envdb
8 | uniref30:
9 | # -s: controls how many similar k-mers should be produced during the seeding stage. This is the most important parameter for speed, a lower value is fast but less sensitive and a higher one is sensitive but slower. The default search is already sensitive
10 | # --db-load-mode 2: MMseqs2 can be forced to use the main memory database by using the parameter
11 | search: "--num-iterations 3 --db-load-mode 2 -s 8 -e 0.1 --max-seqs 10000 -a"
12 | expandaln: "--db-load-mode 2 --expansion-mode 0 -e inf --expand-filter-clusters 1 --max-seq-id 0.95"
13 | align: "--db-load-mode 2 -e 10 --max-accept 100000 --alt-ali 10 -a"
14 | filter: "--db-load-mode 2 --qid 0 --qsc 0.8 --diff 0 --max-seq-id 1.0 --filter-min-enable 100"
15 | result2msa: "--msa-format-mode 6 --db-load-mode 2 --filter-msa 1 --filter-min-enable 1000 --diff 3000 --qid 0.0,0.2,0.4,0.6,0.8,1.0 --qsc 0 --max-seq-id 0.95"
16 | envdb:
17 | search: "--num-iterations 3 --db-load-mode 2 -s 8 -e 0.1 --max-seqs 10000 -a"
18 | expandaln: "--db-load-mode 2 --expansion-mode 0 -e inf"
19 | align: "--db-load-mode 2 -e 10 --max-accept 100000 --alt-ali 10 -a"
20 | filter: "--db-load-mode 2 --qid 0 --qsc 0.8 --diff 0 --max-seq-id 1.0 --filter-min-enable 100"
21 | result2msa: "--msa-format-mode 6 --db-load-mode 2 --filter-msa 1 --filter-min-enable 1000 --diff 3000 --qid 0.0,0.2,0.4,0.6,0.8,1.0 --qsc 0 --max-seq-id 0.95"
22 | mmseqs2_api:
23 | enable: true
24 | use_env: true
25 | use_filter: true
26 | use_pairing: false
27 | pairing_strategy: greedy # greedy, complete
28 | data:
29 | uniref30:
30 | database_path: /localssd/data/uniref30_mmseqs/uniref30_2103_db
31 | envdb:
32 | database_path: /localssd/data/colabfold_envdb_202108/colabfold_envdb_202108_db
33 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to AIDO.ModelGenerator
2 | Thank you for considering contributing to AIDO.ModelGenerator!
3 |
4 | ## Merge Requests
5 | We welcome your merge requests (MRs).
6 | For minor fixes (e.g., documentation improvements), feel free to submit an MR directly.
7 | If you would like to implement a new feature or fix a bug, please make sure you (or someone else) have opened an appropriate [issue](https://github.com/genbio-ai/ModelGenerator/issues) first; in your MR, please mention the issue it addresses.
8 |
9 | ### Creating a Merge Request
10 | 1. [Fork](https://github.com/genbio-ai/ModelGenerator/forks) this repository.
11 | 2. Install locally with `pip install -e .[dev]`.
12 | 3. Make your code changes locally.
13 | 4. **Set up commit hooks:**
14 | Initialize [pre-commit](https://pre-commit.com/) hooks:
15 | ```bash
16 | pre-commit install
17 | ```
18 | This will automatically check formatting (Ruff with max line length 100), trailing whitespace, end-of-file, YAML syntax, and large files before each commit.
19 | 5. Run `pytest tests/` to test your code.
20 | 6. If dependencies changed, rebuild the lock file with `poetry lock`
21 | 7. Check that your code is properly documented by going into the `docs` directory and running `mkdocs serve` to build the documentation and view it in your browser.
22 | 8. Open an MR to merge your changes into the `main` branch.
23 |
24 |
25 | ## Issues
26 | We use GitHub issues to track bugs and feature requests.
27 | Before submitting an issue, please make sure:
28 |
29 | 1. You have read the README and documentation and your question is NOT addressed there.
30 | 2. You have done your best to ensure that your issue is NOT a duplicate of one of [the previous issues](https://github.com/genbio-ai/ModelGenerator/issues).
31 | 3. Your issue is either a bug (unexpected/undesirable behavior) or a feature request.
32 |
33 | ## License
34 | By contributing to AIDO.ModelGenerator, you agree that your contributions will be licensed
35 | under the LICENSE file in the root directory of the source tree.
36 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ai4s-cn-beijing.cr.volces.com/pytorch-mirror/pytorch:2.3.1-cuda12.1-cudnn8-devel
2 |
3 | ENV DEBIAN_FRONTEND=noninteractive
4 | ENV TZ=Asia/Shanghai
5 | RUN apt-get update && \
6 | apt-get install -y --no-install-recommends \
7 | wget \
8 | g++ \
9 | gcc \
10 | libc6-dev \
11 | libaio-dev \
12 | make zlib1g zlib1g-dev \
13 | git git-lfs expect zsh vim wget curl unzip zip cmake cmake-curses-gui libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev \
14 | net-tools \
15 | && apt-get clean \
16 | && rm -rf /var/lib/apt/lists/*
17 |
18 | RUN apt update && apt -y install postgresql
19 |
20 | RUN DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
21 | hmmer cmake cmake-curses-gui \
22 | && git clone --branch v3.3.0 https://github.com/soedinglab/hh-suite.git /tmp/hh-suite \
23 | && mkdir /tmp/hh-suite/build \
24 | && cd /tmp/hh-suite/build \
25 | && cmake -DCMAKE_INSTALL_PREFIX=/opt/hhsuite .. \
26 | && make -j 32 && make install \
27 | && ln -s /opt/hhsuite/bin/* /usr/bin \
28 | && cd - \
29 | && rm -rf /tmp/hh-suite
30 |
31 | RUN apt-get install -yq --no-install-recommends iproute2 curl
32 | # Add PIP Package
33 | RUN pip3 --no-cache-dir install \
34 | scipy \
35 | ml_collections \
36 | tqdm \
37 | pandas \
38 | dm-tree \
39 | rdkit
40 |
41 | # Add openfold dependency
42 | RUN pip3 --no-cache-dir install \
43 | biopython==1.83 \
44 | modelcif==0.7
45 |
46 | # Add datapipeline dependency
47 | RUN pip3 --no-cache-dir install \
48 | biotite==1.0.1 \
49 | scikit-learn \
50 | scikit-learn-extra \
51 | deepspeed==0.14.4 \
52 | protobuf==3.20.2 tos icecream ipdb wandb numpy==1.26.3 matplotlib==3.9.2 ipywidgets py3Dmol
53 |
54 | # For H20 compatibility
55 | RUN pip3 install --no-cache-dir nvidia-cublas-cu12==12.4.5.8 --no-deps
56 | RUN git clone -b v3.5.1 https://github.com/NVIDIA/cutlass.git /opt/cutlass
57 | ENV CUTLASS_PATH=/opt/cutlass
58 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/src/genbio/aidosp/msa_retrieve/configs/mmseqs.yaml:
--------------------------------------------------------------------------------
1 | setting:
2 | description: msa retrieve
3 | tools:
4 | mmseqs2:
5 | enable: false
6 | binary_path: mmseqs # after add binary path into $PATH.
7 | dbs: uniref30,envdb
8 | uniref30:
9 | # -s: controls how many similar k-mers should be produced during the seeding stage. This is the most important parameter for speed, a lower value is fast but less sensitive and a higher one is sensitive but slower. The default search is already sensitive
10 | # --db-load-mode 2: MMseqs2 can be forced to use the main memory database by using the parameter
11 | search: "--num-iterations 3 --db-load-mode 2 -s 8 -e 0.1 --max-seqs 10000 -a"
12 | expandaln: "--db-load-mode 2 --expansion-mode 0 -e inf --expand-filter-clusters 1 --max-seq-id 0.95"
13 | align: "--db-load-mode 2 -e 10 --max-accept 100000 --alt-ali 10 -a"
14 | filter: "--db-load-mode 2 --qid 0 --qsc 0.8 --diff 0 --max-seq-id 1.0 --filter-min-enable 100"
15 | result2msa: "--msa-format-mode 6 --db-load-mode 2 --filter-msa 1 --filter-min-enable 1000 --diff 3000 --qid 0.0,0.2,0.4,0.6,0.8,1.0 --qsc 0 --max-seq-id 0.95"
16 | envdb:
17 | search: "--num-iterations 3 --db-load-mode 2 -s 8 -e 0.1 --max-seqs 10000 -a"
18 | expandaln: "--db-load-mode 2 --expansion-mode 0 -e inf"
19 | align: "--db-load-mode 2 -e 10 --max-accept 100000 --alt-ali 10 -a"
20 | filter: "--db-load-mode 2 --qid 0 --qsc 0.8 --diff 0 --max-seq-id 1.0 --filter-min-enable 100"
21 | result2msa: "--msa-format-mode 6 --db-load-mode 2 --filter-msa 1 --filter-min-enable 1000 --diff 3000 --qid 0.0,0.2,0.4,0.6,0.8,1.0 --qsc 0 --max-seq-id 0.95"
22 | mmseqs2_api:
23 | enable: true
24 | use_env: true
25 | use_filter: true
26 | use_pairing: false
27 | pairing_strategy: greedy # greedy, complete
28 | data:
29 | uniref30:
30 | database_path: /localssd/data/uniref30_mmseqs/uniref30_2103_db
31 | envdb:
32 | database_path: /localssd/data/colabfold_envdb_202108/colabfold_envdb_202108_db
33 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/scripts/run_inference.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Run structure-prediction inference on the bundled example input with torchrun.
#
# Every setting below can be overridden from the environment; the value shown
# is the default used when the variable is unset or empty:
#   LAYERNORM_TYPE=fast_layernorm
#   USE_DEEPSPEED_EVO_ATTTENTION=true
#   device_ids=0,1,2,3        comma-separated GPU ids (one worker per id)
#   master_port=8803
#   seed=1234
#   CHECKPOINT_PATH=/nfs/model
#   CCD_COMPONENTS=/nfs/ccd
#   CCD_COMPONENTS_RDKIT=/nfs/ccd
# Set DEBUG=1 to trace every command.
set -euo pipefail
IFS=$'\n\t'

[[ "${DEBUG:-}" == "1" ]] && set -x

: "${LAYERNORM_TYPE:=fast_layernorm}"
: "${USE_DEEPSPEED_EVO_ATTTENTION:=true}"
: "${device_ids:=0,1,2,3}"
: "${master_port:=8803}"
: "${seed:=1234}"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="${SCRIPT_DIR}/.."

# Defaults instead of unconditional assignments, so values exported by the
# caller are honoured rather than silently overwritten.
: "${CHECKPOINT_PATH:=/nfs/model}"
: "${CCD_COMPONENTS:=/nfs/ccd}"
: "${CCD_COMPONENTS_RDKIT:=/nfs/ccd}"

echo "${PROJECT_DIR}"

yaml_file_path="${PROJECT_DIR}/configs/inference_v0.1.yaml"
checkpoint_path="${CHECKPOINT_PATH}/fold49-v0.1.2.pt"
ccd_components_file="${CCD_COMPONENTS}/components.v20240608.cif"
ccd_components_rdkit_mol_file="${CCD_COMPONENTS_RDKIT}/components.v20240608.cif.rdkit_mol.pkl"
input_json_path="${PROJECT_DIR}/examples/example_built.json"
output_dir="./outputs/example-${seed}"

# Fail early, with the offending path, when any required input is missing.
for f in "${yaml_file_path}" "${checkpoint_path}" "${ccd_components_file}" "${ccd_components_rdkit_mol_file}" "${input_json_path}"; do
    [[ -f "$f" ]] || { echo "Missing required file: $f" >&2; exit 1; }
done

mkdir -p "${output_dir}"

export LAYERNORM_TYPE
export USE_DEEPSPEED_EVO_ATTTENTION

# Start one worker per GPU listed in device_ids, so the worker count always
# matches CUDA_VISIBLE_DEVICES (the default list still yields 4 workers).
IFS=',' read -r -a gpu_ids <<< "${device_ids}"
nproc_per_node="${#gpu_ids[@]}"

CUDA_VISIBLE_DEVICES="${device_ids}" \
OMP_NUM_THREADS=1 \
torchrun --nnodes=1 --nproc_per_node="${nproc_per_node}" --master_port="${master_port}" \
    "${PROJECT_DIR}/runner/inference.py" \
    --yaml_file_path="${yaml_file_path}" \
    --checkpoint_path="${checkpoint_path}" \
    --ccd_components_file="${ccd_components_file}" \
    --ccd_components_rdkit_mol_file="${ccd_components_rdkit_mol_file}" \
    --seeds="${seed}" \
    --dump_dir="${output_dir}" \
    --input_json_path="${input_json_path}"
52 |
--------------------------------------------------------------------------------
/experiments/AIDO.Protein/DMS/configs/substitution_LoRA_FSDP.yaml:
--------------------------------------------------------------------------------
1 | trainer:
2 | accelerator: auto
3 | devices: auto
4 | max_steps: 10000
5 | gradient_clip_val: 0.1
6 | gradient_clip_algorithm: value
7 | default_root_dir: logs
8 | strategy:
9 | class_path: lightning.pytorch.strategies.FSDPStrategy
10 | init_args:
11 | auto_wrap_policy: modelgenerator.distributed.fsdp.wrap.AutoWrapPolicy
12 | sharding_strategy: HYBRID_SHARD
13 | logger: false
14 | callbacks:
15 | - class_path: lightning.pytorch.callbacks.ModelCheckpoint # save ckpt at the end of each epoch, and save the best val_spearman ckpt
16 | init_args:
17 | dirpath: null
18 | filename: epoch_{epoch}-val_mcc:{val_spearman:.3f}
19 | monitor: val_spearman
20 | mode: max
21 | - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping
22 | dict_kwargs:
23 | monitor: val_spearman
24 | mode: max
25 | patience: 10
26 | model:
27 | class_path: modelgenerator.tasks.SequenceRegression
28 | init_args:
29 | backbone:
30 | class_path: modelgenerator.backbones.aido_protein_16b_v1
31 | init_args:
32 | use_peft: true
33 | max_length: 2048
34 | adapter:
35 | class_path: modelgenerator.adapters.MLPPoolAdapter
36 | init_args:
37 | hidden_sizes:
38 | - 128
39 | dropout: 0.1
40 | dropout_in_middle: false
41 | optimizer:
42 | class_path: torch.optim.AdamW
43 | init_args:
44 | lr: 0.0001
45 | weight_decay: 0.01
46 | lr_scheduler:
47 | class_path: modelgenerator.lr_schedulers.CosineWithWarmup
48 | init_args:
49 | warmup_ratio: 0.05
50 | data:
51 | class_path: modelgenerator.data.DMSFitnessPrediction
52 | init_args:
53 | path: genbio-ai/ProteinGYM-DMS
54 | train_split_files:
55 | - singles_substitutions/VRPI_BPT7_Tsuboyama_2023_2WNM.tsv
56 | train_split_name: train
57 | random_seed: 42
58 | batch_size: 32
59 | cv_num_folds: 5
60 | cv_test_fold_id: 0
61 | cv_enable_val_fold: true
62 | cv_fold_id_col: fold_id
63 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/model/layer_norm/torch_ext_compile.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 GenBio AI
2 | # Copyright 2024 ByteDance and/or its affiliates.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
17 | import os
18 |
19 | from torch.utils.cpp_extension import load
20 |
21 |
def compile(name, sources, extra_include_paths, build_directory):
    """Build and load an extension through ``torch.utils.cpp_extension.load``.

    Sets the ``TORCH_CUDA_ARCH_LIST`` environment variable to ``"7.0;8.0"``
    as a side effect, then forwards the arguments together with a fixed set
    of host and CUDA compiler flags.

    Args:
        name: Passed through as the extension ``name``.
        sources: Passed through as ``sources``.
        extra_include_paths: Passed through as ``extra_include_paths``.
        build_directory: Passed through as ``build_directory``.

    Returns:
        Whatever ``load`` returns for the built extension.
    """
    # Preprocessor defines shared by the host and CUDA compilers.
    version_defines = [
        "-DVERSION_GE_1_1",
        "-DVERSION_GE_1_3",
        "-DVERSION_GE_1_5",
    ]
    host_flags = ["-O3", *version_defines]

    # One -gencode pair per explicitly targeted architecture.
    gencode_flags = [
        "-gencode",
        "arch=compute_70,code=sm_70",
        "-gencode",
        "arch=compute_80,code=sm_80",
        "-gencode",
        "arch=compute_86,code=sm_86",
        "-gencode",
        "arch=compute_90,code=sm_90",
    ]
    nvcc_flags = [
        "-O3",
        "--use_fast_math",
        *version_defines,
        "-std=c++17",
        "-maxrregcount=50",
        "-U__CUDA_NO_HALF_OPERATORS__",
        "-U__CUDA_NO_HALF_CONVERSIONS__",
        "--expt-relaxed-constexpr",
        "--expt-extended-lambda",
        *gencode_flags,
    ]

    os.environ["TORCH_CUDA_ARCH_LIST"] = "7.0;8.0"
    return load(
        name=name,
        sources=sources,
        extra_include_paths=extra_include_paths,
        extra_cflags=host_flags,
        extra_cuda_cflags=nvcc_flags,
        verbose=True,
        build_directory=build_directory,
    )
58 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/model/modules/head.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 GenBio AI
2 | # Copyright 2024 ByteDance and/or its affiliates.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
17 | import torch
18 | import torch.nn as nn
19 | from torch.nn import Linear
20 |
21 |
22 | # Adapted From openfold.model.heads
class DistogramHead(nn.Module):
    """Distogram head (Algorithm 1, line 17 in AF3; adapted from OpenFold).

    Projects the pair embedding to per-bin logits and symmetrizes them over
    the two token axes. The result feeds the distogram loss
    (AF2 supplement, subsection 1.9.8). No softmax is applied here.
    """

    def __init__(self, c_z: int = 128, no_bins: int = 64) -> None:
        """
        Args:
            c_z (int, optional): channel size of the pair embedding. Defaults to 128.
            no_bins (int, optional): number of distogram bins. Defaults to 64.
        """
        super().__init__()

        self.c_z = c_z
        self.no_bins = no_bins

        # Single projection from pair channels to distogram bins.
        self.linear = Linear(in_features=c_z, out_features=no_bins)

    def forward(self, z: torch.Tensor) -> torch.Tensor:
        """
        Args:
            z (torch.Tensor): pair embedding
                [*, N_token, N_token, C_z]

        Returns:
            torch.Tensor: symmetrized distogram logits
                [*, N_token, N_token, no_bins]
        """
        pair_logits = self.linear(z)  # [*, N, N, no_bins]
        # Add the transpose over the two token axes so entry (i, j) equals (j, i).
        return pair_logits + pair_logits.transpose(-2, -3)
56 |
--------------------------------------------------------------------------------
/modelgenerator/huggingface_models/genbio/modeling_genbio.py:
--------------------------------------------------------------------------------
1 | import warnings
2 |
3 | from lightning.pytorch import LightningModule
4 | from lightning.pytorch.core.saving import _load_state
5 | from transformers import PreTrainedModel, PretrainedConfig
6 |
7 |
class GenBioConfig(PretrainedConfig):
    """Hugging Face config that stores the hyperparameters of a GenBio model."""

    model_type = "genbio"

    def __init__(self, hparams=None, **kwargs):
        """Store ``hparams`` and forward all other keyword arguments to the base config.

        Args:
            hparams: Hyperparameters of the wrapped model (``None`` by default).
            **kwargs: Passed unchanged to ``PretrainedConfig.__init__``.
        """
        # Set before the base initializer runs, keeping the original ordering.
        self.hparams = hparams
        super().__init__(**kwargs)
14 |
15 |
class GenBioModel(PreTrainedModel):
    """Hugging Face wrapper that holds a GenBio Lightning model in ``genbio_model``."""

    config_class = GenBioConfig

    def __init__(self, config: GenBioConfig, genbio_model=None, **kwargs):
        """Wrap ``genbio_model`` if given, otherwise build one from ``config.hparams``.

        Args:
            config: Config whose ``hparams`` include a ``"_class_path"`` entry
                naming the model class to instantiate.
            genbio_model: Already-constructed model to wrap; when provided,
                nothing is instantiated from the hyperparameters.
            **kwargs: Forwarded to ``PreTrainedModel.__init__``.
        """
        super().__init__(config, **kwargs)
        if genbio_model is None:
            genbio_model = self._build_from_hparams(config.hparams)
        self.genbio_model = genbio_model

    @staticmethod
    def _build_from_hparams(hparams):
        """Instantiate the class named by ``hparams["_class_path"]`` with an empty state dict."""
        cls_path = hparams["_class_path"]
        module_path, name = cls_path.rsplit(".", 1)
        module = __import__(module_path, fromlist=[name])
        genbio_cls = getattr(module, name)
        checkpoint = {
            LightningModule.CHECKPOINT_HYPER_PARAMS_KEY: hparams,
            "state_dict": {},
        }
        # TODO: _load_state is a private function and it warns about the empty
        #   state_dict. We need a function to initialize our model; this is
        #   the only choice we have for now.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", "Found keys that are*")
            return _load_state(genbio_cls, checkpoint, strict_loading=False)

    @classmethod
    def from_genbio_model(cls, model: LightningModule):
        """Wrap an existing model, building the config from ``model.hparams``."""
        config = GenBioConfig(hparams=model.hparams)
        return cls(config, genbio_model=model)

    def forward(self, *args, **kwargs):
        """Delegate the call to the wrapped ``genbio_model``."""
        return self.genbio_model(*args, **kwargs)
48 |
49 |
50 | GenBioModel.register_for_auto_class("AutoModel")
51 | GenBioConfig.register_for_auto_class("AutoConfig")
52 |
--------------------------------------------------------------------------------
/modelgenerator/structure_tokenizer/utils/init_params.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 |
3 | from typing import Union
4 |
5 |
def init_linear_xavier_(linear: Union[nn.Linear, nn.Embedding, None]):
    """Xavier-uniform (gain 1) init of ``linear.weight``; zero the bias if present.

    ``None`` is passed through untouched. The layer is modified in place and
    returned.
    """
    if linear is not None:
        nn.init.xavier_uniform_(linear.weight, gain=1)
        # Layers without a ``bias`` attribute, or with bias set to None, are skipped.
        bias = getattr(linear, "bias", None)
        if bias is not None:
            nn.init.zeros_(bias)
    return linear
13 |
14 |
def init_linear_zero_(linear: Union[nn.Linear, nn.Embedding, None], eps: float = 1e-6):
    """Near-zero init: weights drawn from N(0, eps), bias (if any) set to zero.

    ``None`` is passed through untouched. The layer is modified in place and
    returned.
    """
    if linear is not None:
        nn.init.normal_(linear.weight, 0, eps)
        # Layers without a ``bias`` attribute, or with bias set to None, are skipped.
        bias = getattr(linear, "bias", None)
        if bias is not None:
            nn.init.zeros_(bias)
    return linear
22 |
23 |
def init_linear_(
    linear: Union[nn.Linear, nn.Embedding, None],
    init_type: str = "xavier",
    eps: float = 1e-6,
):
    """Initialize ``linear`` with the scheme named by ``init_type``.

    Args:
        linear: Layer to initialize; handed to the selected initializer as is.
        init_type: ``"xavier"`` or ``"zero"``.
        eps: Forwarded to the ``"zero"`` initializer only.

    Returns:
        Whatever the selected initializer returns.

    Raises:
        ValueError: If ``init_type`` is neither ``"xavier"`` nor ``"zero"``.
    """
    if init_type == "xavier":
        return init_linear_xavier_(linear)
    if init_type == "zero":
        return init_linear_zero_(linear, eps=eps)
    raise ValueError(f"Unknown init_type {init_type}")
35 |
36 |
def init_layer_norm_(layer_norm: Union[nn.LayerNorm, None]):
    """Reset a LayerNorm to the identity affine transform (weight 1, bias 0).

    Args:
        layer_norm: Layer to reset in place, or ``None``.

    Returns:
        The same ``layer_norm`` object (``None`` is passed through).
    """
    if layer_norm is None:
        return layer_norm
    # ``weight``/``bias`` are None when the layer is built with
    # ``elementwise_affine=False`` (or ``bias=False``); skip them instead of
    # crashing, mirroring the bias guard used by the linear initializers.
    if layer_norm.weight is not None:
        nn.init.ones_(layer_norm.weight)
    if layer_norm.bias is not None:
        nn.init.zeros_(layer_norm.bias)
    return layer_norm
43 |
44 |
def init_params_recursively_(module: nn.Module):
    """Re-initialize the direct children of ``module``, descending into containers.

    For each child, the first matching rule applies:
      1. it defines ``reset_parameters`` -> that method is called;
      2. ``nn.Linear`` / ``nn.Embedding`` -> ``init_linear_``;
      3. ``nn.LayerNorm`` -> ``init_layer_norm_``;
      4. ``nn.Dropout`` -> nothing to do;
      5. anything else -> recurse into it, printing (not raising) any failure.

    NOTE(review): the stock ``nn.Linear``, ``nn.LayerNorm`` and ``nn.Embedding``
    all define ``reset_parameters``, so rule 1 catches them before rules 2-3.
    A child matched by rule 1 is not descended into.
    """
    for name, child in module.named_children():
        if hasattr(child, "reset_parameters"):
            child.reset_parameters()
            continue
        if isinstance(child, nn.Linear):
            init_linear_(child)
            continue
        if isinstance(child, nn.LayerNorm):
            init_layer_norm_(child)
            continue
        if isinstance(child, nn.Embedding):
            init_linear_(child)
            continue
        if isinstance(child, nn.Dropout):
            continue
        # Best effort: one failing sub-tree must not abort the remaining children.
        try:
            init_params_recursively_(child)
        except Exception as e:
            print(f"Failed to init {name} with {e}")
62 |
--------------------------------------------------------------------------------
/docs/docs/experiment_design/backbones.md:
--------------------------------------------------------------------------------
1 | # Adding Backbones
2 |
3 | Backbones are pre-trained foundation models.
4 |
5 | Foundation models are essential to modern ML but are often difficult to work with.
6 | Design decisions made during pre-training (tokenization, architecture, io format) cannot be changed.
7 | At best, this results in many reimplementations for benchmarking or finetuning tasks, and a high risk of buggy code.
8 | At worst, these decisions can lock in users and exclude certain tasks and use-cases.
9 |
10 | AIDO.ModelGenerator eliminates the need for reimplementation and makes backbones task-agnostic: wrap your backbone in a standard interface, and reuse it across all inference and finetuning tasks.
11 | It also makes compatibility transparent: if a backbone fits the required interface, it can be used for any data-appropriate task.
12 |
13 | > Note: Backbones for 1D sequence modeling are universally supported. Other types of backbones included in AIDO.ModelGenerator (e.g. structure, image) are not yet universally supported, but will be in the future.
14 |
15 | Available Backbones:
16 |
17 | - DNA: `aido_dna_7b`, `aido_dna_300m`, `aido_dna_dummy`, `aido_dna_debug`, `dna_onehot`
18 | - RNA: `aido_rna_1b600m`, `aido_rna_1b600m_cds`, `aido_rna_650m`, `aido_rna_650m_cds`, `aido_rna_300m_mars`, `aido_rna_25m_mars`, `aido_rna_1m_mars`, `aido_dna_dummy`, `aido_dna_debug`, `dna_onehot`
19 | - Protein: `aido_protein_16b`, `aido_protein_16b_v1`, `aido_protein2structoken_16b`, `aido_protein_debug`, `protein_onehot`, `aido_protein_rag_16b`, `aido_protein_rag_3b`
20 | - Cell (gene expression): `aido_cell_100m`, `aido_cell_10m`, `aido_cell_3m`, `geneformer`
21 | - OneHot: dummy model, only tokenizes, useful for non-FM baselines and quick tests
22 |
23 | At their core, backbones are PyTorch `nn.Module` objects with a few extra interfaces.
24 | To implement a new backbone, subclass a backbone interface and implement the required methods.
25 |
26 | ::: modelgenerator.backbones.SequenceBackboneInterface
27 | handler: python
28 | options:
29 | filters:
30 | - "!^__"
31 | show_root_heading: true
32 | show_source: true
33 |
--------------------------------------------------------------------------------
/experiments/AIDO.RNA/mean_ribosome_load_prediction/README.md:
--------------------------------------------------------------------------------
1 | # Mean Ribosome Load Prediction
2 | Ribosomes are cellular structures responsible for protein synthesis, and the ribosome load on an mRNA molecule can influence the rate and efficiency of protein production, and the success of genetic engineering. Predicting ribosome load can provide valuable insights into gene expression regulation, translation efficiency, and cellular processes. We fully finetune AIDO.RNA-1.6B for mean ribosome load prediction using the dataset by [Sample _et al._](https://www.nature.com/articles/s41587-019-0164-5). We use the same train, test, and validation split used in a previous study [RiNALMo](https://arxiv.org/abs/2403.00043). See the [config file](https://github.com/genbio-ai/ModelGenerator/tree/main/experiments/AIDO.RNA/mean_ribosome_load_prediction/mean_ribosome_load_prediction.yaml) for detailed hyperparameter settings.
3 |
4 | #### Finetuning script
5 | ```shell
6 | RUN_NAME=rna_mrl
7 | CKPT_SAVE_DIR=logs/${RUN_NAME}
8 | mgen fit --config experiments/AIDO.RNA/mean_ribosome_load_prediction/mean_ribosome_load_prediction.yaml \
9 | --trainer.default_root_dir ${CKPT_SAVE_DIR} \
10 | --trainer.callbacks.ft_schedule_path experiments/AIDO.RNA/mean_ribosome_load_prediction/ft_schedules/two_step.yaml \
11 | --trainer.devices 0,
12 | ```
13 |
14 | Note that this run uses the finetuning scheduler. See [this tutorial](https://github.com/genbio-ai/ModelGenerator/blob/main/docs/docs/tutorials/finetuning_scheduler.md) for details.
15 |
16 | #### Evaluation script
17 | ```shell
18 | RUN_NAME=rna_mrl
19 | CKPT_SAVE_DIR=logs/${RUN_NAME}
20 | CKPT_PATH=/path/to/checkpoint ## NOTE: Replace `/path/to/checkpoint` with the actual finetuned checkpoint path.
21 | mgen test --config experiments/AIDO.RNA/mean_ribosome_load_prediction/mean_ribosome_load_prediction.yaml \
22 | --trainer.default_root_dir ${CKPT_SAVE_DIR}/test \
23 | --trainer.callbacks.ft_schedule_path experiments/AIDO.RNA/mean_ribosome_load_prediction/ft_schedules/two_step.yaml \
24 | --trainer.devices 0, \
25 | --ckpt_path ${CKPT_PATH}
26 | ```
27 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructureTokenizer/extract_structure_tokenizer_codebook.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from modelgenerator.structure_tokenizer.models.structure_tokenizer import StructureTokenizerModel
3 | import argparse
4 |
5 |
def main():
    """Command-line entry point: save a StructureTokenizerModel codebook to disk.

    Parses ``--output_path`` (required) and ``--pretrained_model_name_or_path``
    (defaults to ``genbio-ai/AIDO.StructureTokenizer``), loads the model with
    ``from_pretrained``, copies ``model.encoder.codebook`` to CPU and writes it
    with ``torch.save``.

    The CLI help describes the codebook as a ``(num_tokens, embedding_dim)``
    matrix with one row per structure-token embedding.
    """
    description = (
        "Extract the codebook of StructureTokenizerModel.\n"
        "The codebook is a matrix of shape (num_tokens, embedding_dim), where each row corresponds "
        "to the embedding of a structure token. The extracted codebook is saved as a PyTorch tensor (.pt) file."
    )
    model_help = (
        "The pretrained model name or local path to load the StructureTokenizerModel.\n"
        "Default: 'genbio-ai/AIDO.StructureTokenizer'."
    )

    cli = argparse.ArgumentParser(description=description)
    cli.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="Path to save the codebook in .pt format.",
    )
    cli.add_argument(
        "--pretrained_model_name_or_path",
        type=str,
        default="genbio-ai/AIDO.StructureTokenizer",
        help=model_help,
    )
    options = cli.parse_args()

    tokenizer_model = StructureTokenizerModel.from_pretrained(options.pretrained_model_name_or_path)
    # Move the codebook tensor to CPU before saving it as a .pt file.
    torch.save(tokenizer_model.encoder.codebook.data.cpu(), options.output_path)
45 |
46 |
# Run the extraction only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
49 |
--------------------------------------------------------------------------------
/docs/docs/api_reference/adapters.md:
--------------------------------------------------------------------------------
1 | # Adapters
2 |
3 | Adapters work with [`Backbones`](../backbones) and [`Tasks`](../tasks) to adapt pretrained models to new objectives.
4 | They are specified with the `--model.adapter` argument in the CLI or in the `model.adapter` section of a configuration file.
5 |
6 | Adapters are the focal point for architecture design on top of backbones, and can be swapped with other adapters of the same type to benchmark different architectures.
7 |
8 | This reference overviews the available no-code adapters.
9 | If you would like to develop new adapters, see [Experiment Design](../../experiment_design).
10 |
11 | ```yaml
12 | # Example Adapter Configuration
13 | model:
14 | class_path: modelgenerator.tasks.SequenceRegression
15 | init_args:
16 | adapter:
17 | class_path: modelgenerator.adapters.MLPPoolAdapter
18 | init_args:
19 | pooling: mean_pooling
20 | hidden_sizes:
21 | - 512
22 | bias: true
23 | dropout: 0.1
24 | dropout_in_middle: false
25 | data:
26 | ...
27 | trainer:
28 | ...
29 | ```
30 |
31 | ## Sequence Adapters
32 |
33 | These adapters make a single prediction for the entire input.
34 |
35 | ::: modelgenerator.adapters.MLPAdapter
36 |
37 | ::: modelgenerator.adapters.LinearCLSAdapter
38 |
39 | ::: modelgenerator.adapters.LinearMeanPoolAdapter
40 |
41 | ::: modelgenerator.adapters.LinearMaxPoolAdapter
42 |
43 | ::: modelgenerator.adapters.LinearTransformerAdapter
44 |
45 | ::: modelgenerator.adapters.ResNet2DAdapter
46 |
47 | ::: modelgenerator.adapters.ResNet1DAdapter
48 |
49 | ## Token Adapters
50 |
51 | These adapters make one prediction per token.
52 |
53 | ::: modelgenerator.adapters.LinearAdapter
54 |
55 | ::: modelgenerator.adapters.MLPAdapter
56 |
57 | ::: modelgenerator.adapters.MLPAdapterWithoutOutConcat
58 |
59 | ## Conditional Generation Adapters
60 |
61 | These adapters are used for conditional generation tasks.
62 |
63 | ::: modelgenerator.adapters.ConditionalLMAdapter
64 |
65 | ## Fusion Adapters
66 |
67 | These adapters are used for multi-modal fusion to combine multiple backbones.
68 |
69 | ::: modelgenerator.adapters.MMFusionSeqAdapter
70 |
71 | ::: modelgenerator.adapters.MMFusionTokenAdapter
72 |
--------------------------------------------------------------------------------
/experiments/AIDO.Cell/docker_readme.md:
--------------------------------------------------------------------------------
1 | # Data setup
2 |
3 | Data for cell classification tasks can be found in [cell-downstream-tasks](https://huggingface.co/datasets/genbio-ai/cell-downstream-tasks) on Hugging Face.
4 |
5 | To download all cell downstream tasks:
6 | ```
7 | cd /path/to/ModelGenerator/modelgenerator
8 | git clone git@hf.co:datasets/genbio-ai/cell-downstream-tasks
9 | ```
10 |
11 | You should only need to do this once.
12 |
13 | # Building the Docker image
14 |
15 | ```bash
16 | cd /path/to/ModelGenerator
17 | docker build -t finetune -f Dockerfile .
18 | ```
19 |
20 | You should only need to do this once.
21 |
22 | # Hugging Face authentication
23 |
24 | If you need to access private or gated models/data:
25 |
26 | ```bash
27 | huggingface-cli login
28 | ```
29 |
30 | # Fine-tuning a model
31 |
32 | ```bash
33 | cd /path/to/ModelGenerator
34 | docker run --rm --runtime=nvidia \
35 | -v /home/user/ModelGenerator/configs:/workspace/configs \
36 | -v /home/user/ModelGenerator/modelgenerator:/workspace/modelgenerator \
37 | -v /home/user/ModelGenerator/experiments:/workspace/experiments \
38 | -v /home/user/.cache/huggingface:/root/.cache/huggingface \
39 | -v "/home/user/ModelGenerator/logs:/workspace/logs" \
40 | finetune bash -c "mgen fit --config experiments/AIDO.Cell/cell_type_classification.yaml"
41 | ```
42 |
43 | # Evaluating a checkpoint
44 |
45 | ```bash
46 | cd /path/to/ModelGenerator
47 | docker run --rm --runtime=nvidia \
48 | -v /home/user/ModelGenerator/configs:/workspace/configs \
49 | -v /home/user/ModelGenerator/modelgenerator:/workspace/modelgenerator \
50 | -v /home/user/ModelGenerator/experiments:/workspace/experiments \
51 | -v /home/user/.cache/huggingface:/root/.cache/huggingface \
52 | -v "/home/user/ModelGenerator/logs:/workspace/logs" \
53 | finetune bash -c "mgen test --config experiments/AIDO.Cell/cell_type_classification.yaml --ckpt_path /workspace/lightning_logs/version_X/checkpoints/my.ckpt"
54 | ```
55 |
56 | # Other usage examples
57 |
58 | The example above fine-tunes and evaluates a model for cell type classification. Other usage examples are described below.
59 |
60 | ## Transcriptomic Clock Task
61 | Simply replace the `config` argument of `mgen fit` with `experiments/AIDO.Cell/transcriptomic_clock.yaml`.
62 |
--------------------------------------------------------------------------------
/docs/docs/experiment_design/data.md:
--------------------------------------------------------------------------------
1 | # Adding Data Loaders
2 |
3 | AIDO.ModelGenerator uses [Lightning DataModules](https://lightning.ai/docs/pytorch/stable/data/datamodule.html) for dataset management and loading.
4 | We also provide a few tools to make data management more convenient, and work with common file types out-of-the-box.
5 |
6 | AIDO.ModelGenerator provides a `DataInterface` class that hides boilerplate, along with a `HFDatasetLoaderMixin` that combines Lightning DataModule structure and [HuggingFace Datasets](https://huggingface.co/docs/datasets) convenience together to quickly load data from HuggingFace or common file formats (e.g. tsv, csv, json, etc).
7 | More convenient mixins and example usage are outlined below.
8 |
9 | Many common tasks and data loaders are already implemented in AIDO.ModelGenerator, and only require setting new paths to run with new data.
10 | See the [Data API Reference](../api_reference/data.md) for all types of available data modules.
11 |
12 | ::: modelgenerator.data.DataInterface
13 | handler: python
14 | options:
15 | filters:
16 | - "!^__"
17 | members:
18 | - setup
19 | - load_and_split_dataset
20 | show_root_heading: true
21 | show_source: true
22 |
23 | ## Useful Mixins
24 |
25 | ::: modelgenerator.data.HFDatasetLoaderMixin
26 | handler: python
27 | options:
28 | filters:
29 | - "!^__"
30 | show_root_heading: true
31 | show_source: true
32 |
33 | ::: modelgenerator.data.KFoldMixin
34 | handler: python
35 | options:
36 | filters:
37 | - "!^__"
38 | show_root_heading: true
39 | show_source: true
40 |
41 | ## Implementing a DataModule
42 |
43 | To transform datasets for task-specific behaviors (e.g. masking for masked language modeling), use `torch.utils.data.Dataset` objects to implement the transformation.
44 | Below is an example.
45 |
46 | ::: modelgenerator.data.MLMDataModule
47 | handler: python
48 | options:
49 | filters:
50 | - "!^__"
51 | members:
52 | - setup
53 | show_root_heading: true
54 | show_source: true
55 |
56 | ::: modelgenerator.data.MLMDataset
57 | handler: python
58 | options:
59 | filters:
60 | - "!^__"
61 | show_root_heading: true
62 | show_source: true
63 |
--------------------------------------------------------------------------------
/docs/docs/tutorials/kfold_cross_validation.md:
--------------------------------------------------------------------------------
1 | # K-fold cross validation
2 |
3 | Datasets implementing the `DataInterface` with the `KFoldMixin` support semi-automatic k-fold cross-validation for uncertainty estimation.
4 |
5 | We use translation efficiency prediction as an example task to demonstrate how to run k-fold cross-validation in ModelGenerator. The idea is to split the dataset into k folds and use each fold in turn as the test set.
6 |
7 | #### Data configs
8 | For a cross-validation task, we input only one dataset named `train` containing a column `fold_id` indicating the fold index for each sample. You need to set `cv_num_folds`, `cv_test_fold_id`, `cv_enable_val_fold`, and `cv_fold_id_col` according to your experiment setting.
9 | ```yaml
10 | data:
11 | class_path: modelgenerator.data.TranslationEfficiency
12 | init_args:
13 | path: genbio-ai/rna-downstream-tasks
14 | config_name: translation_efficiency_Muscle
15 | normalize: true
16 | train_split_name: train
17 | random_seed: 42
18 | batch_size: 8
19 | shuffle: true
20 | cv_num_folds: 10
21 | cv_test_fold_id: 0
22 | cv_enable_val_fold: true
23 | cv_fold_id_col: fold_id
24 | ```
25 | See `experiments/AIDO.RNA/configs/translation_efficiency.yaml` for full hyperparameter settings.
26 |
27 |
28 | #### Finetuning script
29 | ```shell
30 | for FOLD in {0..9}
31 | do
32 | RUN_NAME=te_Muscle_aido_rna_1b600m_fold${FOLD}
33 | CKPT_SAVE_DIR=logs/rna_tasks/${RUN_NAME}
34 | CUDA_VISIBLE_DEVICES=0 mgen fit --config experiments/AIDO.RNA/configs/translation_efficiency.yaml \
35 | --data.config_name translation_efficiency_Muscle \
36 | --data.cv_test_fold_id $FOLD \
37 | --trainer.logger.name $RUN_NAME \
38 | --trainer.callbacks.dirpath $CKPT_SAVE_DIR
39 | done
40 | ```
41 |
42 | #### Evaluation script
43 | ```shell
44 | for FOLD in {0..9}
45 | do
46 | CKPT_PATH=logs/rna_tasks/te_Muscle_aido_rna_1b600m_fold${FOLD}/best_val*
47 | echo ">>> Fold ${FOLD}"
48 | mgen test --config experiments/AIDO.RNA/configs/translation_efficiency.yaml \
49 | --data.config_name translation_efficiency_Muscle \
50 | --data.cv_test_fold_id $FOLD \
51 | --model.strict_loading True \
52 | --model.reset_optimizer_states True \
53 | --trainer.logger null \
54 | --ckpt_path $CKPT_PATH
55 | done
56 | ```
57 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/buildjob.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import json
4 |
def _read_first_fasta_sequence(fasta_path):
    """Return the first record's sequence from a FASTA file, joining wrapped lines."""
    with open(fasta_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    chunks = []
    # The first line is taken to be the record header; sequence lines run
    # until the next header (or the end of the file).
    for line in lines[1:]:
        if line.startswith('>'):
            break
        chunks.append(line.strip())
    sequence = ''.join(chunks)
    if not sequence:
        raise ValueError(f"No sequence found in FASTA file {fasta_path}.")
    return sequence


def buildjob(input_json, input_msa_dir, output_json):
    """Attach precomputed MSA information to every protein chain of a job JSON.

    The MSA directory is scanned for ``<input_msa_dir>/*/*/*.fasta`` files.
    Each FASTA file's first sequence becomes a lookup key whose value records
    the matching ``/msa_database/<subdir>`` path and the names of the other
    files in the same directory.

    Args:
        input_json: Path of the job JSON to read (a list of jobs, each with a
            ``sequences`` list).
        input_msa_dir: Root directory of the precomputed MSA files.
        output_json: Path the annotated job JSON is written to.

    Raises:
        AssertionError: If a protein chain's sequence has no MSA entry.
        ValueError: If a FASTA file contains no sequence.
    """
    # Reverse index: protein sequence -> precomputed_msa_dir, non_pairing_msa_names.
    # Sorted so the output does not depend on filesystem listing order.
    msa_dict = {}
    for fasta_file in sorted(glob.glob(os.path.join(input_msa_dir, '*/*/*.fasta'))):
        msa_dir = os.path.dirname(fasta_file)
        # Directory of this MSA relative to the database root.
        msa_subdir = os.path.relpath(msa_dir, input_msa_dir)
        # Every other file that sits next to this FASTA file.
        sibling_names = sorted(
            os.path.basename(path)
            for path in glob.glob(os.path.join(msa_dir, '*'))
            if path != fasta_file
        )
        msa_dict[_read_first_fasta_sequence(fasta_file)] = {
            "precomputed_msa_dir": os.path.join('/msa_database', msa_subdir),
            "non_pairing_msa_names": sibling_names
        }

    with open(input_json, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for job in data:
        for entity in job['sequences']:
            if "proteinChain" not in entity:
                continue
            protein_chain = entity['proteinChain']
            sequence = protein_chain['sequence']
            # Raised explicitly (not via ``assert``) so the check survives
            # ``python -O`` while keeping the exception type callers may expect.
            if sequence not in msa_dict:
                raise AssertionError(f"Sequence {sequence} not found in MSA directory.")
            protein_chain['msa'] = msa_dict[sequence]

    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)
41 |
42 |
# Command-line entry point: annotate a job JSON with precomputed MSA paths.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(
        description='Add precomputed MSA information to the protein chains of a job JSON.'
    )
    # All three paths are required: omitting one would otherwise pass None
    # into buildjob and fail with an unrelated-looking error.
    parser.add_argument('--input', type=str, required=True, help='Input job JSON file')
    parser.add_argument('--msa-db', type=str, required=True, help='Input directory for MSA files, built using job2msa.py')
    parser.add_argument('--output', type=str, required=True, help='Output job JSON file')
    args = parser.parse_args()
    buildjob(args.input, args.msa_db, args.output)
51 |
--------------------------------------------------------------------------------
/modelgenerator/structure_tokenizer/utils/constants/residue_constants.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 AlQuraishi Laboratory
2 | # Copyright 2021 DeepMind Technologies Limited
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
# Fixed ordering of atom names, for storing atom data in formats that need the
# same atom slots for every residue (e.g. a numpy array).
atom_types = [
    "N", "CA", "C", "CB", "O",
    "CG", "CG1", "CG2",
    "OG", "OG1", "SG",
    "CD", "CD1", "CD2",
    "ND1", "ND2", "OD1", "OD2", "SD",
    "CE", "CE1", "CE2", "CE3",
    "NE", "NE1", "NE2", "OE1", "OE2",
    "CH2", "NH1", "NH2", "OH",
    "CZ", "CZ2", "CZ3", "NZ",
    "OXT",
]
# Atom name -> position in ``atom_types``.
atom_order = dict(zip(atom_types, range(len(atom_types))))
atom_type_num = len(atom_types)  # := 37.
59 |
# One-letter -> three-letter residue codes; "X"/"UNK" (unknown) comes last.
restype_1to3 = {
    "A": "ALA", "R": "ARG", "N": "ASN", "D": "ASP", "C": "CYS",
    "Q": "GLN", "E": "GLU", "G": "GLY", "H": "HIS", "I": "ILE",
    "L": "LEU", "K": "LYS", "M": "MET", "F": "PHE", "P": "PRO",
    "S": "SER", "T": "THR", "W": "TRP", "Y": "TYR", "V": "VAL",
    "X": "UNK",
}
# Three-letter -> one-letter codes.
restype_3to1 = {three: one for one, three in restype_1to3.items()}
# One-letter code <-> integer index, following the insertion order above.
restype_1toidx = {one: idx for idx, one in enumerate(restype_1to3)}
restype_idxto1 = {idx: one for one, idx in restype_1toidx.items()}
restype_num = len(restype_1to3)
# The unknown residue type occupies the final index.
unknown_restype_idx = restype_num - 1
88 |
--------------------------------------------------------------------------------
/experiments/AIDO.RNA/dependency_mapping/README.md:
--------------------------------------------------------------------------------
1 | # Dependency Mapping
2 |
3 | Dependency mapping is an _in silico_ mutagenesis technique that identifies co-conserved elements in a sequence.
4 | AIDO.ModelGenerator implements the procedure proposed by [Tomaz da Silva et al.](https://www.biorxiv.org/content/10.1101/2024.07.27.605418v1)
5 | We use this to assess structural features learned during pretraining in the [AIDO.RNA](https://www.biorxiv.org/content/10.1101/2024.11.28.625345v1) paper with the [AIDO.RNA-1.6B](https://huggingface.co/genbio-ai/AIDO.RNA-1.6B) model.
6 | This task uses the pre-trained models directly, and does not require finetuning.
7 |
8 | To reproduce the dependency mapping results from the AIDO.RNA paper, run the following from the ModelGenerator root directory:
9 | ```
10 | # Inference
11 | mgen predict --config experiments/AIDO.RNA/dependency_mapping/config.yaml
12 |
13 | # Plotting
14 | python experiments/AIDO.RNA/dependency_mapping/plot_dependency_maps.py \
15 | -i depmap_predictions \
16 | -o depmap_plots \
17 | -v experiments/AIDO.RNA/dependency_mapping/DNA.txt \
18 | -t modelgenerator/huggingface_models/rnabert/vocab.txt
19 | ```
20 |
21 | To create new dependency maps,
22 |
23 | 1. Gather your sequences into a .tsv file with an `id` and `sequence` column.
24 | 2. Run `mgen predict --config config.yaml` where
25 | ```
26 | model:
27 | class_path: Inference
28 | init_args:
29 | backbone:
30 | data:
31 | class_path: DependencyMappingDataModule
32 | init_args:
33 | path: # Note: this errors for ., use ../dependency_mapping if necessary
34 | test_split_files:
35 | -
36 | vocab_file: .txt # E.g. experiments/AIDO.RNA/dependency_mapping/DNA.txt
37 | trainer:
38 | callbacks:
39 | - class_path: modelgenerator.callbacks.PredictionWriter
40 | dict_kwargs:
41 | output_dir: predictions
42 | filetype: pt
43 | ```
44 |
45 | 3. Run the plotting tool
46 | ```
47 | python experiments/AIDO.RNA/dependency_mapping/plot_dependency_maps.py \
48 | -i \
49 | -o \
50 | -v \
51 | -t
52 | ```
53 |
54 | The output will be files of the name `.png` in the output directory, with heatmaps of dependencies and logos with sequence information content.
55 |
--------------------------------------------------------------------------------
/scripts/wandb_sweep/README.md:
--------------------------------------------------------------------------------
1 | # How to use W&B Sweeps with ModelGenerator for hyperparameter tuning
2 |
3 | ## Caveats
4 | W&B agents cannot launch multi-node training jobs, which causes great difficulties integrating W&B Sweeps with ModelGenerator. This guide is based on a hacky workaround that introduces many limitations.
5 |
6 | ### The workaround
7 | An agent is configured to exit immediately after retrieving the next set of hyperparameters and outputting the complete training command to stdout. This command is then executed on each node without being monitored by an active agent.
8 |
9 | ### Limitations
10 | 1. All agent functionalities are lost. It is not possible to use agent to start/stop/resume/update training runs. Users must manually terminate training runs or implement early-stopping mechanisms.
11 | 2. Failed runs have to be re-run manually using your own sbatch scripts. The command for that run is available in stdout of the failed run.
12 | 3. Parameter importance plots use the wrong parameters by default. This can be fixed manually by selecting the right parameter names from your mgen config.
13 |
14 | >**NOTE**: Before proceeding, please make sure that your training job uses **WandbLogger**.
15 | ## SLURM
16 | ### Step 1: create a wandb sweep
17 | The default `slurm_sweep.yaml` creates a wandb sweep with the training command `mgen fit --config .local/test.yaml` under the project `autotune-test`. Please modify it to suit your experiments. Key values to change are **project**, **command** and **parameters**.
18 |
19 | Run the following command to create a wandb sweep:
20 | ```bash
21 | wandb sweep scripts/wandb_sweep/slurm_sweep.yaml
22 | ```
23 | Take note of your sweep ID for step 2. It looks like `<entity>/<project>/<sweep_id>` and is found in the output: `wandb: Run sweep agent with: wandb agent <entity>/<project>/<sweep_id>`
24 | ### Step 2: submit the next training job to SLURM
25 | Similar to step 1, you need to edit `slurm_agent.sh` for your experiment. The most important changes are **WANDB_PROJECT** and **SWEEP_ID**.
26 |
27 | The following command creates one sweep agent that runs training with the next set of hyperparameters.
28 | ```bash
29 | sbatch scripts/wandb_sweep/slurm_agent.sh
30 | ```
31 |
32 | >**TIPS**: To queue your other sweep runs, use `sbatch --dependency`. To launch your other sweep runs in parallel, use `sbatch --array=1-X` where `X` is the number of parallel runs.
33 |
--------------------------------------------------------------------------------
/docs/docs/tutorials/dependency_mapping.md:
--------------------------------------------------------------------------------
1 | # Dependency Mapping
2 |
3 | Dependency mapping is an _in silico_ mutagenesis technique that identifies co-conserved elements in a sequence.
4 | AIDO.ModelGenerator implements the procedure proposed by [Tomaz da Silva et al.](https://www.biorxiv.org/content/10.1101/2024.07.27.605418v1)
5 | We use this to mine functional genomic elements in the [AIDO.DNA](https://doi.org/10.1101/2024.12.01.625444) paper with the [AIDO.DNA-7B](https://huggingface.co/genbio-ai/AIDO.DNA-7B) and [AIDO.DNA-300M](https://huggingface.co/genbio-ai/AIDO.DNA-300M) models.
6 | This task uses the pre-trained models directly, and does not require finetuning.
7 |
8 | To reproduce the dependency mapping results from the AIDO.DNA paper, run the following from the ModelGenerator root directory:
9 | ```
10 | # Inference
11 | mgen predict --config experiments/AIDO.DNA/dependency_mapping/config.yaml
12 |
13 | # Plotting
14 | python experiments/AIDO.DNA/dependency_mapping/plot_dependency_maps.py \
15 | -i depmap_predictions \
16 | -o depmap_plots \
17 | -v experiments/AIDO.DNA/dependency_mapping/DNA.txt \
18 | -t modelgenerator/huggingface_models/rnabert/vocab.txt
19 | ```
20 |
21 | To create new dependency maps,
22 |
23 | 1. Gather your sequences into a .tsv file with an `id` and `sequence` column.
24 | 2. Run `mgen predict --config config.yaml` where
25 | ```
26 | model:
27 |   class_path: Inference
28 |   init_args:
29 |     backbone: <backbone>
30 | data:
31 |   class_path: DependencyMappingDataModule
32 |   init_args:
33 |     path: <path/to/data_dir> # Note: this errors for ., use ../dependency_mapping if necessary
34 |     test_split_files:
35 |       - <my_sequences>.tsv
36 |     vocab_file: <vocab_file>.txt # E.g. experiments/AIDO.DNA/dependency_mapping/DNA.txt
37 | trainer:
38 |   callbacks:
39 |     - class_path: modelgenerator.callbacks.PredictionWriter
40 |       dict_kwargs:
41 |         output_dir: predictions
42 |         filetype: pt
43 | ```
44 |
45 | 3. Run the plotting tool
46 | ```
47 | python experiments/AIDO.DNA/dependency_mapping/plot_dependency_maps.py \
48 | -i <predictions_dir> \
49 | -o <output_dir> \
50 | -v <vocab_file> \
51 | -t <tokenizer_vocab_file>
52 | ```
53 |
54 | The output will be `.png` files in the output directory, containing heatmaps of dependencies and logos with sequence information content.
55 |
--------------------------------------------------------------------------------
/experiments/AIDO.DNA/dependency_mapping/README.md:
--------------------------------------------------------------------------------
1 | # Dependency Mapping
2 |
3 | Dependency mapping is an _in silico_ mutagenesis technique that identifies co-conserved elements in a sequence.
4 | AIDO.ModelGenerator implements the procedure proposed by [Tomaz da Silva et al.](https://www.biorxiv.org/content/10.1101/2024.07.27.605418v1)
5 | We use this to mine functional genomic elements in the [AIDO.DNA](https://doi.org/10.1101/2024.12.01.625444) paper with the [AIDO.DNA-7B](https://huggingface.co/genbio-ai/AIDO.DNA-7B) and [AIDO.DNA-300M](https://huggingface.co/genbio-ai/AIDO.DNA-300M) models.
6 | This task uses the pre-trained models directly, and does not require finetuning.
7 |
8 | To reproduce the dependency mapping results from the AIDO.DNA paper, run the following from the ModelGenerator root directory:
9 | ```
10 | # Inference
11 | mgen predict --config experiments/AIDO.DNA/dependency_mapping/config.yaml
12 |
13 | # Plotting
14 | python experiments/AIDO.DNA/dependency_mapping/plot_dependency_maps.py \
15 | -i depmap_predictions \
16 | -o depmap_plots \
17 | -v experiments/AIDO.DNA/dependency_mapping/DNA.txt \
18 | -t modelgenerator/huggingface_models/rnabert/vocab.txt
19 | ```
20 |
21 | To create new dependency maps,
22 |
23 | 1. Gather your sequences into a .tsv file with an `id` and `sequence` column.
24 | 2. Run `mgen predict --config config.yaml` where
25 | ```
26 | model:
27 |   class_path: Inference
28 |   init_args:
29 |     backbone: <backbone>
30 | data:
31 |   class_path: DependencyMappingDataModule
32 |   init_args:
33 |     path: <path/to/data_dir> # Note: this errors for ., use ../dependency_mapping if necessary
34 |     test_split_files:
35 |       - <my_sequences>.tsv
36 |     vocab_file: <vocab_file>.txt # E.g. experiments/AIDO.DNA/dependency_mapping/DNA.txt
37 | trainer:
38 |   callbacks:
39 |     - class_path: modelgenerator.callbacks.PredictionWriter
40 |       dict_kwargs:
41 |         output_dir: predictions
42 |         filetype: pt
43 | ```
44 |
45 | 3. Run the plotting tool
46 | ```
47 | python experiments/AIDO.DNA/dependency_mapping/plot_dependency_maps.py \
48 | -i <predictions_dir> \
49 | -o <output_dir> \
50 | -v <vocab_file> \
51 | -t <tokenizer_vocab_file>
52 | ```
53 |
54 | The output will be `.png` files in the output directory, containing heatmaps of dependencies and logos with sequence information content.
55 |
--------------------------------------------------------------------------------
/docs/docs/tutorials/finetuning_scheduler.md:
--------------------------------------------------------------------------------
1 | For some of our experiments, we leverage the [gradual unfreezing finetuning scheduler](https://github.com/genbio-ai/ModelGenerator/blob/main/modelgenerator/callbacks.py#L213), adapted from [RiNALMo](https://arxiv.org/abs/2403.00043)'s [code repository](https://github.com/lbcb-sci/RiNALMo/blob/main/rinalmo/utils/finetune_callback.py).
2 |
3 | #### Creating `schedule`
4 | To use a FT scheduler, we first have to create a schedule and save it as a `.yaml` file. An example schedule is shown below:
5 | ```
6 | 0:
7 | - adapter.*
8 | 3:
9 | - backbone.encoder.encoder.ln.*
10 | - backbone.encoder.encoder.layer.32.*
11 | ```
12 |
13 | In this example, when the model is set up, all the layers are first frozen. Then, before the `0-th` epoch starts, all the parameters in the `adapter` module are unfrozen, and they remain unfrozen (trainable) for the rest of the training run. Similarly, before the `3-rd` epoch starts, the parameters in the `backbone.encoder.encoder.ln` module (i.e., the last layer norm module of the backbone's encoder) and in the `backbone.encoder.encoder.layer.32` module are unfrozen, and they remain unfrozen until the training ends. We can add any other layer or module here if we want to unfreeze it before the start of a specific epoch.
14 |
15 | #### Using `schedule` when finetuning with ModelGenerator
16 | In order to use this schedule for finetuning, we can simply pass its path as the CLI argument `--trainer.callbacks.ft_schedule_path` when calling `mgen fit`.
17 |
18 | The following is an example of finetuning the [AIDO.RNA-1.6B](https://huggingface.co/genbio-ai/AIDO.RNA-1.6B) model for RNA secondary structure prediction, with a **schedule named `layers_0_32.yaml`**. (**NOTE:** Please refer to the [corresponding experiment folder](https://github.com/genbio-ai/ModelGenerator/tree/main/experiments/AIDO.RNA/rna_secondary_structure_prediction) for details of this experiment):
19 | ```
20 | cd experiments/AIDO.RNA/rna_secondary_structure_prediction
21 | MGEN_DATA_DIR=~/mgen_data
22 | DATASET_NAME=bpRNA
23 | CKPT_SAVE_DIR=logs/rna_ss/${DATASET_NAME}
24 | mgen fit --config rna_ss_prediction.yaml \
25 | --data.path ${MGEN_DATA_DIR}/modelgenerator/datasets/rna_ss_data/ \
26 | --data.dataset ${DATASET_NAME} \
27 | --trainer.default_root_dir ${CKPT_SAVE_DIR} \
28 | --trainer.callbacks.ft_schedule_path ft_schedules/layers_0_32.yaml \
29 | --trainer.devices 0,1,2,3
30 | ```
31 |
--------------------------------------------------------------------------------
/experiments/AIDO.StructurePrediction/fold/openfold_local/utils/geometry/quat_rigid.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 GenBio AI.
2 | # Copyright 2024 ByteDance and/or its affiliates.
3 | # Copyright 2021 AlQuraishi Laboratory
4 | # Copyright 2021 DeepMind Technologies Limited
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | import torch
19 | import torch.nn as nn
20 |
21 | from fold.openfold_local.model.primitives import Linear
22 | from fold.openfold_local.utils.geometry.rigid_matrix_vector import Rigid3Array
23 | from fold.openfold_local.utils.geometry.rotation_matrix import Rot3Array
24 | from fold.openfold_local.utils.geometry.vector import Vec3Array
25 |
26 |
class QuatRigid(nn.Module):
    """Map hidden activations to a rigid transform (rotation + translation).

    A single linear layer emits 7 values per position when ``full_quat`` is
    set (``qw, qx, qy, qz`` followed by three translation components) and 6
    values otherwise (``qx, qy, qz`` followed by three translation
    components, with ``qw`` fixed to ones).
    """

    def __init__(self, c_hidden, full_quat):
        """
        Args:
            c_hidden: Input feature size handed to the linear projection.
            full_quat: If true, all four quaternion components are predicted;
                otherwise only ``qx, qy, qz`` are predicted.
        """
        super().__init__()
        self.full_quat = full_quat
        out_dim = 7 if self.full_quat else 6

        self.linear = Linear(c_hidden, out_dim, init="final", precision=torch.float32)

    def forward(self, activations: torch.Tensor) -> Rigid3Array:
        """Return the rigid transform predicted from ``activations``."""
        # NOTE: During training, this needs to be run in higher precision
        components = torch.unbind(self.linear(activations), dim=-1)

        if self.full_quat:
            qw, qx, qy, qz, *shift = components
        else:
            qx, qy, qz, *shift = components
            # The real part is not predicted in this mode; use a constant one.
            qw = torch.ones_like(qx)

        rotation = Rot3Array.from_quaternion(qw, qx, qy, qz, normalize=True)
        return Rigid3Array(rotation, Vec3Array(*shift))
60 |
--------------------------------------------------------------------------------
/modelgenerator/prot_inv_fold/proteinMPNN/proteinMPNN_model_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Code in this file is adapted from
3 | - https://github.com/BytedProtein/ByProt/blob/main/src/byprot/models/fixedbb/__init__.py
4 | - https://github.com/BytedProtein/ByProt/blob/main/src/byprot/models/fixedbb/generator.py
5 | """
6 |
7 | import torch
8 | from torch import nn
9 | import numpy as np
10 |
11 | from .proteinMPNN_data_utils import Alphabet
12 |
13 | from .proteinMPNN_decoder import MPNNSequenceDecoder
14 | from .proteinMPNN_encoder import MPNNEncoder
15 |
16 |
class FixedBackboneDesignEncoderDecoder(nn.Module):
    """Base class for fixed-backbone design encoder-decoder models.

    The constructor merges the class-level ``_default_cfg`` with the supplied
    config. Every other method here raises ``NotImplementedError`` and is
    expected to be provided by a subclass.
    """

    # Defaults that ``_update_cfg`` merges underneath the caller's config.
    _default_cfg = {}

    def __init__(self, cfg) -> None:
        super().__init__()
        self._update_cfg(cfg)

    def _update_cfg(self, cfg):
        """Store ``_default_cfg`` merged with ``cfg`` as ``self.cfg``."""
        from omegaconf import OmegaConf

        merged_cfg = OmegaConf.merge(self._default_cfg, cfg)
        self.cfg = merged_cfg

    @classmethod
    def from_config(cls, cfg):
        """Alternate constructor hook; not implemented in the base class."""
        raise NotImplementedError

    def forward_encoder(self, batch):
        """Encoder pass hook; not implemented in the base class."""
        raise NotImplementedError

    def forward_decoder(self, prev_decoder_out, encoder_out):
        """Decoder pass hook; not implemented in the base class."""
        raise NotImplementedError

    def initialize_output_tokens(self, batch, encoder_out):
        """Initial decoder-token hook; not implemented in the base class."""
        raise NotImplementedError

    def forward(self, coords, coord_mask, tokens, token_padding_mask=None, **kwargs):
        """Forward pass hook; not implemented in the base class."""
        raise NotImplementedError

    def sample(
        self, coords, coord_mask, tokens=None, token_padding_mask=None, **kwargs
    ):
        """Sampling hook; not implemented in the base class."""
        raise NotImplementedError
49 |
50 |
51 | ## Replaced for "from byprot.models.fixedbb.generator import new_arange, sample_from_categorical"
def new_arange(x, *size):
    """Build a ``0..N-1`` range tensor on the same device as ``x``.

    ``N`` is the last entry of ``size``; the range is broadcast to the full
    ``size`` and returned as a contiguous tensor. When no ``size`` is given,
    the shape of ``x`` is used instead.
    """
    shape = size if size else x.size()
    base = torch.arange(shape[-1], device=x.device)
    return base.expand(*shape).contiguous()
60 |
61 |
def sample_from_categorical(logits=None, temperature=1.0, *, sample=False):
    """Pick one token per position from ``logits`` over the last dimension.

    The previous implementation guarded the sampling branch with
    ``temperature and False``, so it was unreachable and decoding was always
    greedy regardless of ``temperature``. Greedy decoding remains the default
    so existing callers see identical results; stochastic sampling is now
    available by passing ``sample=True``.

    Args:
        logits: Tensor of unnormalized scores with categories on the last
            dimension.
        temperature: Divisor applied to ``logits`` before sampling. Only used
            when ``sample`` is true; a falsy value (``0``/``None``) falls back
            to greedy decoding.
        sample: Keyword-only opt-in for temperature sampling.

    Returns:
        ``(tokens, scores)``: the chosen indices and their log-probabilities.
        In greedy mode the scores come from the untempered log-softmax; in
        sampling mode they come from the temperature-scaled distribution.
    """
    if sample and temperature:
        dist = torch.distributions.Categorical(logits=logits.div(temperature))
        tokens = dist.sample()
        scores = dist.log_prob(tokens)
    else:
        # Greedy: take the most likely category and its log-probability.
        scores, tokens = logits.log_softmax(dim=-1).max(dim=-1)
    return tokens, scores
70 |
--------------------------------------------------------------------------------
/modelgenerator/huggingface_models/scfoundation/pretrainmodels/select_model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 BioMap (Beijing) Intelligence Technology Limited
2 |
3 |
4 | from .transformer import pytorchTransformerModule
5 | from .performer import PerformerModule
6 | from .mae_autobin import MaeAutobin
7 |
def select_module(config, sub_config, module_name):
    """Build the sub-module selected by ``module_name``.

    Args:
        config: Top-level model config; ``config['seq_len']`` is used as the
            maximum sequence length.
        sub_config: Config of the sub-module being built. ``hidden_dim``,
            ``depth`` and ``heads`` are read for both module types; the
            performer additionally reads ``dim_head`` and the optional
            ``ff_dropout`` / ``attn_dropout`` (default ``0.0``).
        module_name: ``'performer'`` or ``'transformer'``.

    Returns:
        A ``PerformerModule`` or ``pytorchTransformerModule`` instance.

    Raises:
        NotImplementedError: If ``module_name`` is not a supported type.
            (Previously this printed a message and called ``exit(0)``, which
            terminated the process with a success status.)
    """
    if module_name == 'performer':
        return PerformerModule(
            max_seq_len=config['seq_len'],
            dim=sub_config['hidden_dim'],
            depth=sub_config['depth'],
            heads=sub_config['heads'],
            dim_head=sub_config['dim_head'],
            ff_dropout=sub_config.get('ff_dropout', 0.0),
            attn_dropout=sub_config.get('attn_dropout', 0.0),
        )
    if module_name == 'transformer':
        return pytorchTransformerModule(
            max_seq_len=config['seq_len'],
            dim=sub_config['hidden_dim'],
            depth=sub_config['depth'],
            heads=sub_config['heads'],
        )
    # Fail loudly, in the same style select_model uses for unknown models.
    raise NotImplementedError(f"Unknown module type: {module_name!r}")
29 |
def select_model(config):
    """Build the model described by ``config``.

    Only ``config["model"] == "mae_autobin"`` is supported: a ``MaeAutobin``
    is created from the top-level settings and its ``encoder`` / ``decoder``
    attributes are replaced by modules built with ``select_module`` from
    ``config['encoder']`` and ``config['decoder']``.

    Raises:
        NotImplementedError: For any other value of ``config["model"]``.
    """
    if config["model"] != "mae_autobin":
        raise NotImplementedError("Unknown model type!")

    enc_cfg = config['encoder']
    dec_cfg = config['decoder']
    enc_module = select_module(config, enc_cfg, enc_cfg['module_type'])
    dec_module = select_module(config, dec_cfg, dec_cfg['module_type'])

    mae = MaeAutobin(
        num_tokens=config['n_class'],
        max_seq_len=config['seq_len'],
        embed_dim=enc_cfg['hidden_dim'],
        decoder_embed_dim=dec_cfg['hidden_dim'],
        bin_alpha=config['bin_alpha'],
        bin_num=config['bin_num'],
        pad_token_id=config['pad_token_id'],
        mask_token_id=config['mask_token_id'],
    )
    # Swap in the configured encoder/decoder stacks.
    mae.encoder = enc_module
    mae.decoder = dec_module
    return mae
51 |
def get_sub_config(config, target):
    """Collect the entries of ``config`` whose key contains ``target``.

    Each matching key is stored with every occurrence of ``target + '_'``
    removed, e.g. ``encoder_depth`` becomes ``depth`` for ``target='encoder'``.
    """
    prefix = target + '_'
    return {
        key.replace(prefix, ''): config[key]
        for key in config.keys()
        if target in key
    }
62 |
--------------------------------------------------------------------------------