├── .github ├── CODEOWNERS └── workflows │ └── cla.yml ├── .gitmodules ├── CONTRIBUTING.md ├── LICENSE.md ├── README.md ├── cosmoflow ├── .gitignore ├── LEGAL ├── README.md ├── builds │ ├── Dockerfile │ └── Dockerfile.cpu_mpich ├── configs │ ├── cosmo.yaml │ ├── cosmo_dummy.yaml │ └── cosmo_v07.yaml ├── data │ ├── __init__.py │ ├── cosmo.py │ └── dummy.py ├── logs │ └── .gitignore ├── models │ ├── __init__.py │ ├── cosmoflow.py │ └── layers.py ├── prepare.py ├── scripts │ ├── setup_cgpu.sh │ ├── train_cgpu.sh │ └── train_cori_shifter.sh ├── train.py └── utils │ ├── __init__.py │ ├── argparse.py │ ├── callbacks.py │ ├── checkpoints.py │ ├── device.py │ ├── distributed.py │ ├── mlperf_logging.py │ ├── optimizers.py │ └── staging.py ├── deepcam ├── LICENSE ├── README.md ├── analysis │ ├── process_nsight_deepcam.ipynb │ ├── roofline_plot.ipynb │ ├── training_analysis.ipynb │ └── utils.py ├── compliance │ └── 1.0.0 │ │ ├── closed_deepcam_adam.yml │ │ ├── closed_deepcam_lamb.yml │ │ └── rcps_deepcam.json ├── docker │ ├── Dockerfile │ └── build_docker.sh ├── requirements.txt └── src │ ├── deepCam │ ├── architecture │ │ ├── __init__.py │ │ └── deeplab_xception.py │ ├── data │ │ ├── __init__.py │ │ └── cam_hdf5_dataset.py │ ├── driver │ │ ├── __init__.py │ │ ├── trainer.py │ │ └── validation.py │ ├── run_scripts │ │ ├── run_training.sh │ │ └── run_training_nranks1024.sh │ ├── train.py │ └── utils │ │ ├── __init__.py │ │ ├── bnstats.py │ │ ├── comm.py │ │ ├── losses.py │ │ ├── metric.py │ │ ├── mlperf_log_utils.py │ │ ├── optimizer_helpers.py │ │ ├── parser.py │ │ ├── schedulers.py │ │ └── types.py │ └── utils │ ├── run_stage.sh │ ├── run_summarize_circe.sh │ ├── split_data.py │ └── summarize_data.py ├── open_catalyst ├── .circleci │ └── config.yml ├── .flake8 ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── DATASET.md ├── DATASET_PER_ADSORBATE.md ├── LICENSE.md ├── MODELS.md ├── README.md ├── TRAIN.md ├── configs │ ├── is2re │ │ ├── 100k │ │ │ ├── base.yml │ │ │ ├── cgcnn │ │ │ │ └── cgcnn.yml │ │ │ ├── dimenet_plus_plus │ │ │ │ └── dpp.yml │ │ │ └── schnet │ │ │ │ └── schnet.yml │ │ ├── 10k │ │ │ ├── base.yml │ │ │ ├── cgcnn │ │ │ │ └── cgcnn.yml │ │ │ ├── dimenet_plus_plus │ │ │ │ └── dpp.yml │ │ │ └── schnet │ │ │ │ └── schnet.yml │ │ └── all │ │ │ ├── base.yml │ │ │ ├── cgcnn │ │ │ └── cgcnn.yml │ │ │ ├── dimenet_plus_plus │ │ │ └── dpp.yml │ │ │ └── schnet │ │ │ └── schnet.yml │ ├── mlperf_hpc.yml │ ├── pm_b2048.yml │ └── s2ef │ │ ├── 200k │ │ ├── base.yml │ │ ├── cgcnn │ │ │ └── cgcnn.yml │ │ ├── dimenet_plus_plus │ │ │ └── dpp.yml │ │ ├── forcenet │ │ │ └── fn_forceonly.yml │ │ └── schnet │ │ │ └── schnet.yml │ │ ├── 20M │ │ ├── base.yml │ │ ├── cgcnn │ │ │ └── cgcnn.yml │ │ ├── dimenet_plus_plus │ │ │ └── dpp.yml │ │ └── schnet │ │ │ └── schnet.yml │ │ ├── 2M │ │ ├── base.yml │ │ ├── cgcnn │ │ │ └── cgcnn.yml │ │ ├── dimenet_plus_plus │ │ │ ├── dpp.yml │ │ │ └── dpp_relax.yml │ │ └── schnet │ │ │ └── schnet.yml │ │ └── all │ │ ├── base.yml │ │ ├── cgcnn │ │ └── cgcnn.yml │ │ ├── dimenet_plus_plus │ │ ├── dpp.yml │ │ ├── dpp10.7M_forceonly.yml │ │ ├── dpp_energyonly.yml │ │ └── dpp_forceonly.yml │ │ └── schnet │ │ └── schnet.yml ├── docker │ └── Dockerfile ├── docs │ ├── Makefile │ ├── make.bat │ ├── requirements.txt │ └── source │ │ ├── conf.py │ │ ├── index.rst │ │ ├── modules │ │ ├── dataset.rst │ │ ├── model.rst │ │ └── trainer.rst │ │ └── tutorials │ │ ├── data_preprocessing.ipynb │ │ ├── data_visualization.ipynb │ │ ├── getting_started.rst │ │ ├── 
lmdb_dataset_creation.ipynb │ │ ├── submission.rst │ │ ├── train_s2ef_example.ipynb │ │ └── training.rst ├── env.common.yml ├── env.cpu.yml ├── env.gpu.yml ├── env.yml ├── licenses │ ├── LICENSE.cgcnn │ └── LICENSE.mmf ├── logs │ └── .gitignore ├── main.py ├── ocpmodels │ ├── __init__.py │ ├── common │ │ ├── __init__.py │ │ ├── data_parallel.py │ │ ├── distutils.py │ │ ├── flags.py │ │ ├── hpo_utils.py │ │ ├── logger.py │ │ ├── registry.py │ │ ├── relaxation │ │ │ ├── __init__.py │ │ │ ├── ase_utils.py │ │ │ ├── ml_relaxation.py │ │ │ └── optimizers │ │ │ │ ├── __init__.py │ │ │ │ └── lbfgs_torch.py │ │ ├── transforms.py │ │ └── utils.py │ ├── datasets │ │ ├── __init__.py │ │ ├── embeddings │ │ │ ├── __init__.py │ │ │ ├── atomic_radii.py │ │ │ ├── continuous_embeddings.py │ │ │ └── khot_embeddings.py │ │ ├── single_point_lmdb.py │ │ └── trajectory_lmdb.py │ ├── models │ │ ├── __init__.py │ │ ├── base.py │ │ ├── cgcnn.py │ │ ├── dimenet.py │ │ ├── dimenet_plus_plus.py │ │ ├── forcenet.py │ │ ├── schnet.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── activations.py │ │ │ └── basis.py │ ├── modules │ │ ├── __init__.py │ │ ├── evaluator.py │ │ ├── exponential_moving_average.py │ │ ├── loss.py │ │ ├── normalizer.py │ │ └── scheduler.py │ ├── preprocessing │ │ ├── __init__.py │ │ └── atoms_to_graphs.py │ ├── tasks │ │ ├── __init__.py │ │ └── task.py │ └── trainers │ │ ├── __init__.py │ │ ├── base_trainer.py │ │ ├── energy_trainer.py │ │ ├── forces_trainer.py │ │ └── mlperf_forces_trainer.py ├── pyproject.toml ├── scripts │ ├── __init__.py │ ├── download_data.py │ ├── gif_maker_parallelized.py │ ├── hpo │ │ ├── README.md │ │ ├── __init__.py │ │ ├── run_tune.py │ │ ├── run_tune_pbt.py │ │ └── slurm │ │ │ ├── start-head.sh │ │ │ ├── start-worker.sh │ │ │ └── submit-ray-cluster.sbatch │ ├── make_submission_file.py │ ├── preprocess_ef.py │ ├── preprocess_relaxed.py │ ├── run_training.sh │ ├── train_cgpu.sh │ ├── train_cgpu_shifter.sh │ ├── train_pm.sh │ ├── train_pm_shifter.sh │ └── uncompress.py ├── setup.py ├── submit.sh └── tests │ ├── __init__.py │ ├── evaluator │ └── test_evaluator.py │ ├── models │ ├── atoms.json │ ├── test_cgcnn.py │ ├── test_dimenet.py │ ├── test_dimenetpp.py │ ├── test_forcenet.py │ └── test_schnet.py │ └── preprocessing │ ├── __init__.py │ ├── atoms.json │ ├── test_atoms_to_graphs.py │ └── test_pbc.py └── openfold ├── Dockerfile ├── LICENSE ├── NOTICE ├── README.md ├── openfold ├── __init__.py ├── checkpoint_utils.py ├── config.py ├── data │ ├── __init__.py │ ├── alignments.py │ ├── cameo_targets.py │ ├── features.py │ ├── mmcif.py │ ├── parsers.py │ ├── residue_constants.py │ ├── resources │ │ ├── README.md │ │ ├── __init__.py │ │ └── stereo_chemical_props.txt │ ├── templates.py │ ├── tools │ │ ├── __init__.py │ │ └── kalign.py │ └── transforms.py ├── dataloaders.py ├── datasets.py ├── distributed.py ├── helpers.py ├── log_utils.py ├── loss.py ├── lr_scheduler.py ├── model │ ├── __init__.py │ ├── alphafold.py │ ├── angle_resnet.py │ ├── attention.py │ ├── auxiliary_heads.py │ ├── backbone_update.py │ ├── dropout.py │ ├── evoformer_block.py │ ├── evoformer_block_core.py │ ├── evoformer_stack.py │ ├── extra_msa_block.py │ ├── extra_msa_embedder.py │ ├── extra_msa_stack.py │ ├── global_attention.py │ ├── input_embedder.py │ ├── invariant_point_attention.py │ ├── layer_norm.py │ ├── linear.py │ ├── msa_column_attention.py │ ├── msa_column_global_attention.py │ ├── msa_row_attention_with_pair_bias.py │ ├── msa_transition.py │ ├── outer_product_mean.py │ ├── pair_transition.py 
│ ├── recycling_embedder.py │ ├── single_transition.py │ ├── structure_module.py │ ├── template_angle_embedder.py │ ├── template_pair_block.py │ ├── template_pair_embedder.py │ ├── template_pair_stack.py │ ├── template_pointwise_attention.py │ ├── triangular_attention.py │ └── triangular_multiplicative_update.py ├── numpy_utils.py ├── rigid_utils.py ├── samplers.py ├── superimposition.py ├── swa.py ├── torch_utils.py └── validation_metrics.py ├── scripts ├── activate_local_openfold_venv.sh ├── build_local_openfold_venv.sh ├── deactivate_local_openfold_venv.sh ├── download_open_protein_set.sh ├── download_pdb_mmcif.sh ├── multi_node_training.sub ├── preprocess_open_protein_set.py └── preprocess_pdb_mmcif.py ├── setup.py └── train.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # These owners will be the default owners for everything in the repo. 2 | # Unless a later match takes precedence, they will be requested for review when someone opens a pull request. 3 | * @mlcommons/wg-hpc @mlcommons/wg-science 4 | 5 | /.github/CODEOWNERS @mlcommons/systems 6 | 7 | /.github/workflows/cla.yml @mlcommons/systems 8 | 9 | /LICENSE.md @mlcommons/systems 10 | -------------------------------------------------------------------------------- /.github/workflows/cla.yml: -------------------------------------------------------------------------------- 1 | 2 | name: "cla-bot" 3 | on: 4 | issue_comment: 5 | types: [created] 6 | pull_request_target: 7 | types: [opened,closed,synchronize] 8 | 9 | jobs: 10 | cla-check: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: "MLCommons CLA bot check" 14 | if: (github.event.comment.body == 'recheck') || github.event_name == 'pull_request_target' 15 | # Alpha Release 16 | uses: mlcommons/cla-bot@master 17 | env: 18 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 19 | # the token below should have repo scope and must be manually added by you to the repository's secrets 20 | PERSONAL_ACCESS_TOKEN : ${{ secrets.MLCOMMONS_BOT_CLA_TOKEN }} 21 | with: 22 | path-to-signatures: 'cla-bot/v1/cla.json' 23 | # branch should not be protected 24 | branch: 'main' 25 | allowlist: user1,bot* 26 | remote-organization-name: mlcommons 27 | remote-repository-name: systems 28 | 29 | #Below are the optional inputs. If they are not given, default values will be used. 30 | #remote-organization-name: enter the remote organization name where the signatures should be stored (Default is storing the signatures in the same repository) 31 | #remote-repository-name: enter the remote repository name where the signatures should be stored (Default is storing the signatures in the same repository) 32 | #create-file-commit-message: 'For example: Creating file for storing CLA Signatures' 33 | #signed-commit-message: 'For example: $contributorName has signed the CLA in #$pullRequestNo' 34 | #custom-notsigned-prcomment: 'pull request comment with introductory message to ask new contributors to sign' 35 | #custom-pr-sign-comment: 'The signature to be committed in order to sign the CLA' 36 | #custom-allsigned-prcomment: 'pull request comment when all contributors have signed, defaults to **CLA Assistant Lite bot** All Contributors have signed the CLA.'
37 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/hpc/2c627d457004eff77a014205b3151ed48a6fa149/.gitmodules -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contributing 2 | 3 | The best way to contribute to MLCommons is to get involved with one of our many project communities. You can find more information about getting involved with MLCommons [here](https://mlcommons.org/en/get-involved/#getting-started). 4 | 5 | Generally we encourage people to become an MLCommons member if they wish to contribute to MLCommons projects, but outside pull requests are very welcome too. 6 | 7 | To get started contributing code, you or your organization needs to sign the MLCommons CLA found at the [MLC policies page](https://mlcommons.org/en/policies/). Once you or your organization has signed the corporate CLA, please fill out this [CLA sign-up form](https://forms.gle/Ew1KkBVpyeJDuRw67) to get your specific GitHub handle authorized so that you can start contributing code under the proper license. 8 | 9 | MLCommons project work is tracked with issue trackers and pull requests. Modify the project in your own fork and issue a pull request once you want other developers to take a look at what you have done and discuss the proposed changes. Ensure that cla-bot and other checks pass for your pull requests. 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MLPerf™ HPC reference implementations 2 | 3 | This is a repository of reference implementations for the MLPerf HPC benchmarks. 4 | 5 | The general format follows https://github.com/mlperf/training. 6 | 7 | ## Rules 8 | 9 | The MLPerf HPC rules are based on the MLPerf Training rules with 10 | some adjustments. 11 | 12 | The MLPerf Training rules are available at [training\_rules](https://github.com/mlcommons/training_policies/blob/master/training_rules.adoc). 13 | 14 | The MLPerf HPC-specific rules are at [hpc\_training\_rules](https://github.com/mlcommons/training_policies/blob/master/hpc_training_rules.adoc). 15 | 16 | ## Compliance 17 | The MLPerf logging package implements logging and compliance-checking utilities. It is available in the hpc-1.0-branch of the MLPerf logging repository (https://github.com/mlcommons/logging/tree/hpc-1.0-branch). 18 | These utilities work for the HPC v2.0 submissions as well. 19 | 20 | To install the package and test compliance of your runs/submissions: 21 | 22 | ``` 23 | # Install the package into your python environment. 24 | # A development install (-e) is recommended for now so you can pull new updates. 25 | git clone -b hpc-1.0-branch https://github.com/mlcommons/logging mlperf-logging 26 | pip install [--user] -e mlperf-logging 27 | 28 | # Test a full submission folder 29 | python3 -m mlperf_logging.package_checker <submission_folder> hpc 1.0.0 30 | ``` 31 | 32 | There is also a script that performs compliance checks and summarizes the results.
From the mlperf-logging directory (https://github.com/mlcommons/logging), use 33 | ``` 34 | ./scripts/verify_for_v1.0_hpc.sh 35 | ``` 36 | 37 | 38 | -------------------------------------------------------------------------------- /cosmoflow/.gitignore: -------------------------------------------------------------------------------- 1 | *__pycache__ 2 | *.ipynb_checkpoints 3 | -------------------------------------------------------------------------------- /cosmoflow/LEGAL: -------------------------------------------------------------------------------- 1 | *** Copyright Notice *** 2 | 'Regression of 3D Sky Map to Cosmological Parameters (CosmoFlow)' Copyright (c) 2018, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy). All rights reserved. 3 | If you have questions about your rights to use or distribute this software, please contact Berkeley Lab's Intellectual Property Office at IPO@lbl.gov. 4 | NOTICE. This Software was developed under funding from the U.S. Department of Energy and the U.S. Government consequently retains certain rights. As such, the U.S. Government has been granted for itself and others acting on its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the Software to reproduce, distribute copies to the public, prepare derivative works, and perform publicly and display publicly, and to permit other to do so. 5 | **************************** 6 | -------------------------------------------------------------------------------- /cosmoflow/README.md: -------------------------------------------------------------------------------- 1 | # CosmoFlow TensorFlow Keras benchmark implementation 2 | 3 | This is an implementation of the 4 | [CosmoFlow](https://arxiv.org/abs/1808.04728) 3D convolutional neural network 5 | for benchmarking. It is written in TensorFlow with the Keras API and uses 6 | [Horovod](https://github.com/horovod/horovod) for distributed training. 7 | 8 | You can find the previous TensorFlow implementation, which accompanied the CosmoFlow paper, at 9 | https://github.com/NERSC/CosmoFlow 10 | 11 | ## Datasets 12 | 13 | The dataset we use for this benchmark comes from simulations run by the 14 | ExaLearn group and hosted at NERSC. The following web portal describes the 15 | technical content of the dataset and provides links to the raw data. 16 | 17 | https://portal.nersc.gov/project/m3363/ 18 | 19 | For this benchmark we currently use a preprocessed version of the dataset, which 20 | consists of crops of size (128, 128, 128, 4) stored in TFRecord format. 21 | This preprocessing is done using the [prepare.py](prepare.py) script included 22 | in this package. We describe here how to get access to this processed dataset, 23 | but please refer to the ExaLearn web portal for additional technical details. 24 | 25 | Globus is the currently recommended way to transfer the dataset locally. 26 | There is a Globus endpoint at: 27 | 28 | https://app.globus.org/file-manager?origin_id=31647fba-a006-4322-ad3e-9a4f124db422 29 | 30 | The contents are also available via HTTPS at: 31 | 32 | https://portal.nersc.gov/project/dasrepo/cosmoflow-benchmark/ 33 | 34 | ### MLPerf HPC v1.0 preliminary dataset 35 | 36 | Preprocessed TFRecord files are available in a 1.7TB tarball named 37 | `cosmoUniverse_2019_05_4parE_tf_v2.tar`. It contains subfolders for 38 | train/val/test file splits.
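As a rough sketch of what consuming these preprocessed files involves (the authoritative parsing logic lives in [data/cosmo.py](data/cosmo.py); the feature keys `'x'`/`'y'` and the `int16` dtype below are illustrative placeholders, not the actual schema):

```python
import tensorflow as tf

def parse_example(raw):
    # Placeholder schema: one serialized 128x128x128x4 volume plus 4 regression targets.
    features = {
        'x': tf.io.FixedLenFeature([], tf.string),
        'y': tf.io.FixedLenFeature([4], tf.float32),
    }
    parsed = tf.io.parse_single_example(raw, features)
    x = tf.reshape(tf.io.decode_raw(parsed['x'], tf.int16), [128, 128, 128, 4])
    return x, parsed['y']

# Paths are illustrative; the v1.0 files are written with gzip compression,
# hence the compression_type argument.
filenames = tf.io.gfile.glob('cosmoUniverse_2019_05_4parE_tf_v2/train/*.tfrecord')
dataset = tf.data.TFRecordDataset(filenames, compression_type='GZIP').map(parse_example)
```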
39 | 40 | In this preparation, there are 524288 samples for training and 65536 samples for 41 | validation. The TFRecord files are written with gzip compression to reduce total 42 | storage size. 43 | 44 | ### MLPerf HPC v0.7 dataset 45 | 46 | The pre-processed dataset in TFRecord format is in the 47 | `cosmoUniverse_2019_05_4parE_tf` folder, which contains training and validation 48 | subfolders. There are 262144 samples for training and 65536 samples 49 | for validation/testing. The combined size of the dataset is 5.1 TB. 50 | 51 | For getting started, there is also a small tarball (179MB) with 32 training 52 | samples and 32 validation samples, called `cosmoUniverse_2019_05_4parE_tf_small.tgz`. 53 | 54 | ## Running the benchmark 55 | 56 | Submission scripts are in `scripts`. YAML configuration files go in `configs`. 57 | 58 | ### Running at NERSC 59 | 60 | `sbatch -N 64 scripts/train_cori.sh` 61 | -------------------------------------------------------------------------------- /cosmoflow/builds/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/tensorflow:22.04-tf2-py3 2 | 3 | RUN python -m pip install --no-cache-dir -U pip 4 | 5 | RUN pip install --no-cache-dir pandas wandb 6 | 7 | # Install MLPerf-logging 8 | RUN pip install --no-cache-dir "git+https://github.com/mlcommons/logging.git" 9 | -------------------------------------------------------------------------------- /cosmoflow/builds/Dockerfile.cpu_mpich: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | WORKDIR /tmp 3 | 4 | RUN apt-get update && \ 5 | apt-get install --yes \ 6 | build-essential cmake git curl \ 7 | gfortran-8 gcc-8 g++-8 \ 8 | python3-dev \ 9 | python3-pip \ 10 | wget less vim && \ 11 | apt-get clean all 12 | 13 | #gfortran gcc g++ \ 14 | 15 | RUN ln -s /usr/bin/python3 /usr/bin/python 16 | 17 | RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ 18 | python get-pip.py && \ 19 | rm get-pip.py 20 | 21 | ARG mpich=3.3.2 22 | ARG mpich_prefix=mpich-$mpich 23 | 24 | ENV FC=gfortran-8 25 | ENV F77=gfortran-8 26 | ENV CC=gcc-8 27 | ENV CXX=g++-8 28 | 29 | RUN wget https://www.mpich.org/static/downloads/$mpich/$mpich_prefix.tar.gz && \ 30 | tar xvzf $mpich_prefix.tar.gz && \ 31 | cd $mpich_prefix && \ 32 | ./configure && \ 33 | make -j 4 && \ 34 | make install && \ 35 | make clean && \ 36 | cd .. 
&& \ 37 | rm -rf $mpich_prefix 38 | 39 | # Now need the python+TF dependencies 40 | RUN pip install \ 41 | ipython \ 42 | pandas \ 43 | pyyaml \ 44 | intel-tensorflow==1.15.2 45 | #tensorflow==1.15.2 46 | 47 | # Now need Horovod 48 | RUN HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir -v horovod 49 | 50 | # Install MLPerf-logging 51 | RUN pip install --no-cache-dir "git+https://github.com/mlperf-hpc/logging.git@hpc-0.5.0" 52 | 53 | RUN /sbin/ldconfig 54 | -------------------------------------------------------------------------------- /cosmoflow/configs/cosmo.yaml: -------------------------------------------------------------------------------- 1 | output_dir: results/cosmo-002 2 | 3 | mlperf: 4 | org: LBNL 5 | division: closed 6 | status: onprem 7 | platform: SUBMISSION_PLATFORM_PLACEHOLDER 8 | 9 | data: 10 | name: cosmo 11 | data_dir: /global/cscratch1/sd/sfarrell/cosmoflow-benchmark/data/cosmoUniverse_2019_05_4parE_tf_v2 12 | compression: GZIP 13 | n_train: 524288 14 | n_valid: 65536 15 | sample_shape: [128, 128, 128, 4] 16 | batch_size: 4 17 | n_epochs: 128 18 | shard: True 19 | apply_log: True 20 | prefetch: 4 21 | 22 | model: 23 | name: cosmoflow 24 | input_shape: [128, 128, 128, 4] 25 | kernel_size: 3 26 | target_size: 4 27 | conv_size: 32 28 | fc1_size: 128 29 | fc2_size: 64 30 | hidden_activation: LeakyReLU 31 | pooling_type: MaxPool3D 32 | dropout: 0.5 33 | 34 | optimizer: 35 | name: SGD 36 | momentum: 0.9 37 | 38 | lr_schedule: 39 | # Standard linear LR scaling configuration 40 | base_lr: 0.001 41 | scaling: linear 42 | base_batch_size: 64 43 | n_warmup_epochs: 4 44 | 45 | # Learning rate decay epochs and decay factors 46 | decay_schedule: 47 | 32: 0.25 48 | 64: 0.125 49 | 50 | train: 51 | loss: mse 52 | metrics: ['mean_absolute_error'] 53 | #early_stopping_patience: 8 54 | 55 | # Uncomment to stop at target quality 56 | #target_mae: 0.124 57 | -------------------------------------------------------------------------------- /cosmoflow/configs/cosmo_dummy.yaml: -------------------------------------------------------------------------------- 1 | output_dir: results/cosmo-dummy-000 2 | 3 | data: 4 | name: dummy 5 | n_train: 256 6 | n_valid: 256 7 | sample_shape: [128, 128, 128, 4] 8 | target_shape: [4] 9 | batch_size: 4 10 | n_epochs: 4 11 | shard: True 12 | 13 | model: 14 | name: cosmoflow 15 | input_shape: [128, 128, 128, 4] 16 | target_size: 4 17 | conv_size: 32 18 | fc1_size: 128 19 | fc2_size: 64 20 | hidden_activation: LeakyReLU 21 | pooling_type: MaxPool3D 22 | dropout: 0.5 23 | 24 | optimizer: 25 | name: SGD 26 | momentum: 0.9 27 | 28 | lr_schedule: 29 | # Standard linear LR scaling configuration, tested up to batch size 1024 30 | base_lr: 0.001 31 | scaling: linear 32 | base_batch_size: 64 33 | 34 | # Alternate sqrt LR scaling which has worked well for batch size 512-1024. 35 | #base_lr: 0.0025 36 | #scaling: sqrt 37 | #base_batch_size: 32 38 | 39 | n_warmup_epochs: 4 40 | 41 | # You may want to adjust these decay epochs depending on your batch size. 42 | # E.g. if training batch size 64 you may want to decay at 16 and 32 epochs. 43 | decay_schedule: 44 | 32: 0.25 45 | 64: 0.125 46 | 47 | train: 48 | loss: mse 49 | metrics: ['mean_absolute_error'] 50 | early_stopping_patience: 8 51 | -------------------------------------------------------------------------------- /cosmoflow/configs/cosmo_v07.yaml: -------------------------------------------------------------------------------- 1 | # This YAML file describes the configuration for the MLPerf HPC v0.7 reference. 
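# As a worked example of the linear LR scaling rule configured below (numbers
# illustrative, not prescribed): with base_lr 0.001 at base_batch_size 64, a run
# at global batch size 512 trains at 0.001 * 512 / 64 = 0.008 once the
# n_warmup_epochs warmup completes, before the decay_schedule factors apply.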
2 | 3 | output_dir: results/cosmo-000 4 | 5 | mlperf: 6 | org: LBNL 7 | division: closed 8 | status: onprem 9 | platform: SUBMISSION_PLATFORM_PLACEHOLDER 10 | 11 | data: 12 | name: cosmo 13 | data_dir: /global/cscratch1/sd/sfarrell/cosmoflow-benchmark/data/cosmoUniverse_2019_05_4parE_tf 14 | n_train: 262144 15 | n_valid: 65536 16 | sample_shape: [128, 128, 128, 4] 17 | batch_size: 4 18 | n_epochs: 128 19 | shard: True 20 | apply_log: True 21 | prefetch: 4 22 | 23 | model: 24 | name: cosmoflow 25 | input_shape: [128, 128, 128, 4] 26 | target_size: 4 27 | conv_size: 32 28 | fc1_size: 128 29 | fc2_size: 64 30 | hidden_activation: LeakyReLU 31 | pooling_type: MaxPool3D 32 | dropout: 0.5 33 | 34 | optimizer: 35 | name: SGD 36 | momentum: 0.9 37 | 38 | lr_schedule: 39 | # Standard linear LR scaling configuration, tested up to batch size 1024 40 | base_lr: 0.001 41 | scaling: linear 42 | base_batch_size: 64 43 | 44 | # Alternate sqrt LR scaling which has worked well for batch size 512-1024. 45 | #base_lr: 0.0025 46 | #scaling: sqrt 47 | #base_batch_size: 32 48 | 49 | n_warmup_epochs: 4 50 | 51 | # You may want to adjust these decay epochs depending on your batch size. 52 | # E.g. if training batch size 64 you may want to decay at 16 and 32 epochs. 53 | decay_schedule: 54 | 32: 0.25 55 | 64: 0.125 56 | 57 | train: 58 | loss: mse 59 | metrics: ['mean_absolute_error'] 60 | 61 | # Uncomment to stop at target quality 62 | #target_mae: 0.124 63 | -------------------------------------------------------------------------------- /cosmoflow/data/__init__.py: -------------------------------------------------------------------------------- 1 | # 'Regression of 3D Sky Map to Cosmological Parameters (CosmoFlow)' 2 | # Copyright (c) 2018, The Regents of the University of California, 3 | # through Lawrence Berkeley National Laboratory (subject to receipt of any 4 | # required approvals from the U.S. Dept. of Energy). All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # If you have questions about your rights to use or distribute this software, 19 | # please contact Berkeley Lab's Innovation & Partnerships Office at IPO@lbl.gov. 20 | # 21 | # NOTICE. This Software was developed under funding from the U.S. Department of 22 | # Energy and the U.S. Government consequently retains certain rights. As such, 23 | # the U.S. Government has been granted for itself and others acting on its 24 | # behalf a paid-up, nonexclusive, irrevocable, worldwide license in the Software 25 | # to reproduce, distribute copies to the public, prepare derivative works, and 26 | # perform publicly and display publicly, and to permit other to do so. 27 | 28 | """ 29 | Keras dataset specifications. 
30 | """ 31 | 32 | 33 | def get_datasets(name, **data_args): 34 | if name == 'dummy': 35 | from .dummy import get_datasets 36 | return get_datasets(**data_args) 37 | elif name == 'cosmo': 38 | from .cosmo import get_datasets 39 | return get_datasets(**data_args) 40 | else: 41 | raise ValueError('Dataset %s unknown' % name) 42 | -------------------------------------------------------------------------------- /cosmoflow/data/dummy.py: -------------------------------------------------------------------------------- 1 | # 'Regression of 3D Sky Map to Cosmological Parameters (CosmoFlow)' 2 | # Copyright (c) 2018, The Regents of the University of California, 3 | # through Lawrence Berkeley National Laboratory (subject to receipt of any 4 | # required approvals from the U.S. Dept. of Energy). All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # If you have questions about your rights to use or distribute this software, 19 | # please contact Berkeley Lab's Innovation & Partnerships Office at IPO@lbl.gov. 20 | # 21 | # NOTICE. This Software was developed under funding from the U.S. Department of 22 | # Energy and the U.S. Government consequently retains certain rights. As such, 23 | # the U.S. Government has been granted for itself and others acting on its 24 | # behalf a paid-up, nonexclusive, irrevocable, worldwide license in the Software 25 | # to reproduce, distribute copies to the public, prepare derivative works, and 26 | # perform publicly and display publicly, and to permit other to do so. 27 | 28 | """ 29 | Random dummy dataset specification. 
30 | """ 31 | 32 | # Externals 33 | import tensorflow as tf 34 | 35 | 36 | def construct_dataset(sample_shape, target_shape, 37 | batch_size=1, n_samples=32): 38 | x = tf.random.uniform([n_samples]+sample_shape) 39 | y = tf.random.uniform([n_samples]+target_shape) 40 | data = tf.data.Dataset.from_tensor_slices((x, y)) 41 | return data.repeat().batch(batch_size).prefetch(4) 42 | 43 | 44 | def get_datasets(sample_shape, target_shape, batch_size, 45 | n_train, n_valid, dist, n_epochs=None, shard=False): 46 | train_dataset = construct_dataset(sample_shape, target_shape, batch_size=batch_size) 47 | valid_dataset = None 48 | if n_valid > 0: 49 | valid_dataset = construct_dataset(sample_shape, target_shape, batch_size=batch_size) 50 | n_train_steps = n_train // batch_size 51 | n_valid_steps = n_valid // batch_size 52 | if shard: 53 | n_train_steps = n_train_steps // dist.size 54 | n_valid_steps = n_valid_steps // dist.size 55 | 56 | return dict(train_dataset=train_dataset, valid_dataset=valid_dataset, 57 | n_train=n_train, n_valid=n_valid, n_train_steps=n_train_steps, 58 | n_valid_steps=n_valid_steps) 59 | -------------------------------------------------------------------------------- /cosmoflow/logs/.gitignore: -------------------------------------------------------------------------------- 1 | *.out 2 | -------------------------------------------------------------------------------- /cosmoflow/models/__init__.py: -------------------------------------------------------------------------------- 1 | # 'Regression of 3D Sky Map to Cosmological Parameters (CosmoFlow)' 2 | # Copyright (c) 2018, The Regents of the University of California, 3 | # through Lawrence Berkeley National Laboratory (subject to receipt of any 4 | # required approvals from the U.S. Dept. of Energy). All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # If you have questions about your rights to use or distribute this software, 19 | # please contact Berkeley Lab's Innovation & Partnerships Office at IPO@lbl.gov. 20 | # 21 | # NOTICE. This Software was developed under funding from the U.S. Department of 22 | # Energy and the U.S. Government consequently retains certain rights. As such, 23 | # the U.S. Government has been granted for itself and others acting on its 24 | # behalf a paid-up, nonexclusive, irrevocable, worldwide license in the Software 25 | # to reproduce, distribute copies to the public, prepare derivative works, and 26 | # perform publicly and display publicly, and to permit other to do so. 27 | 28 | """ 29 | Keras example model factory functions. 30 | """ 31 | 32 | import importlib 33 | 34 | def get_model(name, **model_args): 35 | """Factory function for constructing a model by name with args""" 36 | module = importlib.import_module('.' 
+ name, 'models') 37 | return module.build_model(**model_args) 38 | -------------------------------------------------------------------------------- /cosmoflow/models/layers.py: -------------------------------------------------------------------------------- 1 | # 'Regression of 3D Sky Map to Cosmological Parameters (CosmoFlow)' 2 | # Copyright (c) 2018, The Regents of the University of California, 3 | # through Lawrence Berkeley National Laboratory (subject to receipt of any 4 | # required approvals from the U.S. Dept. of Energy). All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # If you have questions about your rights to use or distribute this software, 19 | # please contact Berkeley Lab's Innovation & Partnerships Office at IPO@lbl.gov. 20 | # 21 | # NOTICE. This Software was developed under funding from the U.S. Department of 22 | # Energy and the U.S. Government consequently retains certain rights. As such, 23 | # the U.S. Government has been granted for itself and others acting on its 24 | # behalf a paid-up, nonexclusive, irrevocable, worldwide license in the Software 25 | # to reproduce, distribute copies to the public, prepare derivative works, and 26 | # perform publicly and display publicly, and to permit other to do so. 27 | 28 | """Custom layer functionality""" 29 | 30 | def scale_1p2(x): 31 | """Simple scaling function for Lambda layers. 32 | 33 | Just multiplies the input by 1.2. Useful for extending the coverage of a 34 | tanh activation for targets in the range [-1,1]. 35 | """ 36 | return x*1.2 37 | -------------------------------------------------------------------------------- /cosmoflow/scripts/setup_cgpu.sh: -------------------------------------------------------------------------------- 1 | # Source this script to setup the runtime environment on cori 2 | module load cgpu tensorflow/2.5.0-gpu 3 | module list 4 | -------------------------------------------------------------------------------- /cosmoflow/scripts/train_cgpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -C gpu -c 10 3 | #SBATCH --ntasks-per-node 8 4 | #SBATCH --gpus-per-task 1 5 | #SBATCH --exclusive 6 | #SBATCH -t 4:00:00 7 | #SBATCH -J train-cgpu 8 | #SBATCH -o logs/%x-%j.out 9 | 10 | . 
scripts/setup_cgpu.sh 11 | #export HOROVOD_TIMELINE=./timeline.json 12 | 13 | # Slurm job variables 14 | env | grep SLURM_JOB 15 | 16 | set -x 17 | srun -l -u python train.py -d --rank-gpu $@ 18 | -------------------------------------------------------------------------------- /cosmoflow/scripts/train_cori_shifter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -C knl 3 | #SBATCH -q debug 4 | #SBATCH -t 30 5 | #SBATCH -J train-cori 6 | #SBATCH --image docker:sfarrell/cosmoflow-cpu-mpich:latest 7 | #SBATCH -o logs/%x-%j.out 8 | 9 | export OMP_NUM_THREADS=32 10 | export KMP_BLOCKTIME=1 11 | export KMP_AFFINITY="granularity=fine,compact,1,0" 12 | export HDF5_USE_FILE_LOCKING=FALSE 13 | 14 | set -x 15 | srun -l -u shifter python train.py -d $@ 16 | -------------------------------------------------------------------------------- /cosmoflow/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | """ 3 | -------------------------------------------------------------------------------- /cosmoflow/utils/argparse.py: -------------------------------------------------------------------------------- 1 | # 'Regression of 3D Sky Map to Cosmological Parameters (CosmoFlow)' 2 | # Copyright (c) 2018, The Regents of the University of California, 3 | # through Lawrence Berkeley National Laboratory (subject to receipt of any 4 | # required approvals from the U.S. Dept. of Energy). All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # If you have questions about your rights to use or distribute this software, 19 | # please contact Berkeley Lab's Innovation & Partnerships Office at IPO@lbl.gov. 20 | # 21 | # NOTICE. This Software was developed under funding from the U.S. Department of 22 | # Energy and the U.S. Government consequently retains certain rights. As such, 23 | # the U.S. Government has been granted for itself and others acting on its 24 | # behalf a paid-up, nonexclusive, irrevocable, worldwide license in the Software 25 | # to reproduce, distribute copies to the public, prepare derivative works, and 26 | # perform publicly and display publicly, and to permit other to do so. 
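# Usage sketch for the ReadYaml action defined below (the flag name is
# hypothetical, for illustration only):
#   parser.add_argument('--overrides', action=ReadYaml)
#   args = parser.parse_args(['--overrides', '{batch_size: 8}'])
# leaves args.overrides == {'batch_size': 8}, i.e. the flag accepts inline YAML.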
27 | 28 | """Utility code for argparse""" 29 | 30 | import argparse 31 | import yaml 32 | 33 | 34 | class ReadYaml(argparse.Action): 35 | def __call__(self, parser, namespace, values, option_string=None): 36 | my_dict = yaml.load(values, Loader=yaml.FullLoader) 37 | setattr(namespace, self.dest, my_dict) 38 | -------------------------------------------------------------------------------- /cosmoflow/utils/checkpoints.py: -------------------------------------------------------------------------------- 1 | # 'Regression of 3D Sky Map to Cosmological Parameters (CosmoFlow)' 2 | # Copyright (c) 2018, The Regents of the University of California, 3 | # through Lawrence Berkeley National Laboratory (subject to receipt of any 4 | # required approvals from the U.S. Dept. of Energy). All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # If you have questions about your rights to use or distribute this software, 19 | # please contact Berkeley Lab's Innovation & Partnerships Office at IPO@lbl.gov. 20 | # 21 | # NOTICE. This Software was developed under funding from the U.S. Department of 22 | # Energy and the U.S. Government consequently retains certain rights. As such, 23 | # the U.S. Government has been granted for itself and others acting on its 24 | # behalf a paid-up, nonexclusive, irrevocable, worldwide license in the Software 25 | # to reproduce, distribute copies to the public, prepare derivative works, and 26 | # perform publicly and display publicly, and to permit other to do so. 27 | 28 | """Utility code for handling checkpoint loading""" 29 | 30 | # System imports 31 | import os 32 | import logging 33 | 34 | # External imports 35 | import tensorflow as tf 36 | import horovod.tensorflow.keras as hvd 37 | 38 | def reload_last_checkpoint(checkpoint_format, n_epochs, distributed): 39 | """Finds and loads the last checkpoint matching the provided pattern""" 40 | # Count down from n_epochs to 0 to find the last epoch. 41 | # Note that keras names checkpoint files with epoch number starting from 1. 42 | # So the matched number corresponds to the new initial epoch. 
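    # For example, with a checkpoint_format like
    # 'checkpoints/checkpoint-{epoch:03d}.h5' (an illustrative pattern, not the
    # configured one), epoch 7 would be looked up at .../checkpoint-007.h5.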
43 | for epoch in range(n_epochs, 0, -1): 44 | checkpoint = checkpoint_format.format(epoch=epoch) 45 | if os.path.exists(checkpoint): 46 | logging.info('Found last checkpoint at %s', checkpoint) 47 | # Use special reload to prepare the DistributedOptimizer 48 | if distributed: 49 | model = hvd.load_model(checkpoint) 50 | else: 51 | model = tf.keras.models.load_model(checkpoint) 52 | return epoch, model 53 | raise Exception('Unable to find a checkpoint file at %s' % checkpoint_format) 54 | -------------------------------------------------------------------------------- /cosmoflow/utils/distributed.py: -------------------------------------------------------------------------------- 1 | # 'Regression of 3D Sky Map to Cosmological Parameters (CosmoFlow)' 2 | # Copyright (c) 2018, The Regents of the University of California, 3 | # through Lawrence Berkeley National Laboratory (subject to receipt of any 4 | # required approvals from the U.S. Dept. of Energy). All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # If you have questions about your rights to use or distribute this software, 19 | # please contact Berkeley Lab's Innovation & Partnerships Office at IPO@lbl.gov. 20 | # 21 | # NOTICE. This Software was developed under funding from the U.S. Department of 22 | # Energy and the U.S. Government consequently retains certain rights. As such, 23 | # the U.S. Government has been granted for itself and others acting on its 24 | # behalf a paid-up, nonexclusive, irrevocable, worldwide license in the Software 25 | # to reproduce, distribute copies to the public, prepare derivative works, and 26 | # perform publicly and display publicly, and to permit other to do so. 27 | 28 | """Utilties for distributed processing""" 29 | 30 | import horovod.tensorflow.keras as hvd 31 | 32 | 33 | def rank(): 34 | try: 35 | return hvd.rank() 36 | except ValueError: 37 | return 0 38 | 39 | 40 | def barrier(): 41 | try: 42 | hvd.allreduce([], name='Barrier') 43 | except ValueError: 44 | pass 45 | -------------------------------------------------------------------------------- /cosmoflow/utils/mlperf_logging.py: -------------------------------------------------------------------------------- 1 | # 'Regression of 3D Sky Map to Cosmological Parameters (CosmoFlow)' 2 | # Copyright (c) 2018, The Regents of the University of California, 3 | # through Lawrence Berkeley National Laboratory (subject to receipt of any 4 | # required approvals from the U.S. Dept. of Energy). All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # If you have questions about your rights to use or distribute this software, 19 | # please contact Berkeley Lab's Innovation & Partnerships Office at IPO@lbl.gov. 20 | # 21 | # NOTICE. This Software was developed under funding from the U.S. Department of 22 | # Energy and the U.S. Government consequently retains certain rights. As such, 23 | # the U.S. Government has been granted for itself and others acting on its 24 | # behalf a paid-up, nonexclusive, irrevocable, worldwide license in the Software 25 | # to reproduce, distribute copies to the public, prepare derivative works, and 26 | # perform publicly and display publicly, and to permit other to do so. 27 | 28 | """ 29 | Utilities for MLPerf logging. 30 | Depends on the mlperf_logging package at 31 | https://github.com/mlperf/logging 32 | """ 33 | 34 | # System 35 | import os 36 | 37 | # Externals 38 | try: 39 | from mlperf_logging import mllog 40 | have_mlperf_logging = True 41 | except ImportError: 42 | have_mlperf_logging = False 43 | 44 | 45 | def configure_mllogger(log_dir): 46 | """Setup the MLPerf logger""" 47 | if not have_mlperf_logging: 48 | raise RuntimeError('mlperf_logging package unavailable') 49 | mllog.config(filename=os.path.join(log_dir, 'mlperf.log')) 50 | return mllog.get_mllogger() 51 | 52 | 53 | def log_submission_info(benchmark='cosmoflow', 54 | org='UNDEFINED', 55 | division='UNDEFINED', 56 | status='UNDEFINED', 57 | platform='UNDEFINED'): 58 | """Log general MLPerf submission details from config""" 59 | mllogger = mllog.get_mllogger() 60 | mllogger.event(key=mllog.constants.SUBMISSION_BENCHMARK, value=benchmark) 61 | mllogger.event(key=mllog.constants.SUBMISSION_ORG, value=org) 62 | mllogger.event(key=mllog.constants.SUBMISSION_DIVISION, value=division) 63 | mllogger.event(key=mllog.constants.SUBMISSION_STATUS, value=status) 64 | mllogger.event(key=mllog.constants.SUBMISSION_PLATFORM, value=platform) 65 | -------------------------------------------------------------------------------- /cosmoflow/utils/staging.py: -------------------------------------------------------------------------------- 1 | # 'Regression of 3D Sky Map to Cosmological Parameters (CosmoFlow)' 2 | # Copyright (c) 2018, The Regents of the University of California, 3 | # through Lawrence Berkeley National Laboratory (subject to receipt of any 4 | # required approvals from the U.S. Dept. of Energy). All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | # If you have questions about your rights to use or distribute this software, 19 | # please contact Berkeley Lab's Innovation & Partnerships Office at IPO@lbl.gov. 20 | # 21 | # NOTICE. This Software was developed under funding from the U.S. Department of 22 | # Energy and the U.S. Government consequently retains certain rights. As such, 23 | # the U.S. Government has been granted for itself and others acting on its 24 | # behalf a paid-up, nonexclusive, irrevocable, worldwide license in the Software 25 | # to reproduce, distribute copies to the public, prepare derivative works, and 26 | # perform publicly and display publicly, and to permit other to do so. 27 | 28 | """Utility code for staging data files into local storage""" 29 | 30 | # System imports 31 | import os 32 | import shutil 33 | import logging 34 | 35 | 36 | def stage_files(input_dir, output_dir, n_files, rank=0, size=1): 37 | """Stage specified number of files to directory. 38 | 39 | This function works in a distributed fashion. Each rank will only stage 40 | its chunk of the file list. 41 | """ 42 | if rank == 0: 43 | logging.info(f'Staging {n_files} files to {output_dir}') 44 | 45 | # Find all the files in the input directory 46 | files = sorted(os.listdir(input_dir)) 47 | 48 | # Make sure there are at least enough files available 49 | if len(files) < n_files: 50 | raise ValueError(f'Cannot stage {n_files} files; only {len(files)} available') 51 | 52 | # Take the specified number of files 53 | files = files[:n_files] 54 | 55 | # Copy my chunk into the output directory 56 | os.makedirs(output_dir, exist_ok=True) 57 | for f in files[rank::size]: 58 | logging.debug(f'Staging file {f}') 59 | shutil.copyfile(os.path.join(input_dir, f), 60 | os.path.join(output_dir, f)) 61 | logging.debug('Data staging completed') 62 | -------------------------------------------------------------------------------- /deepcam/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /deepcam/compliance/1.0.0/closed_deepcam_adam.yml: -------------------------------------------------------------------------------- 1 | # General Settings 2 | - KEY: 3 | NAME: seed 4 | REQ: EXACTLY_ONE 5 | CHECK: " v['value'] > 0" 6 | 7 | - KEY: 8 | NAME: global_batch_size 9 | REQ: EXACTLY_ONE 10 | CHECK: " v['value'] > 0" 11 | 12 | - KEY: 13 | NAME: num_workers 14 | REQ: EXACTLY_ONE 15 | CHECK: " v['value'] > 0" 16 | 17 | - KEY: 18 | NAME: batchnorm_group_size 19 | REQ: EXACTLY_ONE 20 | CHECK: " v['value'] > 0" 21 | 22 | - KEY: 23 | NAME: gradient_accumulation_frequency 24 | REQ: EXACTLY_ONE 25 | CHECK: " v['value'] > 0" 26 | 27 | 28 | # Optimizer Parameters 29 | - KEY: 30 | NAME: opt_name 31 | REQ: EXACTLY_ONE 32 | CHECK: " v['value'] in ['Adam', 'AdamW']" 33 | POST: " s['opt_name'] = v['value'] " 34 | 35 | - KEY: 36 | NAME: opt_lr 37 | REQ: EXACTLY_ONE 38 | CHECK: " v['value'] > 0." 39 | 40 | - KEY: 41 | NAME: opt_betas 42 | REQ: EXACTLY_ONE 43 | CHECK: " len(v['value']) == 2" 44 | 45 | - KEY: 46 | NAME: opt_eps 47 | REQ: EXACTLY_ONE 48 | CHECK: " math.isclose(v['value'], 1e-6)" 49 | 50 | 51 | # Scheduler Parameters 52 | - KEY: 53 | NAME: scheduler_type 54 | REQ: EXACTLY_ONE 55 | CHECK: " v['value'] in ['multistep', 'cosine_annealing']" 56 | POST: " s['scheduler_type'] = v['value'] " 57 | 58 | - KEY: 59 | NAME: scheduler_milestones 60 | REQ: AT_LEAST_ONE_OR(scheduler_t_max) 61 | CHECK: " len(v['value']) >= 0 if s['scheduler_type'] == 'multistep' else True " 62 | 63 | - KEY: 64 | NAME: scheduler_decay_rate 65 | REQ: AT_LEAST_ONE_OR(scheduler_t_max) 66 | CHECK: " v['value'] <= 1. if s['scheduler_type'] == 'multistep' else True " 67 | 68 | - KEY: 69 | NAME: scheduler_t_max 70 | REQ: AT_LEAST_ONE_OR(scheduler_milestones) 71 | CHECK: " v['value'] >= 1. if s['scheduler_type'] == 'cosine_annealing' else True " 72 | 73 | - KEY: 74 | NAME: scheduler_eta_min 75 | REQ: AT_LEAST_ONE_OR(scheduler_milestones) 76 | CHECK: " v['value'] >= 0. if s['scheduler_type'] == 'cosine_annealing' else True " 77 | 78 | - KEY: 79 | NAME: scheduler_lr_warmup_steps 80 | REQ: EXACTLY_ONE 81 | CHECK: " v['value'] >= 0 " 82 | 83 | - KEY: 84 | NAME: scheduler_lr_warmup_factor 85 | REQ: EXACTLY_ONE 86 | CHECK: " v['value'] >= 1. " 87 | 88 | # Dataset Properties 89 | - KEY: 90 | NAME: train_samples 91 | REQ: EXACTLY_ONE 92 | CHECK: " v['value'] == 121266" 93 | 94 | - KEY: 95 | NAME: eval_samples 96 | REQ: EXACTLY_ONE 97 | CHECK: " v['value'] == 15158" 98 | 99 | -------------------------------------------------------------------------------- /deepcam/compliance/1.0.0/closed_deepcam_lamb.yml: -------------------------------------------------------------------------------- 1 | # General Settings 2 | - KEY: 3 | NAME: seed 4 | REQ: EXACTLY_ONE 5 | CHECK: " v['value'] > 0" 6 | 7 | - KEY: 8 | NAME: global_batch_size 9 | REQ: EXACTLY_ONE 10 | CHECK: " v['value'] > 0" 11 | 12 | - KEY: 13 | NAME: num_workers 14 | REQ: EXACTLY_ONE 15 | CHECK: " v['value'] > 0" 16 | 17 | - KEY: 18 | NAME: batchnorm_group_size 19 | REQ: EXACTLY_ONE 20 | CHECK: " v['value'] > 0" 21 | 22 | - KEY: 23 | NAME: gradient_accumulation_frequency 24 | REQ: EXACTLY_ONE 25 | CHECK: " v['value'] > 0" 26 | 27 | # Optimizer Parameters 28 | - KEY: 29 | NAME: opt_name 30 | REQ: EXACTLY_ONE 31 | CHECK: " v['value'] == 'LAMB'" 32 | 33 | - KEY: 34 | NAME: opt_lr 35 | REQ: EXACTLY_ONE 36 | CHECK: " v['value'] > 0."
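# Note: each CHECK above and below is a Python expression evaluated by the
# mlperf_logging compliance checker, with v bound to the parsed log event
# (e.g. v['value']) and s to state shared between rules via POST expressions.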
37 | 38 | - KEY: 39 | NAME: opt_betas 40 | REQ: EXACTLY_ONE 41 | CHECK: " len(v['value']) == 2" 42 | 43 | - KEY: 44 | NAME: opt_eps 45 | REQ: EXACTLY_ONE 46 | CHECK: " math.isclose(v['value'], 1e-6)" 47 | 48 | - KEY: 49 | NAME: opt_bias_correction 50 | REQ: EXACTLY_ONE 51 | CHECK: " v['value'] " 52 | 53 | - KEY: 54 | NAME: opt_grad_averaging 55 | REQ: EXACTLY_ONE 56 | CHECK: " v['value'] " 57 | 58 | - KEY: 59 | NAME: opt_max_grad_norm 60 | REQ: EXACTLY_ONE 61 | CHECK: " v['value'] == 1." 62 | 63 | 64 | # Scheduler Parameters 65 | - KEY: 66 | NAME: scheduler_type 67 | REQ: EXACTLY_ONE 68 | CHECK: " v['value'] in ['multistep', 'cosine_annealing']" 69 | POST: " s['scheduler_type'] = v['value'] " 70 | 71 | - KEY: 72 | NAME: scheduler_milestones 73 | REQ: AT_LEAST_ONE_OR(scheduler_t_max) 74 | CHECK: " len(v['value']) >= 0 if s['scheduler_type'] == 'multistep' else True " 75 | 76 | - KEY: 77 | NAME: scheduler_decay_rate 78 | REQ: AT_LEAST_ONE_OR(scheduler_t_max) 79 | CHECK: " v['value'] <= 1. if s['scheduler_type'] == 'multistep' else True " 80 | 81 | - KEY: 82 | NAME: scheduler_t_max 83 | REQ: AT_LEAST_ONE_OR(scheduler_milestones) 84 | CHECK: " v['value'] >= 1. if s['scheduler_type'] == 'cosine_annealing' else True " 85 | 86 | - KEY: 87 | NAME: scheduler_eta_min 88 | REQ: AT_LEAST_ONE_OR(scheduler_milestones) 89 | CHECK: " v['value'] >= 0. if s['scheduler_type'] == 'cosine_annealing' else True " 90 | 91 | - KEY: 92 | NAME: scheduler_lr_warmup_steps 93 | REQ: EXACTLY_ONE 94 | CHECK: " v['value'] >= 0 " 95 | 96 | - KEY: 97 | NAME: scheduler_lr_warmup_factor 98 | REQ: EXACTLY_ONE 99 | CHECK: " v['value'] >= 1. " 100 | 101 | # Dataset Properties 102 | - KEY: 103 | NAME: train_samples 104 | REQ: EXACTLY_ONE 105 | CHECK: " v['value'] == 121266" 106 | 107 | - KEY: 108 | NAME: eval_samples 109 | REQ: EXACTLY_ONE 110 | CHECK: " v['value'] == 15158" 111 | 112 | -------------------------------------------------------------------------------- /deepcam/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # 3 | # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | # this software and associated documentation files (the "Software"), to deal in 7 | # the Software without restriction, including without limitation the rights to 8 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | # the Software, and to permit persons to whom the Software is furnished to do so, 10 | # subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
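# A sketch of how this image is typically built and run (the build command
# mirrors docker/build_docker.sh and is issued from the deepcam directory;
# the run command and host paths are hypothetical):
#   docker build -t mlperf-deepcam:latest -f docker/Dockerfile .
#   docker run --gpus all -v /path/to/deepcam_data:/data mlperf-deepcam:latest \
#     bash /opt/deepCam/run_scripts/run_training.sh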
21 | 22 | FROM nvcr.io/nvidia/pytorch:21.12-py3 23 | 24 | # some requirements 25 | RUN pip install h5py==3.2.1 26 | 27 | #install mpi4py 28 | RUN pip install mpi4py==3.0.3 29 | 30 | #install mlperf logging 31 | RUN pip install "git+https://github.com/mlperf/logging.git@501bbde47f005d67c6357da6e5c1931eab339f8e" 32 | 33 | #copy additional stuff 34 | COPY src/deepCam /opt/deepCam 35 | COPY src/utils /opt/utils 36 | 37 | #create additional folders for mapping data in 38 | RUN mkdir -p /data 39 | -------------------------------------------------------------------------------- /deepcam/docker/build_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # The MIT License (MIT) 4 | # 5 | # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 8 | # this software and associated documentation files (the "Software"), to deal in 9 | # the Software without restriction, including without limitation the rights to 10 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 11 | # the Software, and to permit persons to whom the Software is furnished to do so, 12 | # subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 19 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 20 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 21 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | #we need to step out to expand the build context 25 | cd .. 26 | 27 | #training container 28 | docker build -t mlperf-deepcam:latest -f docker/Dockerfile . 29 | -------------------------------------------------------------------------------- /deepcam/requirements.txt: -------------------------------------------------------------------------------- 1 | apex==0.1 2 | torch==1.8.1 3 | h5py==3.2.1 4 | mpi4py==3.0.3 5 | warmup-scheduler @ git+https://github.com/ildoonet/pytorch-gradual-warmup-lr.git@6b5e8953a80aef5b324104dc0c2e9b8c34d622bd 6 | mlperf-logging @ git+https://github.com/mlperf/logging.git@501bbde47f005d67c6357da6e5c1931eab339f8e 7 | -------------------------------------------------------------------------------- /deepcam/src/deepCam/architecture/__init__.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # 3 | # Copyright (c) 2018 Pyjcsx 4 | # Modifications Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 7 | # this software and associated documentation files (the "Software"), to deal in 8 | # the Software without restriction, including without limitation the rights to 9 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 10 | # the Software, and to permit persons to whom the Software is furnished to do so, 11 | # subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 18 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 19 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 20 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /deepcam/src/deepCam/driver/__init__.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # 3 | # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | # this software and associated documentation files (the "Software"), to deal in 7 | # the Software without restriction, including without limitation the rights to 8 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | # the Software, and to permit persons to whom the Software is furnished to do so, 10 | # subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | 22 | from .trainer import train_epoch 23 | from .validation import validate 24 | -------------------------------------------------------------------------------- /deepcam/src/deepCam/run_scripts/run_training.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # The MIT License (MIT) 4 | # 5 | # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 
6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 8 | # this software and associated documentation files (the "Software"), to deal in 9 | # the Software without restriction, including without limitation the rights to 10 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 11 | # the Software, and to permit persons to whom the Software is furnished to do so, 12 | # subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 19 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 20 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 21 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | # parameters 25 | data_dir="" 26 | output_dir="" 27 | run_tag="test_run" 28 | local_batch_size=2 29 | 30 | python ./train.py \ 31 | --wireup_method "dummy" \ 32 | --run_tag ${run_tag} \ 33 | --data_dir_prefix ${data_dir} \ 34 | --output_dir ${output_dir} \ 35 | --model_prefix "segmentation" \ 36 | --optimizer "LAMB" \ 37 | --adam_eps 1e-6 \ 38 | --start_lr 0.0055 \ 39 | --lr_schedule type="multistep",milestones="800",decay_rate="0.1" \ 40 | --lr_warmup_steps 400 \ 41 | --lr_warmup_factor 1. \ 42 | --weight_decay 1e-2 \ 43 | --logging_frequency 10 \ 44 | --save_frequency 0 \ 45 | --max_epochs 200 \ 46 | --max_inter_threads 4 \ 47 | --seed $(date +%s) \ 48 | --batchnorm_group_size 1 \ 49 | --local_batch_size ${local_batch_size} 50 | -------------------------------------------------------------------------------- /deepcam/src/deepCam/run_scripts/run_training_nranks1024.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # The MIT License (MIT) 4 | # 5 | # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 8 | # this software and associated documentation files (the "Software"), to deal in 9 | # the Software without restriction, including without limitation the rights to 10 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 11 | # the Software, and to permit persons to whom the Software is furnished to do so, 12 | # subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 19 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 20 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 21 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
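In run_training.sh above, the whole multistep schedule is packed into one --lr_schedule argument as a comma-separated key=value list (the shell strips the inner quotes before train.py sees it). A plausible sketch of how such a value splits into a dict; the helper name is hypothetical and deepCam's actual argument parsing may differ:

    def parse_lr_schedule(spec: str) -> dict:
        # "type=multistep,milestones=800,decay_rate=0.1" -> field dict
        return dict(item.split("=", 1) for item in spec.split(","))

    sched = parse_lr_schedule("type=multistep,milestones=800,decay_rate=0.1")
    # {'type': 'multistep', 'milestones': '800', 'decay_rate': '0.1'}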
23 | 24 | #total number of ranks 25 | totalranks=1024 26 | local_batch_size=2 27 | data_dir="" 28 | output_dir="" 29 | run_tag="test_run_nranks-1024" 30 | 31 | mpirun -np ${totalranks} \ 32 | python ./train.py \ 33 | --wireup_method "nccl-openmpi" \ 34 | --run_tag ${run_tag} \ 35 | --data_dir_prefix ${data_dir} \ 36 | --output_dir ${output_dir} \ 37 | --model_prefix "segmentation" \ 38 | --optimizer "LAMB" \ 39 | --adam_eps 1e-6 \ 40 | --start_lr 0.0055 \ 41 | --lr_schedule type="multistep",milestones="800",decay_rate="0.1" \ 42 | --lr_warmup_steps 400 \ 43 | --lr_warmup_factor 1. \ 44 | --weight_decay 1e-2 \ 45 | --logging_frequency 10 \ 46 | --save_frequency 0 \ 47 | --max_epochs 200 \ 48 | --max_inter_threads 4 \ 49 | --seed $(date +%s) \ 50 | --batchnorm_group_size 1 \ 51 | --local_batch_size ${local_batch_size} 52 | -------------------------------------------------------------------------------- /deepcam/src/deepCam/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/hpc/2c627d457004eff77a014205b3151ed48a6fa149/deepcam/src/deepCam/utils/__init__.py -------------------------------------------------------------------------------- /deepcam/src/deepCam/utils/metric.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # 3 | # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | # this software and associated documentation files (the "Software"), to deal in 7 | # the Software without restriction, including without limitation the rights to 8 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | # the Software, and to permit persons to whom the Software is furnished to do so, 10 | # subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
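A quick sanity check on the 1024-rank launch above, assuming the usual one rank per GPU and 8 GPUs per node; both numbers follow directly from the script's variables:

    totalranks = 1024
    local_batch_size = 2
    print(totalranks * local_batch_size)  # global batch size: 2048
    print(totalranks // 8)                # nodes at 8 ranks per node: 128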
21 | 22 | import torch 23 | from torch import Tensor 24 | 25 | 26 | def compute_score(prediction: Tensor, gt: Tensor, num_classes: int) -> Tensor: 27 | # flatten input 28 | batch_size = gt.shape[0] 29 | tpt = torch.zeros((batch_size, num_classes), dtype=torch.long, device=prediction.device) 30 | fpt = torch.zeros((batch_size, num_classes), dtype=torch.long, device=prediction.device) 31 | fnt = torch.zeros((batch_size, num_classes), dtype=torch.long, device=prediction.device) 32 | 33 | # create views: 34 | pv = prediction.view(batch_size, -1) 35 | gtv = gt.view(batch_size, -1) 36 | 37 | # compute per class accuracy 38 | for j in range(0, num_classes): 39 | # compute helper tensors 40 | pv_eq_j = (pv == j) 41 | pv_ne_j = (pv != j) 42 | gtv_eq_j = (gtv == j) 43 | gtv_ne_j = (gtv != j) 44 | 45 | #true positive: prediction and gt agree and gt is of class j: (p == j) & (g == j) 46 | tpt[:, j] = torch.sum(torch.logical_and(pv_eq_j, gtv_eq_j), dim=1) 47 | 48 | #false positive: prediction is of class j and gt not of class j: (p == j) & (g != j) 49 | fpt[:, j] = torch.sum(torch.logical_and(pv_eq_j, gtv_ne_j), dim=1) 50 | 51 | #false negative: prediction is not of class j and gt is of class j: (p != j) & (g == j) 52 | fnt[:, j] = torch.sum(torch.logical_and(pv_ne_j, gtv_eq_j), dim=1) 53 | 54 | # compute IoU per batch 55 | uniont = (tpt + fpt + fnt) * num_classes 56 | iout = torch.sum(torch.nan_to_num(tpt.float() / uniont.float(), nan=1./float(num_classes)), dim=1) 57 | 58 | # average over batch dim 59 | iout = torch.mean(iout) 60 | 61 | return iout 62 | -------------------------------------------------------------------------------- /deepcam/src/deepCam/utils/types.py: -------------------------------------------------------------------------------- 1 | # Ported from torch_optimizer 0.0.1a15: https://pypi.org/project/torch-optimizer/ 2 | 3 | from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union 4 | 5 | from torch import Tensor 6 | 7 | Params = Union[Iterable[Tensor], Iterable[Dict[str, Any]]] 8 | 9 | LossClosure = Callable[[], float] 10 | OptLossClosure = Optional[LossClosure] 11 | Betas2 = Tuple[float, float] 12 | State = Dict[str, Any] 13 | OptFloat = Optional[float] 14 | Nus2 = Tuple[float, float] 15 | -------------------------------------------------------------------------------- /deepcam/src/utils/run_stage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # data dir 4 | mode=$1 5 | src_dir=$2 6 | dest_dir=$3 7 | 8 | # step into source dir 9 | cd $(dirname ${src_dir}) 10 | 11 | # tar 12 | if [ "${mode}" == "tar" ]; then 13 | /opt/mpifileutils/bin/dtar -cf $(basename ${src_dir}).tar $(basename ${src_dir}) 14 | fi 15 | 16 | # zip 17 | if [ "${mode}" == "compress" ]; then 18 | /opt/mpifileutils/bin/dbz2 -z $(basename ${src_dir}).tar 19 | fi 20 | 21 | # bcast 22 | if [ "${mode}" == "broadcast" ]; then 23 | if [ -f $(basename ${src_dir}).tar.dbz2 ]; then 24 | srcfile=$(basename ${src_dir}).tar.dbz2 25 | else 26 | srcfile=$(basename ${src_dir}).tar 27 | fi 28 | /opt/mpifileutils/bin/dbcast ${srcfile} ${dest_dir}/$(basename ${srcfile}) 29 | fi 30 | 31 | # untar 32 | if [ "${mode}" == "untar" ]; then 33 | local_rank=$(( ${PMIX_RANK} % 8 )) 34 | if [ "${local_rank}" == "0" ]; then 35 | time tar -xf ${dest_dir}/$(basename ${src_dir}).tar 36 | fi 37 | fi 38 | -------------------------------------------------------------------------------- /deepcam/src/utils/run_summarize_circe.sh:
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A hpc 3 | #SBATCH -J summarize_cam5 4 | #SBATCH -t 01:00:00 5 | 6 | # The MIT License (MIT) 7 | # 8 | # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 9 | # 10 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 11 | # this software and associated documentation files (the "Software"), to deal in 12 | # the Software without restriction, including without limitation the rights to 13 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 14 | # the Software, and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | # 17 | # The above copyright notice and this permission notice shall be included in all 18 | # copies or substantial portions of the Software. 19 | # 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 22 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 23 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 24 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 25 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 26 | 27 | rankspernode=48 28 | totalranks=$(( ${SLURM_NNODES} * ${rankspernode} )) 29 | 30 | srun --wait=60 --mpi=pmix -N ${SLURM_NNODES} -n ${totalranks} -c $(( 96 / ${rankspernode} )) \ 31 | --container-workdir=/opt/utils \ 32 | --container-mounts=/gpfs/fs1/tkurth/cam5_dataset/All-Hist:/data \ 33 | --container-image=gitlab-master.nvidia.com/tkurth/mlperf-deepcam:debug \ 34 | python summarize_data.py 35 | -------------------------------------------------------------------------------- /open_catalyst/.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | jobs: 4 | build: 5 | docker: 6 | - image: circleci/python:3.7 7 | 8 | steps: 9 | - checkout 10 | 11 | - restore_cache: 12 | keys: 13 | - v0.3-dependencies-{{ checksum "env.common.yml" }}-{{ checksum "env.cpu.yml" }}-{{ checksum "env.gpu.yml" }} 14 | 15 | - run: 16 | name: Install conda 17 | command: | 18 | if [ ! -d "/home/circleci/miniconda" ]; then 19 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh 20 | bash miniconda.sh -b -p "$HOME"/miniconda 21 | source /home/circleci/miniconda/etc/profile.d/conda.sh 22 | conda activate base 23 | # Conda configuration 24 | conda config --set always_yes yes --set auto_update_conda false 25 | # Update conda 26 | conda update conda 27 | fi 28 | - run: 29 | name: Create environment 30 | command: | 31 | if [ ! -d "/home/circleci/miniconda/envs/ocp-models" ]; then 32 | source /home/circleci/miniconda/etc/profile.d/conda.sh 33 | conda activate base 34 | conda install -c conda-forge conda-merge 35 | conda-merge env.common.yml env.cpu.yml > env.yml 36 | conda env create -f env.yml 37 | fi 38 | - save_cache: 39 | paths: 40 | - /home/circleci/miniconda 41 | key: v0.3-dependencies-{{ checksum "env.common.yml" }}-{{ checksum "env.cpu.yml" }}-{{ checksum "env.gpu.yml" }} 42 | 43 | - run: 44 | name: Run tests 45 | command: | 46 | source /home/circleci/miniconda/etc/profile.d/conda.sh 47 | conda activate ocp-models 48 | pip install -e . 
49 | pre-commit install 50 | pytest /home/circleci/project/tests 51 | 52 | - run: 53 | name: Run black 54 | command: | 55 | source /home/circleci/miniconda/etc/profile.d/conda.sh 56 | conda activate ocp-models 57 | pip install black==20.8b1 58 | black . --check 59 | -------------------------------------------------------------------------------- /open_catalyst/.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, E266, E501, E731, W503, F403, F401 3 | max-line-length = 79 4 | max-complexity = 18 5 | select = B,C,E,F,W,T4,B9 6 | -------------------------------------------------------------------------------- /open_catalyst/.gitignore: -------------------------------------------------------------------------------- 1 | wandb 2 | data 3 | checkpoints 4 | results 5 | *.traj 6 | experimental 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | env/ 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *,cover 53 | .hypothesis/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | docs/source/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # IPython Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # dotenv 87 | .env 88 | 89 | # virtualenv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # User directories 100 | Local 101 | 102 | # .DS_Store 103 | .DS_Store 104 | 105 | # VIM swap files 106 | *.swp 107 | 108 | # PyCharm 109 | .idea/ 110 | 111 | # VS Code 112 | .vscode/ 113 | -------------------------------------------------------------------------------- /open_catalyst/.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | multi_line_output=3 3 | include_trailing_comma=True 4 | force_grid_wrap=0 5 | use_parentheses=True 6 | line_length=79 7 | -------------------------------------------------------------------------------- /open_catalyst/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: 20.8b1 4 | hooks: 5 | - id: black 6 | language_version: python3.8 7 | - repo: https://github.com/pre-commit/pre-commit-hooks 8 | rev: v2.3.0 9 | hooks: 10 | - id: flake8 11 | - id: trailing-whitespace 12 | - id: check-added-large-files 13 | - id: end-of-file-fixer 14 | - repo: https://github.com/pre-commit/mirrors-isort 15 | rev: v5.9.1 16 | hooks: 17 | - id: isort 18 | args: ["--profile", 
"black", "--filter-files"] 19 | -------------------------------------------------------------------------------- /open_catalyst/LICENSE.md: -------------------------------------------------------------------------------- 1 | 2 | MIT License 3 | 4 | Copyright (c) Facebook, Inc. and its affiliates. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /open_catalyst/configs/is2re/100k/base.yml: -------------------------------------------------------------------------------- 1 | trainer: energy 2 | 3 | dataset: 4 | - src: data/is2re/100k/train/data.lmdb 5 | normalize_labels: True 6 | target_mean: -1.525913953781128 7 | target_std: 2.279365062713623 8 | - src: data/is2re/all/val_id/data.lmdb 9 | 10 | logger: tensorboard 11 | 12 | task: 13 | dataset: single_point_lmdb 14 | description: "Relaxed state energy prediction from initial structure." 15 | type: regression 16 | metric: mae 17 | labels: 18 | - relaxed energy 19 | -------------------------------------------------------------------------------- /open_catalyst/configs/is2re/100k/cgcnn/cgcnn.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - configs/is2re/100k/base.yml 3 | 4 | model: 5 | name: cgcnn 6 | atom_embedding_size: 384 7 | fc_feat_size: 128 8 | num_fc_layers: 4 9 | num_graph_conv_layers: 5 10 | num_gaussians: 100 11 | cutoff: 6.0 12 | regress_forces: False 13 | use_pbc: True 14 | 15 | # *** Important note *** 16 | # The total number of gpus used for this run was 1. 17 | # If the global batch size (num_gpus * batch_size) is modified 18 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 
19 | 20 | optim: 21 | batch_size: 16 22 | eval_batch_size: 16 23 | num_workers: 16 24 | lr_initial: 0.01 25 | lr_gamma: 0.1 26 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 27 | - 31250 28 | - 56250 29 | - 75000 30 | warmup_steps: 18750 31 | warmup_factor: 0.2 32 | max_epochs: 30 33 | -------------------------------------------------------------------------------- /open_catalyst/configs/is2re/100k/dimenet_plus_plus/dpp.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - configs/is2re/100k/base.yml 3 | 4 | model: 5 | name: dimenetplusplus 6 | hidden_channels: 256 7 | out_emb_channels: 192 8 | num_blocks: 3 9 | cutoff: 6.0 10 | num_radial: 6 11 | num_spherical: 7 12 | num_before_skip: 1 13 | num_after_skip: 2 14 | num_output_layers: 3 15 | regress_forces: False 16 | use_pbc: True 17 | 18 | # *** Important note *** 19 | # The total number of gpus used for this run was 1. 20 | # If the global batch size (num_gpus * batch_size) is modified 21 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 22 | 23 | optim: 24 | batch_size: 2 25 | eval_batch_size: 2 26 | num_workers: 2 27 | lr_initial: 0.0001 28 | lr_gamma: 0.1 29 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 30 | - 200000 31 | - 400000 32 | - 600000 33 | warmup_steps: 100000 34 | warmup_factor: 0.2 35 | max_epochs: 20 36 | -------------------------------------------------------------------------------- /open_catalyst/configs/is2re/100k/schnet/schnet.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - configs/is2re/100k/base.yml 3 | 4 | model: 5 | name: schnet 6 | hidden_channels: 384 7 | num_filters: 128 8 | num_interactions: 4 9 | num_gaussians: 100 10 | cutoff: 6.0 11 | use_pbc: True 12 | regress_forces: False 13 | 14 | # *** Important note *** 15 | # The total number of gpus used for this run was 1. 16 | # If the global batch size (num_gpus * batch_size) is modified 17 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 18 | 19 | optim: 20 | batch_size: 32 21 | eval_batch_size: 32 22 | num_workers: 16 23 | lr_initial: 0.0005 24 | lr_gamma: 0.1 25 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 26 | - 15625 27 | - 31250 28 | - 46875 29 | warmup_steps: 9375 30 | warmup_factor: 0.2 31 | max_epochs: 30 32 | -------------------------------------------------------------------------------- /open_catalyst/configs/is2re/10k/base.yml: -------------------------------------------------------------------------------- 1 | trainer: energy 2 | 3 | dataset: 4 | - src: data/is2re/10k/train/data.lmdb 5 | normalize_labels: True 6 | target_mean: -1.525913953781128 7 | target_std: 2.279365062713623 8 | - src: data/is2re/all/val_id/data.lmdb 9 | 10 | logger: tensorboard 11 | 12 | task: 13 | dataset: single_point_lmdb 14 | description: "Relaxed state energy prediction from initial structure." 
15 | type: regression 16 | metric: mae 17 | labels: 18 | - relaxed energy 19 | -------------------------------------------------------------------------------- /open_catalyst/configs/is2re/10k/cgcnn/cgcnn.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - configs/is2re/10k/base.yml 3 | 4 | model: 5 | name: cgcnn 6 | atom_embedding_size: 128 7 | fc_feat_size: 256 8 | num_fc_layers: 4 9 | num_graph_conv_layers: 5 10 | num_gaussians: 100 11 | cutoff: 6.0 12 | regress_forces: False 13 | use_pbc: True 14 | 15 | # *** Important note *** 16 | # The total number of gpus used for this run was 1. 17 | # If the global batch size (num_gpus * batch_size) is modified 18 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 19 | 20 | optim: 21 | batch_size: 64 22 | eval_batch_size: 64 23 | num_workers: 16 24 | lr_initial: 0.01 25 | lr_gamma: 0.1 26 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 27 | - 781 28 | - 1406 29 | - 2031 30 | warmup_steps: 468 31 | warmup_factor: 0.2 32 | max_epochs: 20 33 | -------------------------------------------------------------------------------- /open_catalyst/configs/is2re/10k/dimenet_plus_plus/dpp.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - configs/is2re/10k/base.yml 3 | 4 | model: 5 | name: dimenetplusplus 6 | hidden_channels: 256 7 | out_emb_channels: 192 8 | num_blocks: 3 9 | cutoff: 6.0 10 | num_radial: 6 11 | num_spherical: 7 12 | num_before_skip: 1 13 | num_after_skip: 2 14 | num_output_layers: 3 15 | regress_forces: False 16 | use_pbc: True 17 | 18 | # *** Important note *** 19 | # The total number of gpus used for this run was 1. 20 | # If the global batch size (num_gpus * batch_size) is modified 21 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 22 | 23 | optim: 24 | batch_size: 2 25 | eval_batch_size: 2 26 | num_workers: 2 27 | lr_initial: 0.0001 28 | lr_gamma: 0.1 29 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 30 | - 20000 31 | - 40000 32 | - 60000 33 | warmup_steps: 10000 34 | warmup_factor: 0.2 35 | max_epochs: 20 36 | -------------------------------------------------------------------------------- /open_catalyst/configs/is2re/10k/schnet/schnet.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - configs/is2re/10k/base.yml 3 | 4 | model: 5 | name: schnet 6 | hidden_channels: 256 7 | num_filters: 128 8 | num_interactions: 3 9 | num_gaussians: 100 10 | cutoff: 6.0 11 | use_pbc: True 12 | regress_forces: False 13 | 14 | # *** Important note *** 15 | # The total number of gpus used for this run was 1. 16 | # If the global batch size (num_gpus * batch_size) is modified 17 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 
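One way to act on notes like the one above, assuming plain linear scaling (each decay point should occur after the same number of training samples, so step counts scale inversely with global batch size); applied to the milestones just below, which were tuned for 1 GPU at batch size 64:

    def rescale_steps(steps, old_global_bs, new_global_bs):
        # keep the number of samples seen at each milestone constant
        return [int(s * old_global_bs / new_global_bs) for s in steps]

    # e.g. moving from 1 GPU x batch 64 to 4 GPUs x batch 64:
    print(rescale_steps([1562, 2343, 3125], 64, 256))  # [390, 585, 781]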
18 | 19 | optim: 20 | batch_size: 64 21 | eval_batch_size: 64 22 | num_workers: 16 23 | lr_initial: 0.005 24 | lr_gamma: 0.1 25 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 26 | - 1562 27 | - 2343 28 | - 3125 29 | warmup_steps: 468 30 | warmup_factor: 0.2 31 | max_epochs: 30 32 | -------------------------------------------------------------------------------- /open_catalyst/configs/is2re/all/base.yml: -------------------------------------------------------------------------------- 1 | trainer: energy 2 | 3 | dataset: 4 | - src: data/is2re/all/train/data.lmdb 5 | normalize_labels: True 6 | target_mean: -1.525913953781128 7 | target_std: 2.279365062713623 8 | - src: data/is2re/all/val_id/data.lmdb 9 | 10 | logger: tensorboard 11 | 12 | task: 13 | dataset: single_point_lmdb 14 | description: "Relaxed state energy prediction from initial structure." 15 | type: regression 16 | metric: mae 17 | labels: 18 | - relaxed energy 19 | -------------------------------------------------------------------------------- /open_catalyst/configs/is2re/all/cgcnn/cgcnn.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - configs/is2re/all/base.yml 3 | 4 | model: 5 | name: cgcnn 6 | atom_embedding_size: 384 7 | fc_feat_size: 512 8 | num_fc_layers: 4 9 | num_graph_conv_layers: 6 10 | num_gaussians: 100 11 | cutoff: 6.0 12 | regress_forces: False 13 | use_pbc: True 14 | 15 | # *** Important note *** 16 | # The total number of gpus used for this run was 4. 17 | # If the global batch size (num_gpus * batch_size) is modified 18 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 19 | 20 | optim: 21 | batch_size: 32 22 | eval_batch_size: 32 23 | num_workers: 16 24 | lr_initial: 0.01 25 | lr_gamma: 0.1 26 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 27 | - 17981 28 | - 32366 29 | - 46752 30 | warmup_steps: 10788 31 | warmup_factor: 0.2 32 | max_epochs: 20 33 | -------------------------------------------------------------------------------- /open_catalyst/configs/is2re/all/dimenet_plus_plus/dpp.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - configs/is2re/all/base.yml 3 | 4 | model: 5 | name: dimenetplusplus 6 | hidden_channels: 256 7 | out_emb_channels: 192 8 | num_blocks: 3 9 | cutoff: 6.0 10 | num_radial: 6 11 | num_spherical: 7 12 | num_before_skip: 1 13 | num_after_skip: 2 14 | num_output_layers: 3 15 | regress_forces: False 16 | use_pbc: True 17 | 18 | # *** Important note *** 19 | # The total number of gpus used for this run was 4. 20 | # If the global batch size (num_gpus * batch_size) is modified 21 | # the lr_milestones and warmup_steps need to be adjusted accordingly.
22 | 23 | optim: 24 | batch_size: 4 25 | eval_batch_size: 4 26 | num_workers: 4 27 | lr_initial: 0.0001 28 | lr_gamma: 0.1 29 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 30 | - 115082 31 | - 230164 32 | - 345246 33 | warmup_steps: 57541 34 | warmup_factor: 0.2 35 | max_epochs: 20 36 | -------------------------------------------------------------------------------- /open_catalyst/configs/is2re/all/schnet/schnet.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - configs/is2re/all/base.yml 3 | 4 | model: 5 | name: schnet 6 | hidden_channels: 384 7 | num_filters: 128 8 | num_interactions: 4 9 | num_gaussians: 100 10 | cutoff: 6.0 11 | use_pbc: True 12 | regress_forces: False 13 | 14 | # *** Important note *** 15 | # The total number of gpus used for this run was 4. 16 | # If the global batch size (num_gpus * batch_size) is modified 17 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 18 | 19 | optim: 20 | batch_size: 64 21 | eval_batch_size: 64 22 | num_workers: 16 23 | lr_initial: 0.001 24 | lr_gamma: 0.1 25 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 26 | - 17981 27 | - 26972 28 | - 35963 29 | warmup_steps: 5394 30 | warmup_factor: 0.2 31 | max_epochs: 30 32 | -------------------------------------------------------------------------------- /open_catalyst/configs/mlperf_hpc.yml: -------------------------------------------------------------------------------- 1 | trainer: mlperf_forces 2 | 3 | dataset: 4 | - src: /global/cfs/cdirs/m1759/catalysis_dl/oc20_data/s2ef/2M/train 5 | normalize_labels: True 6 | target_mean: -0.7554450631141663 7 | target_std: 2.887317180633545 8 | grad_target_mean: 0.0 9 | grad_target_std: 2.887317180633545 10 | - src: /global/cfs/cdirs/m1759/catalysis_dl/oc20_data/s2ef/all/val_id 11 | 12 | logger: wandb 13 | 14 | task: 15 | mlperf_benchmark: oc20 16 | mlperf_org: LBNL 17 | mlperf_division: closed 18 | mlperf_status: onprem 19 | mlperf_platform: SUBMISSION_PLATFORM_PLACEHOLDER 20 | mlperf_accelerators_per_node: 8 21 | mlperf_accelerators_per_rank: 1 22 | 23 | dataset: trajectory_lmdb 24 | description: "Regressing to energies and forces for DFT trajectories from OCP" 25 | type: regression 26 | metric: mae 27 | primary_metric: forces_mae 28 | target_forces_mae: 0.036 29 | labels: 30 | - potential energy 31 | grad_input: atomic forces 32 | train_on_free_atoms: True 33 | eval_on_free_atoms: True 34 | 35 | model: 36 | name: dimenetplusplus 37 | hidden_channels: 192 38 | out_emb_channels: 192 39 | num_blocks: 3 40 | cutoff: 6.0 41 | num_radial: 6 42 | num_spherical: 7 43 | num_before_skip: 1 44 | num_after_skip: 2 45 | num_output_layers: 3 46 | regress_forces: True 47 | use_pbc: True 48 | #otf_graph: True 49 | 50 | # These settings optimized for global batch size (batch_size * gpus) = 256 51 | optim: 52 | batch_size: 8 53 | eval_batch_size: 8 54 | num_workers: 8 55 | 56 | optimizer: AdamW 57 | lr_initial: 0.0004 58 | warmup_steps: 31252 # 4 epochs 59 | warmup_factor: 0.2 60 | lr_milestones: 61 | - 125008 # 16 epochs 62 | - 187512 # 24 epochs 63 | - 250016 # 32 epochs 64 | lr_gamma: 0.1 65 | 66 | max_epochs: 30 67 | energy_coefficient: 0 68 | force_coefficient: 50 69 | disable_tqdm: True 70 | 71 | slurm: 72 | partition: null 73 | constraint: gpu 74 | account: m1759 75 | qos: special 76 | time_min: "4:00:00" 77 | -------------------------------------------------------------------------------- /open_catalyst/configs/pm_b2048.yml: 
-------------------------------------------------------------------------------- 1 | trainer: mlperf_forces 2 | 3 | dataset: 4 | - src: /pscratch/sd/s/sfarrell/ocp/data/s2ef/2M/train 5 | normalize_labels: True 6 | target_mean: -0.7554450631141663 7 | target_std: 2.887317180633545 8 | grad_target_mean: 0.0 9 | grad_target_std: 2.887317180633545 10 | - src: /pscratch/sd/s/sfarrell/ocp/data/s2ef/all/val_id 11 | 12 | logger: wandb 13 | 14 | task: 15 | mlperf_benchmark: oc20 16 | mlperf_org: LBNL 17 | mlperf_division: closed 18 | mlperf_status: onprem 19 | mlperf_platform: SUBMISSION_PLATFORM_PLACEHOLDER 20 | mlperf_accelerators_per_node: 4 21 | mlperf_accelerators_per_rank: 1 22 | 23 | dataset: trajectory_lmdb 24 | description: "Regressing to energies and forces for DFT trajectories from OCP" 25 | type: regression 26 | metric: mae 27 | primary_metric: forces_mae 28 | target_forces_mae: 0.036 29 | labels: 30 | - potential energy 31 | grad_input: atomic forces 32 | train_on_free_atoms: True 33 | eval_on_free_atoms: True 34 | 35 | model: 36 | name: dimenetplusplus 37 | hidden_channels: 192 38 | out_emb_channels: 192 39 | num_blocks: 3 40 | cutoff: 6.0 41 | num_radial: 6 42 | num_spherical: 7 43 | num_before_skip: 1 44 | num_after_skip: 2 45 | num_output_layers: 3 46 | regress_forces: True 47 | use_pbc: True 48 | #otf_graph: True 49 | 50 | # These settings optimized for global batch size (batch_size * gpus) = 2048 51 | optim: 52 | batch_size: 4 53 | eval_batch_size: 8 54 | num_workers: 8 55 | 56 | optimizer: AdamW 57 | lr_initial: 0.0016 58 | warmup_steps: 3908 # 4 epochs 59 | warmup_factor: 0.2 60 | lr_milestones: 61 | - 23448 # 24 epochs 62 | - 31264 # 32 epochs 63 | lr_gamma: 0.1 64 | 65 | max_epochs: 48 66 | energy_coefficient: 0 67 | force_coefficient: 50 68 | 69 | hide_eval_progressbar: True 70 | -------------------------------------------------------------------------------- /open_catalyst/configs/s2ef/200k/base.yml: -------------------------------------------------------------------------------- 1 | trainer: forces 2 | 3 | dataset: 4 | - src: data/s2ef/200k/train/ 5 | normalize_labels: True 6 | target_mean: -0.7554450631141663 7 | target_std: 2.887317180633545 8 | grad_target_mean: 0.0 9 | grad_target_std: 2.887317180633545 10 | - src: data/s2ef/all/val_id/ 11 | 12 | logger: tensorboard 13 | 14 | task: 15 | dataset: trajectory_lmdb 16 | description: "Regressing to energies and forces for DFT trajectories from OCP" 17 | type: regression 18 | metric: mae 19 | labels: 20 | - potential energy 21 | grad_input: atomic forces 22 | train_on_free_atoms: True 23 | eval_on_free_atoms: True 24 | -------------------------------------------------------------------------------- /open_catalyst/configs/s2ef/200k/cgcnn/cgcnn.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - configs/s2ef/200k/base.yml 3 | 4 | model: 5 | name: cgcnn 6 | atom_embedding_size: 128 7 | fc_feat_size: 128 8 | num_fc_layers: 3 9 | num_graph_conv_layers: 2 10 | cutoff: 6.0 11 | num_gaussians: 100 12 | use_pbc: True 13 | 14 | # *** Important note *** 15 | # The total number of gpus used for this run was 4. 16 | # If the global batch size (num_gpus * batch_size) is modified 17 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 
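The two MLPerf configs above show this kind of rescaling applied in practice: configs/mlperf_hpc.yml is tuned for a global batch size of 256 (batch_size 8, i.e. 32 GPUs) while configs/pm_b2048.yml targets 2048 (batch_size 4 on 512 GPUs), and both keep the warmup at 4 epochs. A rough check, assuming the "2M" split holds about 2,000,000 training samples:

    import math

    spe_256 = math.ceil(2_000_000 / 256)     # 7813 steps per epoch
    spe_2048 = math.ceil(2_000_000 / 2048)   # 977 steps per epoch
    assert spe_256 * 4 == 31252    # warmup_steps in mlperf_hpc.yml
    assert spe_2048 * 4 == 3908    # warmup_steps in pm_b2048.yml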
18 | 19 | optim: 20 | batch_size: 32 21 | eval_batch_size: 32 22 | num_workers: 16 23 | lr_initial: 0.0005 24 | lr_gamma: 0.1 25 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 26 | - 23437 27 | - 31250 28 | warmup_steps: 3125 29 | warmup_factor: 0.2 30 | max_epochs: 50 31 | force_coefficient: 10 32 | -------------------------------------------------------------------------------- /open_catalyst/configs/s2ef/200k/dimenet_plus_plus/dpp.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - configs/s2ef/200k/base.yml 3 | 4 | model: 5 | name: dimenetplusplus 6 | hidden_channels: 192 7 | out_emb_channels: 192 8 | num_blocks: 3 9 | cutoff: 6.0 10 | num_radial: 6 11 | num_spherical: 7 12 | num_before_skip: 1 13 | num_after_skip: 2 14 | num_output_layers: 3 15 | regress_forces: True 16 | use_pbc: True 17 | 18 | # *** Important note *** 19 | # The total number of gpus used for this run was 16. 20 | # If the global batch size (num_gpus * batch_size) is modified 21 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 22 | 23 | optim: 24 | batch_size: 12 25 | eval_batch_size: 12 26 | num_workers: 8 27 | lr_initial: 0.00001 28 | lr_gamma: 0.1 29 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 30 | - 5208 31 | - 8333 32 | - 10416 33 | warmup_steps: 3125 34 | warmup_factor: 0.2 35 | max_epochs: 30 36 | force_coefficient: 50 37 | -------------------------------------------------------------------------------- /open_catalyst/configs/s2ef/200k/forcenet/fn_forceonly.yml: -------------------------------------------------------------------------------- 1 | trainer: forces 2 | 3 | dataset: 4 | - src: data/s2ef/200k/train/ 5 | - src: data/s2ef/all/val_id/ 6 | 7 | model: 8 | name: forcenet 9 | num_interactions: 5 10 | cutoff: 6 11 | basis: "sphallmul" 12 | ablation: "none" 13 | depth_mlp_edge: 2 14 | depth_mlp_node: 1 15 | activation_str: "swish" 16 | decoder_activation_str: "swish" 17 | feat: "full" 18 | hidden_channels: 512 19 | decoder_hidden_channels: 512 20 | max_n: 3 21 | 22 | # *** Important note *** 23 | # The total number of gpus used for this run was 8. 24 | # If the global batch size (num_gpus * batch_size) is modified 25 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 26 | 27 | optim: 28 | batch_size: 8 29 | eval_batch_size: 8 30 | eval_every: 10000 31 | num_workers: 8 32 | lr_initial: 0.0005 33 | max_epochs: 20 34 | energy_coefficient: 0 35 | lr_gamma: 0.1 36 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 37 | - 15625 38 | - 25000 39 | - 31250 40 | warmup_steps: 9375 41 | warmup_factor: 0.2 42 | 43 | task: 44 | dataset: trajectory_lmdb 45 | description: "Regressing to energies and forces for DFT trajectories from OCP" 46 | type: regression 47 | metric: mae 48 | primary_metric: forces_mae 49 | labels: 50 | - potential energy 51 | grad_input: atomic forces 52 | tag_specific_weights: 53 | - 0.05 54 | - 1.0 55 | - 1.0 56 | -------------------------------------------------------------------------------- /open_catalyst/configs/s2ef/200k/schnet/schnet.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - configs/s2ef/200k/base.yml 3 | 4 | model: 5 | name: schnet 6 | hidden_channels: 1024 7 | num_filters: 256 8 | num_interactions: 3 9 | num_gaussians: 200 10 | cutoff: 6.0 11 | use_pbc: True 12 | 13 | # *** Important note *** 14 | # The total number of gpus used for this run was 4. 
15 | # If the global batch size (num_gpus * batch_size) is modified 16 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 17 | 18 | optim: 19 | batch_size: 32 20 | eval_batch_size: 32 21 | num_workers: 16 22 | lr_initial: 0.0005 23 | lr_gamma: 0.1 24 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 25 | - 7812 26 | - 12500 27 | - 15625 28 | warmup_steps: 4687 29 | warmup_factor: 0.2 30 | max_epochs: 30 31 | force_coefficient: 100 32 | -------------------------------------------------------------------------------- /open_catalyst/configs/s2ef/20M/base.yml: -------------------------------------------------------------------------------- 1 | trainer: forces 2 | 3 | dataset: 4 | - src: data/s2ef/20M/train/ 5 | normalize_labels: True 6 | target_mean: -0.7554450631141663 7 | target_std: 2.887317180633545 8 | grad_target_mean: 0.0 9 | grad_target_std: 2.887317180633545 10 | - src: data/s2ef/all/val_id/ 11 | 12 | logger: tensorboard 13 | 14 | task: 15 | dataset: trajectory_lmdb 16 | description: "Regressing to energies and forces for DFT trajectories from OCP" 17 | type: regression 18 | metric: mae 19 | labels: 20 | - potential energy 21 | grad_input: atomic forces 22 | train_on_free_atoms: True 23 | eval_on_free_atoms: True 24 | -------------------------------------------------------------------------------- /open_catalyst/configs/s2ef/20M/cgcnn/cgcnn.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - configs/s2ef/20M/base.yml 3 | 4 | model: 5 | name: cgcnn 6 | atom_embedding_size: 512 7 | fc_feat_size: 128 8 | num_fc_layers: 3 9 | num_graph_conv_layers: 3 10 | cutoff: 6.0 11 | num_gaussians: 100 12 | use_pbc: True 13 | 14 | # *** Important note *** 15 | # The total number of gpus used for this run was 48. 16 | # If the global batch size (num_gpus * batch_size) is modified 17 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 18 | 19 | optim: 20 | batch_size: 24 21 | eval_batch_size: 24 22 | num_workers: 16 23 | lr_initial: 0.0005 24 | lr_gamma: 0.1 25 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 26 | - 52083 27 | - 86805 28 | - 121527 29 | warmup_steps: 34722 30 | warmup_factor: 0.2 31 | max_epochs: 20 32 | force_coefficient: 100 33 | -------------------------------------------------------------------------------- /open_catalyst/configs/s2ef/20M/dimenet_plus_plus/dpp.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - configs/s2ef/20M/base.yml 3 | 4 | model: 5 | name: dimenetplusplus 6 | hidden_channels: 192 7 | out_emb_channels: 192 8 | num_blocks: 3 9 | cutoff: 6.0 10 | num_radial: 6 11 | num_spherical: 7 12 | num_before_skip: 1 13 | num_after_skip: 2 14 | num_output_layers: 3 15 | regress_forces: True 16 | use_pbc: True 17 | 18 | # *** Important note *** 19 | # The total number of gpus used for this run was 64. 20 | # If the global batch size (num_gpus * batch_size) is modified 21 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 
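Concretely for the DimeNet++ 20M file above, assuming "20M" means 20,000,000 training samples: 64 GPUs at the batch_size of 12 set just below give a global batch of 768, which puts the schedule near whole-epoch boundaries:

    global_bs = 64 * 12                  # 768
    print(20_000_000 * 2 // global_bs)   # 52083 -> warmup_steps is ~2 epochs
    print(20_000_000 * 3 // global_bs)   # 78125 -> first milestone is ~3 epochs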
22 | 23 | optim: 24 | batch_size: 12 25 | eval_batch_size: 12 26 | eval_every: 10000 27 | num_workers: 8 28 | lr_initial: 0.0001 29 | lr_gamma: 0.1 30 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 31 | - 78125 32 | - 130208 33 | - 208333 34 | warmup_steps: 52083 35 | warmup_factor: 0.2 36 | max_epochs: 15 37 | force_coefficient: 50 38 | -------------------------------------------------------------------------------- /open_catalyst/configs/s2ef/20M/schnet/schnet.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - configs/s2ef/20M/base.yml 3 | 4 | model: 5 | name: schnet 6 | hidden_channels: 1024 7 | num_filters: 256 8 | num_interactions: 5 9 | num_gaussians: 200 10 | cutoff: 6.0 11 | use_pbc: True 12 | 13 | # *** Important note *** 14 | # The total number of gpus used for this run was 48. 15 | # If the global batch size (num_gpus * batch_size) is modified 16 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 17 | 18 | optim: 19 | batch_size: 24 20 | eval_batch_size: 24 21 | eval_every: 10000 22 | num_workers: 16 23 | lr_initial: 0.0001 24 | lr_gamma: 0.1 25 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 26 | - 86805 27 | - 138888 28 | - 173611 29 | warmup_steps: 52083 30 | warmup_factor: 0.2 31 | max_epochs: 30 32 | force_coefficient: 50 33 | -------------------------------------------------------------------------------- /open_catalyst/configs/s2ef/2M/base.yml: -------------------------------------------------------------------------------- 1 | trainer: forces 2 | 3 | dataset: 4 | - src: data/s2ef/2M/train/ 5 | normalize_labels: True 6 | target_mean: -0.7554450631141663 7 | target_std: 2.887317180633545 8 | grad_target_mean: 0.0 9 | grad_target_std: 2.887317180633545 10 | - src: data/s2ef/all/val_id/ 11 | 12 | logger: tensorboard 13 | 14 | task: 15 | dataset: trajectory_lmdb 16 | description: "Regressing to energies and forces for DFT trajectories from OCP" 17 | type: regression 18 | metric: mae 19 | labels: 20 | - potential energy 21 | grad_input: atomic forces 22 | train_on_free_atoms: True 23 | eval_on_free_atoms: True 24 | -------------------------------------------------------------------------------- /open_catalyst/configs/s2ef/2M/cgcnn/cgcnn.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - configs/s2ef/2M/base.yml 3 | 4 | model: 5 | name: cgcnn 6 | atom_embedding_size: 384 7 | fc_feat_size: 128 8 | num_fc_layers: 3 9 | num_graph_conv_layers: 3 10 | cutoff: 6.0 11 | num_gaussians: 100 12 | use_pbc: True 13 | 14 | # *** Important note *** 15 | # The total number of gpus used for this run was 8. 16 | # If the global batch size (num_gpus * batch_size) is modified 17 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 
18 | 19 | optim: 20 | batch_size: 8 21 | eval_batch_size: 8 22 | num_workers: 8 23 | lr_initial: 0.001 24 | lr_gamma: 0.1 25 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 26 | - 156250 27 | - 281250 28 | - 437500 29 | warmup_steps: 62500 30 | warmup_factor: 0.2 31 | max_epochs: 20 32 | force_coefficient: 10 33 | -------------------------------------------------------------------------------- /open_catalyst/configs/s2ef/2M/dimenet_plus_plus/dpp.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - configs/s2ef/2M/base.yml 3 | 4 | model: 5 | name: dimenetplusplus 6 | hidden_channels: 192 7 | out_emb_channels: 192 8 | num_blocks: 3 9 | cutoff: 6.0 10 | num_radial: 6 11 | num_spherical: 7 12 | num_before_skip: 1 13 | num_after_skip: 2 14 | num_output_layers: 3 15 | regress_forces: True 16 | use_pbc: True 17 | 18 | # *** Important note *** 19 | # The total number of gpus used for this run was 32. 20 | # If the global batch size (num_gpus * batch_size) is modified 21 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 22 | 23 | optim: 24 | batch_size: 12 25 | eval_batch_size: 12 26 | eval_every: 10000 27 | num_workers: 8 28 | lr_initial: 0.0001 29 | lr_gamma: 0.1 30 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 31 | - 20833 32 | - 31250 33 | - 41666 34 | warmup_steps: 10416 35 | warmup_factor: 0.2 36 | max_epochs: 15 37 | force_coefficient: 50 38 | -------------------------------------------------------------------------------- /open_catalyst/configs/s2ef/2M/dimenet_plus_plus/dpp_relax.yml: -------------------------------------------------------------------------------- 1 | trainer: forces 2 | 3 | dataset: 4 | - src: data/s2ef/2M/train/ 5 | normalize_labels: True 6 | target_mean: -0.7554450631141663 7 | target_std: 2.887317180633545 8 | grad_target_mean: 0.0 9 | grad_target_std: 2.887317180633545 10 | - src: data/s2ef/all/val_id/ 11 | 12 | logger: tensorboard 13 | 14 | task: 15 | dataset: trajectory_lmdb 16 | description: "Regressing to energies and forces for DFT trajectories from OCP" 17 | type: regression 18 | metric: mae 19 | labels: 20 | - potential energy 21 | grad_input: atomic forces 22 | train_on_free_atoms: True 23 | eval_on_free_atoms: True 24 | relax_dataset: 25 | src: data/is2re/all/test_id/data.lmdb 26 | write_pos: True 27 | relaxation_steps: 200 28 | relax_opt: 29 | maxstep: 0.04 30 | memory: 50 31 | damping: 1.0 32 | alpha: 70.0 33 | traj_dir: "ml-relaxations/dpp-2M-test-id" 34 | 35 | model: 36 | name: dimenetplusplus 37 | hidden_channels: 192 38 | out_emb_channels: 192 39 | num_blocks: 3 40 | cutoff: 6.0 41 | num_radial: 6 42 | num_spherical: 7 43 | num_before_skip: 1 44 | num_after_skip: 2 45 | num_output_layers: 3 46 | regress_forces: True 47 | use_pbc: True 48 | 49 | # *** Important note *** 50 | # The total number of gpus used for this run was 32. 51 | # If the global batch size (num_gpus * batch_size) is modified 52 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 
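Returning to the relax_opt block earlier in this file: maxstep, memory, damping and alpha are classic LBFGS knobs (OCP runs its ML-driven relaxations through its own torch LBFGS implementation). For intuition only, the same settings expressed with ASE's LBFGS on a toy EMT system; this is an illustration, not the code path the trainer actually takes:

    from ase.build import bulk
    from ase.calculators.emt import EMT
    from ase.optimize import LBFGS

    atoms = bulk("Cu")      # stand-in system; OCP relaxes with ML forces instead
    atoms.calc = EMT()
    dyn = LBFGS(atoms, maxstep=0.04, memory=50, damping=1.0, alpha=70.0)
    dyn.run(steps=200)      # relaxation_steps: 200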
53 | 54 | optim: 55 | batch_size: 12 56 | eval_batch_size: 12 57 | eval_every: 10000 58 | num_workers: 8 59 | lr_initial: 0.0001 60 | lr_gamma: 0.1 61 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 62 | - 20833 63 | - 31250 64 | - 41666 65 | warmup_steps: 10416 66 | warmup_factor: 0.2 67 | max_epochs: 15 68 | force_coefficient: 50 69 | -------------------------------------------------------------------------------- /open_catalyst/configs/s2ef/2M/schnet/schnet.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - configs/s2ef/2M/base.yml 3 | 4 | model: 5 | name: schnet 6 | hidden_channels: 1024 7 | num_filters: 256 8 | num_interactions: 5 9 | num_gaussians: 200 10 | cutoff: 6.0 11 | use_pbc: True 12 | 13 | # *** Important note *** 14 | # The total number of gpus used for this run was 8. 15 | # If the global batch size (num_gpus * batch_size) is modified 16 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 17 | 18 | optim: 19 | batch_size: 24 20 | eval_batch_size: 24 21 | num_workers: 16 22 | lr_initial: 0.0001 23 | lr_gamma: 0.1 24 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 25 | - 52083 26 | - 83333 27 | - 104166 28 | warmup_steps: 31250 29 | warmup_factor: 0.2 30 | max_epochs: 30 31 | force_coefficient: 100 32 | -------------------------------------------------------------------------------- /open_catalyst/configs/s2ef/all/base.yml: -------------------------------------------------------------------------------- 1 | trainer: forces 2 | 3 | dataset: 4 | - src: data/s2ef/all/train/ 5 | normalize_labels: True 6 | target_mean: -0.7554450631141663 7 | target_std: 2.887317180633545 8 | grad_target_mean: 0.0 9 | grad_target_std: 2.887317180633545 10 | - src: data/s2ef/all/val_id/ 11 | 12 | logger: tensorboard 13 | 14 | task: 15 | dataset: trajectory_lmdb 16 | description: "Regressing to energies and forces for DFT trajectories from OCP" 17 | type: regression 18 | metric: mae 19 | labels: 20 | - potential energy 21 | grad_input: atomic forces 22 | train_on_free_atoms: True 23 | eval_on_free_atoms: True 24 | -------------------------------------------------------------------------------- /open_catalyst/configs/s2ef/all/cgcnn/cgcnn.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - configs/s2ef/all/base.yml 3 | 4 | model: 5 | name: cgcnn 6 | atom_embedding_size: 512 7 | fc_feat_size: 128 8 | num_fc_layers: 3 9 | num_graph_conv_layers: 3 10 | cutoff: 6.0 11 | num_gaussians: 100 12 | use_pbc: True 13 | 14 | # *** Important note *** 15 | # The total number of gpus used for this run was 32. 16 | # If the global batch size (num_gpus * batch_size) is modified 17 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 
18 | 19 | optim: 20 | batch_size: 24 21 | eval_batch_size: 24 22 | num_workers: 16 23 | lr_initial: 0.0005 24 | lr_gamma: 0.1 25 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 26 | - 523179 27 | - 871966 28 | - 1220752 29 | warmup_steps: 348786 30 | warmup_factor: 0.2 31 | max_epochs: 20 32 | force_coefficient: 10 33 | -------------------------------------------------------------------------------- /open_catalyst/configs/s2ef/all/dimenet_plus_plus/dpp.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - configs/s2ef/all/base.yml 3 | 4 | model: 5 | name: dimenetplusplus 6 | hidden_channels: 192 7 | out_emb_channels: 192 8 | num_blocks: 3 9 | cutoff: 6.0 10 | num_radial: 6 11 | num_spherical: 7 12 | num_before_skip: 1 13 | num_after_skip: 2 14 | num_output_layers: 3 15 | regress_forces: True 16 | use_pbc: True 17 | 18 | # *** Important note *** 19 | # The total number of gpus used for this run was 256. 20 | # If the global batch size (num_gpus * batch_size) is modified 21 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 22 | 23 | optim: 24 | batch_size: 8 25 | eval_batch_size: 8 26 | eval_every: 10000 27 | num_workers: 8 28 | lr_initial: 0.0001 29 | lr_gamma: 0.1 30 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 31 | - 130794 32 | - 196192 33 | - 261589 34 | warmup_steps: 130794 35 | warmup_factor: 0.2 36 | max_epochs: 7 37 | force_coefficient: 50 38 | -------------------------------------------------------------------------------- /open_catalyst/configs/s2ef/all/dimenet_plus_plus/dpp10.7M_forceonly.yml: -------------------------------------------------------------------------------- 1 | trainer: forces 2 | 3 | dataset: 4 | - src: data/s2ef/all/train/ 5 | normalize_labels: True 6 | target_mean: -0.7554450631141663 7 | target_std: 2.887317180633545 8 | grad_target_mean: 0.0 9 | grad_target_std: 2.887317180633545 10 | - src: data/s2ef/all/val_id/ 11 | 12 | logger: tensorboard 13 | 14 | task: 15 | dataset: trajectory_lmdb 16 | description: "Regressing to energies and forces for DFT trajectories from OCP" 17 | type: regression 18 | metric: mae 19 | primary_metric: forces_mae 20 | labels: 21 | - potential energy 22 | grad_input: atomic forces 23 | train_on_free_atoms: True 24 | eval_on_free_atoms: True 25 | 26 | model: 27 | name: dimenetplusplus 28 | hidden_channels: 512 29 | out_emb_channels: 384 30 | num_blocks: 3 31 | cutoff: 6.0 32 | num_radial: 6 33 | num_spherical: 7 34 | num_before_skip: 1 35 | num_after_skip: 2 36 | num_output_layers: 3 37 | regress_forces: True 38 | use_pbc: True 39 | 40 | # *** Important note *** 41 | # The total number of gpus used for this run was 256. 42 | # If the global batch size (num_gpus * batch_size) is modified 43 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 
44 | 45 | optim: 46 | batch_size: 3 47 | eval_batch_size: 3 48 | eval_every: 10000 49 | num_workers: 3 50 | lr_initial: 0.0001 51 | lr_gamma: 0.1 52 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 53 | - 174393 54 | - 348786 55 | - 523179 56 | warmup_steps: 174393 57 | warmup_factor: 0.2 58 | max_epochs: 5 59 | energy_coefficient: 0 60 | force_coefficient: 100 61 | -------------------------------------------------------------------------------- /open_catalyst/configs/s2ef/all/dimenet_plus_plus/dpp_energyonly.yml: -------------------------------------------------------------------------------- 1 | trainer: forces 2 | 3 | dataset: 4 | - src: data/s2ef/all/train/ 5 | normalize_labels: True 6 | target_mean: -0.7554450631141663 7 | target_std: 2.887317180633545 8 | grad_target_mean: 0.0 9 | grad_target_std: 2.887317180633545 10 | - src: data/s2ef/all/val_id/ 11 | 12 | logger: tensorboard 13 | 14 | task: 15 | dataset: trajectory_lmdb 16 | description: "Regressing to energies and forces for DFT trajectories from OCP" 17 | type: regression 18 | metric: mae 19 | primary_metric: energy_mae 20 | labels: 21 | - potential energy 22 | grad_input: atomic forces 23 | train_on_free_atoms: True 24 | eval_on_free_atoms: True 25 | 26 | model: 27 | name: dimenetplusplus 28 | hidden_channels: 192 29 | out_emb_channels: 192 30 | num_blocks: 3 31 | cutoff: 6.0 32 | num_radial: 6 33 | num_spherical: 7 34 | num_before_skip: 1 35 | num_after_skip: 2 36 | num_output_layers: 3 37 | regress_forces: True 38 | use_pbc: True 39 | 40 | # *** Important note *** 41 | # The total number of gpus used for this run was 256. 42 | # If the global batch size (num_gpus * batch_size) is modified 43 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 44 | 45 | optim: 46 | batch_size: 8 47 | eval_batch_size: 8 48 | eval_every: 10000 49 | num_workers: 8 50 | lr_initial: 0.0001 51 | lr_gamma: 0.1 52 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 53 | - 130794 54 | - 196192 55 | - 261589 56 | warmup_steps: 130794 57 | warmup_factor: 0.2 58 | max_epochs: 7 59 | energy_coefficient: 100 60 | force_coefficient: 0 61 | -------------------------------------------------------------------------------- /open_catalyst/configs/s2ef/all/dimenet_plus_plus/dpp_forceonly.yml: -------------------------------------------------------------------------------- 1 | trainer: forces 2 | 3 | dataset: 4 | - src: data/s2ef/all/train/ 5 | normalize_labels: True 6 | target_mean: -0.7554450631141663 7 | target_std: 2.887317180633545 8 | grad_target_mean: 0.0 9 | grad_target_std: 2.887317180633545 10 | - src: data/s2ef/all/val_id/ 11 | 12 | logger: tensorboard 13 | 14 | task: 15 | dataset: trajectory_lmdb 16 | description: "Regressing to energies and forces for DFT trajectories from OCP" 17 | type: regression 18 | metric: mae 19 | primary_metric: forces_mae 20 | labels: 21 | - potential energy 22 | grad_input: atomic forces 23 | train_on_free_atoms: True 24 | eval_on_free_atoms: True 25 | 26 | model: 27 | name: dimenetplusplus 28 | hidden_channels: 192 29 | out_emb_channels: 192 30 | num_blocks: 3 31 | cutoff: 6.0 32 | num_radial: 6 33 | num_spherical: 7 34 | num_before_skip: 1 35 | num_after_skip: 2 36 | num_output_layers: 3 37 | regress_forces: True 38 | use_pbc: True 39 | 40 | # *** Important note *** 41 | # The total number of gpus used for this run was 64. 42 | # If the global batch size (num_gpus * batch_size) is modified 43 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 
44 | 45 | optim: 46 | batch_size: 8 47 | eval_batch_size: 8 48 | eval_every: 10000 49 | num_workers: 8 50 | lr_initial: 0.0001 51 | lr_gamma: 0.1 52 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 53 | - 523179 54 | - 784769 55 | - 1046359 56 | warmup_steps: 523179 57 | warmup_factor: 0.2 58 | max_epochs: 7 59 | energy_coefficient: 0 60 | force_coefficient: 100 61 | -------------------------------------------------------------------------------- /open_catalyst/configs/s2ef/all/schnet/schnet.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - configs/s2ef/all/base.yml 3 | 4 | model: 5 | name: schnet 6 | hidden_channels: 1024 7 | num_filters: 256 8 | num_interactions: 5 9 | num_gaussians: 200 10 | cutoff: 6.0 11 | use_pbc: True 12 | 13 | # *** Important note *** 14 | # The total number of gpus used for this run was 64. 15 | # If the global batch size (num_gpus * batch_size) is modified 16 | # the lr_milestones and warmup_steps need to be adjusted accordingly. 17 | 18 | optim: 19 | batch_size: 20 20 | eval_batch_size: 20 21 | eval_every: 10000 22 | num_workers: 16 23 | lr_initial: 0.0001 24 | lr_gamma: 0.1 25 | lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma 26 | - 313907 27 | - 523179 28 | - 732451 29 | warmup_steps: 209271 30 | warmup_factor: 0.2 31 | max_epochs: 15 32 | force_coefficient: 30 33 | -------------------------------------------------------------------------------- /open_catalyst/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:21.08-py3 2 | 3 | # PyG 4 | RUN FORCE_CUDA=1 TORCH_CUDA_ARCH_LIST="7.0 8.0" pip install --no-cache-dir \ 5 | torch-scatter torch-sparse torch-geometric==1.7.2 6 | 7 | # MLPerf logging 8 | RUN pip install --no-cache-dir git+https://github.com/mlcommons/logging.git 9 | 10 | # Other packages 11 | RUN pip install --no-cache-dir ray submitit demjson wandb ase pymatgen 12 | -------------------------------------------------------------------------------- /open_catalyst/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /open_catalyst/docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /open_catalyst/docs/requirements.txt: -------------------------------------------------------------------------------- 1 | nbsphinx 2 | -------------------------------------------------------------------------------- /open_catalyst/docs/source/conf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree. 6 | """ 7 | 8 | # Configuration file for the Sphinx documentation builder. 9 | # 10 | # This file only contains a selection of the most common options. For a full 11 | # list see the documentation: 12 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 13 | 14 | # -- Path setup -------------------------------------------------------------- 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | 23 | sys.path.insert(0, os.path.abspath("../../")) 24 | 25 | 26 | # -- Project information ----------------------------------------------------- 27 | 28 | project = "Open Catalyst Project" 29 | copyright = "2020, Facebook, Inc." 30 | author = "Anuroop Sriram" 31 | 32 | 33 | # -- General configuration --------------------------------------------------- 34 | 35 | # Add any Sphinx extension module names here, as strings. They can be 36 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 37 | # ones. 38 | extensions = [ 39 | "sphinx.ext.autodoc", 40 | "sphinx.ext.coverage", 41 | "sphinx.ext.napoleon", 42 | "sphinx_rtd_theme", 43 | "nbsphinx", 44 | ] 45 | 46 | # Add any paths that contain templates here, relative to this directory. 47 | templates_path = ["_templates"] 48 | 49 | # List of patterns, relative to source directory, that match files and 50 | # directories to ignore when looking for source files. 51 | # This pattern also affects html_static_path and html_extra_path. 52 | exclude_patterns = [] 53 | 54 | 55 | # -- Options for HTML output ------------------------------------------------- 56 | 57 | # The theme to use for HTML and HTML Help pages. See the documentation for 58 | # a list of builtin themes. 59 | # 60 | html_theme = "sphinx_rtd_theme" 61 | 62 | # Add any paths that contain custom static files (such as style sheets) here, 63 | # relative to this directory. They are copied after the builtin static files, 64 | # so a file named "default.css" will overwrite the builtin "default.css". 
65 | html_static_path = ["_static"] 66 | 67 | master_doc = "index" 68 | -------------------------------------------------------------------------------- /open_catalyst/docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Open Catalyst Project 2 | ===================== 3 | 4 | The Open Catalyst Project is a collaborative research effort between Facebook AI 5 | Research (FAIR) and Carnegie Mellon University’s (CMU) Department of Chemical Engineering. 6 | The aim is to use AI to model and discover new catalysts for use in renewable energy 7 | storage to help in addressing climate change. 8 | 9 | Scalable and cost-effective solutions to renewable energy storage are essential to 10 | addressing the world’s rising energy needs while mitigating climate change. As we 11 | increase our reliance on renewable energy sources such as wind and solar, which produce 12 | intermittent power, storage is needed to transfer power from times of peak generation to 13 | peak demand. This may require the storage of power for hours, days, or months. One solution 14 | that offers the potential of scaling to nation-sized grids is the conversion of 15 | renewable energy to other fuels, such as hydrogen. To be widely adopted, this 16 | process requires cost-effective solutions to running chemical reactions. 17 | 18 | An open challenge is finding low-cost catalysts to drive these reactions at high rates. 19 | Through the use of quantum mechanical simulations (density functional theory), new 20 | catalyst structures can be tested and evaluated. Unfortunately, the high computational 21 | cost of these simulations limits the number of structures that may be tested. The use of 22 | AI or machine learning may provide a method to efficiently approximate these calculations, 23 | leading to new approaches in finding effective catalysts. 24 | 25 | To enable the broader research community to participate in this important project, 26 | we provide baseline models and code on our 27 | `GitHub page <https://github.com/Open-Catalyst-Project/ocp>`_. 28 | 29 | 30 | .. toctree:: 31 | :maxdepth: 1 32 | :caption: Tutorials 33 | 34 | tutorials/getting_started 35 | tutorials/data_playground.ipynb 36 | tutorials/train_s2ef_example.ipynb 37 | tutorials/training 38 | tutorials/submission 39 | 40 | .. 41 | .. toctree:: 42 | :maxdepth: 1 43 | :caption: Modules 44 | 45 | modules/model 46 | modules/dataset 47 | modules/trainer 48 | 49 | Indices and tables 50 | ================== 51 | 52 | * :ref:`genindex` 53 | * :ref:`modindex` 54 | * :ref:`search` 55 | -------------------------------------------------------------------------------- /open_catalyst/docs/source/modules/dataset.rst: -------------------------------------------------------------------------------- 1 | ocpmodels.datasets 2 | ================== 3 | 4 | .. .. currentmodule:: ocpmodels.datasets 5 | 6 | .. .. autosummary:: 7 | .. :toctree: generated 8 | .. :nosignatures: 9 | 10 | .. automodule:: ocpmodels.datasets 11 | :members: 12 | :exclude-members: data_list_collater 13 | -------------------------------------------------------------------------------- /open_catalyst/docs/source/modules/model.rst: -------------------------------------------------------------------------------- 1 | ocpmodels.models 2 | ================ 3 | 4 | .. .. currentmodule:: ocpmodels.models 5 | 6 | .. .. autosummary:: 7 | .. :toctree: generated 8 | .. :nosignatures: 9 | 10 | ..
automodule:: ocpmodels.models 11 | :members: 12 | :exclude-members: 13 | -------------------------------------------------------------------------------- /open_catalyst/docs/source/modules/trainer.rst: -------------------------------------------------------------------------------- 1 | ocpmodels.trainers 2 | ================== 3 | 4 | .. .. currentmodule:: ocpmodels.trainers 5 | 6 | .. .. autosummary:: 7 | .. :toctree: generated 8 | .. :nosignatures: 9 | 10 | .. automodule:: ocpmodels.trainers 11 | :members: 12 | :exclude-members: 13 | -------------------------------------------------------------------------------- /open_catalyst/docs/source/tutorials/submission.rst: -------------------------------------------------------------------------------- 1 | Create EvalAI submission files 2 | ============================== 3 | 4 | EvalAI expects results to be structured in a specific format for a submission to be successful. A submission must contain results from the 4 different splits - in distribution (id), out of distribution adsorbate (ood ads), out of distribution catalyst (ood cat), and out of distribution adsorbate and catalyst (ood both). Constructing the submission file for each of the above tasks is as follows: 5 | 6 | S2EF / IS2RE 7 | ************ 8 | 9 | 1. Run predictions :obj:`--mode predict` on all 4 splits, generating :obj:`predictions.json` files for each split. 10 | 2. Modify :obj:`scripts/make_evalai_json.py` with the corresponding paths of the :obj:`predictions.json` files and run to generate your final submission file :obj:`taskname_split_submission.json` (filename may be modified). 11 | 3. Upload :obj:`taskname_split_submission.json` to EvalAI. 12 | 13 | 14 | IS2RS 15 | ***** 16 | 17 | 1. Ensure :obj:`write_pos: True` is included in your configuration file. Run relaxations :obj:`--mode run-relaxations` on all 4 splits, generating :obj:`relaxed_pos_[DEVICE #].json` files for each split. 18 | 2. For each split, if relaxations were run with multiple GPUs, combine :obj:`relaxed_pos_[DEVICE #].json` into one :obj:`relaxed_pos.json` file using :obj:`scripts/make_evalai_json.py`, otherwise skip to 3. 19 | 3. Modify :obj:`scripts/make_evalai_json.py` with the corresponding paths of the :obj:`relaxed_pos.json` files and run to generate your final submission file :obj:`taskname_split_submission.json` (filename may be modified). 20 | 4. Upload :obj:`taskname_split_submission.json` to EvalAI. 
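A rough sketch of step 1 for S2EF (the config and checkpoint paths here are placeholders, not files shipped with the repository; the per-split configs are assumed to point the test dataset at the corresponding split):

.. code-block:: bash

    # Hypothetical: run inference once per split; each run writes a
    # predictions.json to be collected by scripts/make_evalai_json.py.
    for split in id ood_ads ood_cat ood_both; do
        python main.py --mode predict \
            --config-yml configs/s2ef/2M/schnet/schnet-${split}.yml \
            --checkpoint checkpoints/best_checkpoint.pt
    done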
21 | -------------------------------------------------------------------------------- /open_catalyst/env.common.yml: -------------------------------------------------------------------------------- 1 | name: ocp-models 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - ase=3.21.* 8 | - matplotlib=3.3.* 9 | - pip 10 | - pre-commit=2.10.* 11 | - pymatgen=2020.12.31 12 | - python=3.8.* 13 | - pytorch=1.8.1 14 | - pyyaml=5.4.* 15 | - tensorboard=2.4.* 16 | - tqdm=4.58.* 17 | - sphinx 18 | - nbsphinx 19 | - pandoc 20 | - black 21 | - pip: 22 | - demjson 23 | - Pillow 24 | - git+https://github.com/rusty1s/pytorch_geometric.git@4ea63d3 25 | - wandb 26 | - lmdb==1.1.1 27 | - pytest==6.2.2 28 | - submitit 29 | - sphinx-rtd-theme 30 | -------------------------------------------------------------------------------- /open_catalyst/env.cpu.yml: -------------------------------------------------------------------------------- 1 | dependencies: 2 | - cpuonly 3 | - pip: 4 | - -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html 5 | - torch-cluster 6 | - torch-scatter 7 | - torch-sparse 8 | - torch-spline-conv 9 | -------------------------------------------------------------------------------- /open_catalyst/env.gpu.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - pytorch 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - cudatoolkit=10.2 7 | - pip: 8 | - -f https://pytorch-geometric.com/whl/torch-1.8.0+cu102.html 9 | - torch-cluster 10 | - torch-scatter 11 | - torch-sparse 12 | - torch-spline-conv 13 | -------------------------------------------------------------------------------- /open_catalyst/env.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - pytorch 3 | - nvidia 4 | - defaults 5 | - conda-forge 6 | dependencies: 7 | - ase=3.21.* 8 | - black 9 | - cudatoolkit=11.1 10 | - matplotlib=3.3.* 11 | - nbsphinx 12 | - pandoc 13 | - pip 14 | - pre-commit=2.10.* 15 | - pymatgen=2020.12.31 16 | - python=3.8.* 17 | - pytorch=1.8.0 18 | - pyyaml=5.4.* 19 | - sphinx 20 | - tensorboard=2.4.* 21 | - tqdm=4.58.* 22 | - pip: 23 | - -f https://pytorch-geometric.com/whl/torch-1.8.0+cu111.html 24 | - Pillow 25 | - demjson 26 | - lmdb==1.1.1 27 | - pytest==6.2.2 28 | - ray 29 | - sphinx-rtd-theme 30 | - submitit 31 | - torch-cluster 32 | - torch-scatter 33 | - torch-sparse 34 | - torch-spline-conv 35 | - git+https://github.com/rusty1s/pytorch_geometric.git 36 | - wandb 37 | - git+https://github.com/mlperf-hpc/logging.git@hpc-0.5.0 38 | -------------------------------------------------------------------------------- /open_catalyst/licenses/LICENSE.cgcnn: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Tian Xie 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /open_catalyst/licenses/LICENSE.mmf: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For MMF software 4 | 5 | Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Facebook nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /open_catalyst/logs/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree. 6 | """ 7 | -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/common/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree. 
6 | """ 7 | -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/common/hpo_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree. 6 | """ 7 | 8 | import math 9 | 10 | from ray import tune 11 | 12 | 13 | def tune_reporter( 14 | iters, 15 | train_metrics, 16 | val_metrics, 17 | test_metrics=None, 18 | metric_to_opt="val_loss", 19 | min_max="min", 20 | ): 21 | """ 22 | Wrapper function for tune.report() 23 | 24 | Args: 25 | iters(dict): dict with training iteration info (e.g. steps, epochs) 26 | train_metrics(dict): train metrics dict 27 | val_metrics(dict): val metrics dict 28 | test_metrics(dict, optional): test metrics dict, default is None 29 | metric_to_opt(str, optional): str for val metric to optimize, default is val_loss 30 | min_max(str, optional): either "min" or "max", determines whether metric_to_opt is to be minimized or maximized, default is min 31 | 32 | """ 33 | # labels metric dicts 34 | train = label_metric_dict(train_metrics, "train") 35 | val = label_metric_dict(val_metrics, "val") 36 | # this enables tolerance for NaNs assumes val set is used for optimization 37 | if math.isnan(val[metric_to_opt]): 38 | if min_max == "min": 39 | val[metric_to_opt] = 100000.0 40 | if min_max == "max": 41 | val[metric_to_opt] = 0.0 42 | if test_metrics: 43 | test = label_metric_dict(test_metrics, "test") 44 | else: 45 | test = {} 46 | # report results to Ray Tune 47 | tune.report(**iters, **train, **val, **test) 48 | 49 | 50 | def label_metric_dict(metric_dict, split): 51 | new_dict = {} 52 | for key in metric_dict: 53 | new_dict["{}_{}".format(split, key)] = metric_dict[key] 54 | metric_dict = new_dict 55 | return metric_dict 56 | -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/common/relaxation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/hpc/2c627d457004eff77a014205b3151ed48a6fa149/open_catalyst/ocpmodels/common/relaxation/__init__.py -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/common/relaxation/ml_relaxation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree. 6 | """ 7 | 8 | from pathlib import Path 9 | 10 | import torch 11 | 12 | from ocpmodels.common.registry import registry 13 | 14 | from .optimizers.lbfgs_torch import LBFGS, TorchCalc 15 | 16 | 17 | def ml_relax( 18 | batch, 19 | model, 20 | steps, 21 | fmax, 22 | relax_opt, 23 | device="cuda:0", 24 | transform=None, 25 | early_stop_batch=False, 26 | ): 27 | """ 28 | Runs ML-based relaxations. 29 | Args: 30 | batch: object 31 | model: object 32 | steps: int 33 | Max number of steps in the structure relaxation. 34 | fmax: float 35 | Structure relaxation terminates when the max force 36 | of the system is no bigger than fmax. 37 | relax_opt: str 38 | Optimizer and corresponding parameters to be used for structure relaxations. 
39 | """ 40 | batch = batch[0] 41 | ids = batch.sid 42 | calc = TorchCalc(model, transform) 43 | 44 | # Run ML-based relaxation 45 | traj_dir = relax_opt.get("traj_dir", None) 46 | optimizer = LBFGS( 47 | batch, 48 | calc, 49 | maxstep=relax_opt.get("maxstep", 0.04), 50 | memory=relax_opt["memory"], 51 | damping=relax_opt.get("damping", 1.0), 52 | alpha=relax_opt.get("alpha", 70.0), 53 | device=device, 54 | traj_dir=Path(traj_dir) if traj_dir is not None else None, 55 | traj_names=ids, 56 | early_stop_batch=early_stop_batch, 57 | ) 58 | relaxed_batch = optimizer.run(fmax=fmax, steps=steps) 59 | 60 | return relaxed_batch 61 | -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/common/relaxation/optimizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/hpc/2c627d457004eff77a014205b3151ed48a6fa149/open_catalyst/ocpmodels/common/relaxation/optimizers/__init__.py -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | __all__ = [ 7 | "SinglePointLmdbDataset", 8 | "TrajectoryLmdbDataset", 9 | "data_list_collater", 10 | ] 11 | 12 | from .single_point_lmdb import SinglePointLmdbDataset 13 | from .trajectory_lmdb import TrajectoryLmdbDataset, data_list_collater 14 | -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/datasets/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "ATOMIC_RADII", 3 | "KHOT_EMBEDDINGS", 4 | "CONTINUOUS_EMBEDDINGS", 5 | ] 6 | 7 | from .atomic_radii import ATOMIC_RADII 8 | from .continuous_embeddings import CONTINUOUS_EMBEDDINGS 9 | from .khot_embeddings import KHOT_EMBEDDINGS 10 | -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/datasets/embeddings/atomic_radii.py: -------------------------------------------------------------------------------- 1 | """ 2 | Atomic radii in picometers 3 | 4 | NaN stored for unavailable parameters. 
5 | """ 6 | ATOMIC_RADII = { 7 | 0: float("NaN"), 8 | 1: 25.0, 9 | 2: 120.0, 10 | 3: 145.0, 11 | 4: 105.0, 12 | 5: 85.0, 13 | 6: 70.0, 14 | 7: 65.0, 15 | 8: 60.0, 16 | 9: 50.0, 17 | 10: 160.0, 18 | 11: 180.0, 19 | 12: 150.0, 20 | 13: 125.0, 21 | 14: 110.0, 22 | 15: 100.0, 23 | 16: 100.0, 24 | 17: 100.0, 25 | 18: 71.0, 26 | 19: 220.0, 27 | 20: 180.0, 28 | 21: 160.0, 29 | 22: 140.0, 30 | 23: 135.0, 31 | 24: 140.0, 32 | 25: 140.0, 33 | 26: 140.0, 34 | 27: 135.0, 35 | 28: 135.0, 36 | 29: 135.0, 37 | 30: 135.0, 38 | 31: 130.0, 39 | 32: 125.0, 40 | 33: 115.0, 41 | 34: 115.0, 42 | 35: 115.0, 43 | 36: float("NaN"), 44 | 37: 235.0, 45 | 38: 200.0, 46 | 39: 180.0, 47 | 40: 155.0, 48 | 41: 145.0, 49 | 42: 145.0, 50 | 43: 135.0, 51 | 44: 130.0, 52 | 45: 135.0, 53 | 46: 140.0, 54 | 47: 160.0, 55 | 48: 155.0, 56 | 49: 155.0, 57 | 50: 145.0, 58 | 51: 145.0, 59 | 52: 140.0, 60 | 53: 140.0, 61 | 54: float("NaN"), 62 | 55: 260.0, 63 | 56: 215.0, 64 | 57: 195.0, 65 | 58: 185.0, 66 | 59: 185.0, 67 | 60: 185.0, 68 | 61: 185.0, 69 | 62: 185.0, 70 | 63: 185.0, 71 | 64: 180.0, 72 | 65: 175.0, 73 | 66: 175.0, 74 | 67: 175.0, 75 | 68: 175.0, 76 | 69: 175.0, 77 | 70: 175.0, 78 | 71: 175.0, 79 | 72: 155.0, 80 | 73: 145.0, 81 | 74: 135.0, 82 | 75: 135.0, 83 | 76: 130.0, 84 | 77: 135.0, 85 | 78: 135.0, 86 | 79: 135.0, 87 | 80: 150.0, 88 | 81: 190.0, 89 | 82: 180.0, 90 | 83: 160.0, 91 | 84: 190.0, 92 | 85: float("NaN"), 93 | 86: float("NaN"), 94 | 87: float("NaN"), 95 | 88: 215.0, 96 | 89: 195.0, 97 | 90: 180.0, 98 | 91: 180.0, 99 | 92: 175.0, 100 | 93: 175.0, 101 | 94: 175.0, 102 | 95: 175.0, 103 | 96: float("NaN"), 104 | 97: float("NaN"), 105 | 98: float("NaN"), 106 | 99: float("NaN"), 107 | 100: float("NaN"), 108 | } 109 | -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/datasets/single_point_lmdb.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree. 6 | """ 7 | 8 | import os 9 | import pickle 10 | 11 | import lmdb 12 | from torch.utils.data import Dataset 13 | 14 | from ocpmodels.common.registry import registry 15 | 16 | 17 | @registry.register_dataset("single_point_lmdb") 18 | class SinglePointLmdbDataset(Dataset): 19 | r"""Dataset class to load from LMDB files containing single point computations. 20 | Useful for Initial Structure to Relaxed Energy (IS2RE) task. 21 | 22 | Args: 23 | config (dict): Dataset configuration 24 | transform (callable, optional): Data transform function. 25 | (default: :obj:`None`) 26 | """ 27 | 28 | def __init__(self, config, transform=None): 29 | super(SinglePointLmdbDataset, self).__init__() 30 | 31 | self.config = config 32 | 33 | self.db_path = self.config["src"] 34 | assert os.path.isfile(self.db_path), "{} not found".format( 35 | self.db_path 36 | ) 37 | 38 | self.env = self.connect_db(self.db_path) 39 | 40 | self._keys = [ 41 | f"{j}".encode("ascii") for j in range(self.env.stat()["entries"]) 42 | ] 43 | self.transform = transform 44 | 45 | def __len__(self): 46 | return len(self._keys) 47 | 48 | def __getitem__(self, idx): 49 | # Return features. 
50 | datapoint_pickled = self.env.begin().get(self._keys[idx]) 51 | data_object = pickle.loads(datapoint_pickled) 52 | data_object = ( 53 | data_object 54 | if self.transform is None 55 | else self.transform(data_object) 56 | ) 57 | 58 | return data_object 59 | 60 | def connect_db(self, lmdb_path=None): 61 | env = lmdb.open( 62 | lmdb_path, 63 | subdir=False, 64 | readonly=True, 65 | lock=False, 66 | readahead=False, 67 | meminit=False, 68 | max_readers=1, 69 | ) 70 | return env 71 | 72 | def close_db(self): 73 | self.env.close() 74 | -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | __all__ = [ 7 | "BaseModel", 8 | "CGCNN", 9 | "DimeNet", 10 | "DimeNetPlusPlus", 11 | "SchNet", 12 | "ForceNet", 13 | ] 14 | 15 | from .base import BaseModel 16 | from .cgcnn import CGCNN 17 | from .dimenet import DimeNetWrap as DimeNet 18 | from .dimenet_plus_plus import DimeNetPlusPlusWrap as DimeNetPlusPlus 19 | from .forcenet import ForceNet 20 | from .schnet import SchNetWrap as SchNet 21 | 22 | DimeNet.__module__ = __name__ 23 | DimeNet.__name__ = "DimeNet" 24 | 25 | DimeNetPlusPlus.__module__ = __name__ 26 | DimeNetPlusPlus.__name__ = "DimeNetPlusPlus" 27 | 28 | SchNet.__module__ = __name__ 29 | SchNet.__name__ = "SchNet" 30 | 31 | ForceNet.__module__ = __name__ 32 | ForceNet.__name__ = "ForceNet" 33 | -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/models/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree. 6 | """ 7 | 8 | import torch.nn as nn 9 | 10 | 11 | class BaseModel(nn.Module): 12 | def __init__(self, num_atoms=None, bond_feat_dim=None, num_targets=None): 13 | super(BaseModel, self).__init__() 14 | self.num_atoms = num_atoms 15 | self.bond_feat_dim = bond_feat_dim 16 | self.num_targets = num_targets 17 | 18 | def forward(self, data): 19 | raise NotImplementedError 20 | 21 | @property 22 | def num_params(self): 23 | return sum(p.numel() for p in self.parameters()) 24 | -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/models/utils/activations.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree. 
6 | """ 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | 12 | class Act(torch.nn.Module): 13 | def __init__(self, act, slope=0.05): 14 | super(Act, self).__init__() 15 | self.act = act 16 | self.slope = slope 17 | self.shift = torch.log(torch.tensor(2.0)).item() 18 | 19 | def forward(self, input): 20 | if self.act == "relu": 21 | return F.relu(input) 22 | elif self.act == "leaky_relu": 23 | return F.leaky_relu(input) 24 | elif self.act == "sp": 25 | return F.softplus(input, beta=1) 26 | elif self.act == "leaky_sp": 27 | return F.softplus(input, beta=1) - self.slope * F.relu(-input) 28 | elif self.act == "elu": 29 | return F.elu(input, alpha=1) 30 | elif self.act == "leaky_elu": 31 | return F.elu(input, alpha=1) - self.slope * F.relu(-input) 32 | elif self.act == "ssp": 33 | return F.softplus(input, beta=1) - self.shift 34 | elif self.act == "leaky_ssp": 35 | return ( 36 | F.softplus(input, beta=1) 37 | - self.slope * F.relu(-input) 38 | - self.shift 39 | ) 40 | elif self.act == "tanh": 41 | return torch.tanh(input) 42 | elif self.act == "leaky_tanh": 43 | return torch.tanh(input) + self.slope * input 44 | elif self.act == "swish": 45 | return torch.sigmoid(input) * input 46 | else: 47 | raise RuntimeError(f"Undefined activation called {self.act}") 48 | -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/modules/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree. 6 | """ 7 | -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/modules/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from ocpmodels.common import distutils 5 | 6 | 7 | class L2MAELoss(nn.Module): 8 | def __init__(self, reduction="mean"): 9 | super().__init__() 10 | self.reduction = reduction 11 | assert reduction in ["mean", "sum"] 12 | 13 | def forward(self, input: torch.Tensor, target: torch.Tensor): 14 | dists = torch.norm(input - target, p=2, dim=-1) 15 | if self.reduction == "mean": 16 | return torch.mean(dists) 17 | elif self.reduction == "sum": 18 | return torch.sum(dists) 19 | 20 | 21 | class DDPLoss(nn.Module): 22 | def __init__(self, loss_fn, reduction="mean"): 23 | super().__init__() 24 | self.loss_fn = loss_fn 25 | self.loss_fn.reduction = "sum" 26 | self.reduction = reduction 27 | assert reduction in ["mean", "sum"] 28 | 29 | def forward(self, input: torch.Tensor, target: torch.Tensor): 30 | loss = self.loss_fn(input, target) 31 | if self.reduction == "mean": 32 | num_samples = input.shape[0] 33 | num_samples = distutils.all_reduce( 34 | num_samples, device=input.device 35 | ) 36 | # Multiply by world size since gradients are averaged 37 | # across DDP replicas 38 | return loss * distutils.get_world_size() / num_samples 39 | else: 40 | return loss 41 | -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/modules/normalizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree. 
6 | """ 7 | 8 | import torch 9 | 10 | 11 | class Normalizer(object): 12 | """Normalize a Tensor and restore it later.""" 13 | 14 | def __init__(self, tensor=None, mean=None, std=None, device=None): 15 | """tensor is taken as a sample to calculate the mean and std""" 16 | if tensor is None and mean is None: 17 | return 18 | 19 | if device is None: 20 | device = "cpu" 21 | 22 | if tensor is not None: 23 | self.mean = torch.mean(tensor, dim=0).to(device) 24 | self.std = torch.std(tensor, dim=0).to(device) 25 | return 26 | 27 | if mean is not None and std is not None: 28 | self.mean = torch.tensor(mean).to(device) 29 | self.std = torch.tensor(std).to(device) 30 | 31 | def to(self, device): 32 | self.mean = self.mean.to(device) 33 | self.std = self.std.to(device) 34 | 35 | def norm(self, tensor): 36 | return (tensor - self.mean) / self.std 37 | 38 | def denorm(self, normed_tensor): 39 | return normed_tensor * self.std + self.mean 40 | 41 | def state_dict(self): 42 | return {"mean": self.mean, "std": self.std} 43 | 44 | def load_state_dict(self, state_dict): 45 | self.mean = state_dict["mean"].to(self.mean.device) 46 | self.std = state_dict["std"].to(self.mean.device) 47 | -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/modules/scheduler.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | import torch.optim.lr_scheduler as lr_scheduler 4 | 5 | from ocpmodels.common.utils import warmup_lr_lambda 6 | 7 | 8 | class LRScheduler: 9 | """ 10 | Learning rate scheduler class for torch.optim learning rate schedulers 11 | 12 | Notes: 13 | If no learning rate scheduler is specified in the config the default 14 | scheduler is warmup_lr_lambda (ocpmodels.common.utils) not no scheduler, 15 | this is for backward-compatibility reasons. To run without a lr scheduler 16 | specify scheduler: "Null" in the optim section of the config. 17 | 18 | Args: 19 | config (dict): Optim dict from the input config 20 | optimizer (obj): torch optim object 21 | """ 22 | 23 | def __init__(self, optimizer, config): 24 | self.optimizer = optimizer 25 | self.config = config.copy() 26 | if "scheduler" in self.config: 27 | self.scheduler_type = self.config["scheduler"] 28 | else: 29 | self.scheduler_type = "LambdaLR" 30 | scheduler_lambda_fn = lambda x: warmup_lr_lambda(x, self.config) 31 | self.config["lr_lambda"] = scheduler_lambda_fn 32 | 33 | if self.scheduler_type != "Null": 34 | self.scheduler = getattr(lr_scheduler, self.scheduler_type) 35 | scheduler_args = self.filter_kwargs(config) 36 | self.scheduler = self.scheduler(optimizer, **scheduler_args) 37 | 38 | def step(self, metrics=None, epoch=None): 39 | if self.scheduler_type == "Null": 40 | return 41 | if self.scheduler_type == "ReduceLROnPlateau": 42 | if metrics is None: 43 | raise Exception( 44 | "Validation set required for ReduceLROnPlateau." 
45 | ) 46 | self.scheduler.step(metrics) 47 | else: 48 | self.scheduler.step() 49 | 50 | def filter_kwargs(self, config): 51 | # adapted from https://stackoverflow.com/questions/26515595/ 52 | sig = inspect.signature(self.scheduler) 53 | filter_keys = [ 54 | param.name 55 | for param in sig.parameters.values() 56 | if param.kind == param.POSITIONAL_OR_KEYWORD 57 | ] 58 | filter_keys.remove("optimizer") 59 | scheduler_args = { 60 | arg: self.config[arg] for arg in self.config if arg in filter_keys 61 | } 62 | return scheduler_args 63 | 64 | def get_lr(self): 65 | for group in self.optimizer.param_groups: 66 | return group["lr"] 67 | -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree. 6 | """ 7 | 8 | from .atoms_to_graphs import AtomsToGraphs 9 | -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | __all__ = ["TrainTask", "PredictTask", "ValidateTask", "RelaxationTask"] 7 | 8 | from .task import PredictTask, RelaxationTask, TrainTask, ValidateTask 9 | -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/tasks/task.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree.
6 | """ 7 | 8 | import os 9 | 10 | from ocpmodels.common.registry import registry 11 | from ocpmodels.trainers.forces_trainer import ForcesTrainer 12 | 13 | 14 | class BaseTask: 15 | def __init__(self, config): 16 | self.config = config 17 | 18 | def setup(self, trainer): 19 | self.trainer = trainer 20 | if self.config["checkpoint"] is not None: 21 | self.trainer.load_checkpoint(self.config["checkpoint"]) 22 | 23 | # save checkpoint path to runner state for slurm resubmissions 24 | self.chkpt_path = os.path.join( 25 | self.trainer.config["cmd"]["checkpoint_dir"], "checkpoint.pt" 26 | ) 27 | 28 | def run(self): 29 | raise NotImplementedError 30 | 31 | 32 | @registry.register_task("train") 33 | class TrainTask(BaseTask): 34 | def run(self): 35 | self.trainer.train( 36 | disable_eval_tqdm=self.config.get("hide_eval_progressbar", False) 37 | ) 38 | 39 | 40 | @registry.register_task("predict") 41 | class PredictTask(BaseTask): 42 | def run(self): 43 | assert ( 44 | self.trainer.test_loader is not None 45 | ), "Test dataset is required for making predictions" 46 | assert self.config["checkpoint"] 47 | results_file = "predictions" 48 | self.trainer.predict( 49 | self.trainer.test_loader, 50 | results_file=results_file, 51 | disable_tqdm=self.config.get("hide_eval_progressbar", False), 52 | ) 53 | 54 | 55 | @registry.register_task("validate") 56 | class ValidateTask(BaseTask): 57 | def run(self): 58 | # Note that the results won't be precise on multi GPUs due to padding of extra images (although the difference should be minor) 59 | assert ( 60 | self.trainer.val_loader is not None 61 | ), "Val dataset is required for making predictions" 62 | assert self.config["checkpoint"] 63 | self.trainer.validate( 64 | split="val", 65 | disable_tqdm=self.config.get("hide_eval_progressbar", False), 66 | ) 67 | 68 | 69 | @registry.register_task("run-relaxations") 70 | class RelxationTask(BaseTask): 71 | def run(self): 72 | assert isinstance( 73 | self.trainer, ForcesTrainer 74 | ), "Relaxations are only possible for ForcesTrainer" 75 | assert ( 76 | self.trainer.relax_dataset is not None 77 | ), "Relax dataset is required for making predictions" 78 | assert self.config["checkpoint"] 79 | self.trainer.run_relaxations() 80 | -------------------------------------------------------------------------------- /open_catalyst/ocpmodels/trainers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | __all__ = [ 7 | "BaseTrainer", 8 | "ForcesTrainer", 9 | "EnergyTrainer", 10 | ] 11 | 12 | from .base_trainer import BaseTrainer 13 | from .energy_trainer import EnergyTrainer 14 | from .forces_trainer import ForcesTrainer 15 | -------------------------------------------------------------------------------- /open_catalyst/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 79 3 | include = '\.pyi?$' 4 | exclude = ''' 5 | /( 6 | \.git 7 | | \.hg 8 | | \.mypy_cache 9 | | \.tox 10 | | \.venv 11 | | _build 12 | | buck-out 13 | | build 14 | | dist 15 | )/ 16 | ''' 17 | -------------------------------------------------------------------------------- /open_catalyst/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. 
and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree. 6 | """ 7 | -------------------------------------------------------------------------------- /open_catalyst/scripts/hpo/README.md: -------------------------------------------------------------------------------- 1 | # Running Hyperparameter Optimization with Ray Tune 2 | 3 | ## Installation 4 | `pip install ray ray[tune]` 5 | 6 | ## Model config considerations 7 | 8 | The current Ray Tune implementation uses the standard OCP config. However, there are a number of config settings that require additional consideration. 9 | 10 | ``` 11 | logger: None 12 | is_hpo: True 13 | 14 | optim: 15 | … 16 | eval_every: (int) number of steps 17 | checkpoint_every: (int: optional) number of steps 18 | ``` 19 | The first two are easily set. The logger is set to None because Ray Tune handles the logging internally. 20 | 21 | The `eval_every` setting is case specific and will likely require some experimentation. The `eval_every` flag sets how often the validation set is run, in number of steps. Depending on the OCP model and dataset of interest, training for a single epoch can take a substantial amount of time. However, to take full advantage of HPO methods that minimize compute by terminating unpromising trials, such as successive halving, communication of train and val metrics needs to happen on shorter timescales. Paraphrasing the Ray Tune docs, `eval_every` should be set large enough to avoid overheads but small enough to report progress periodically (a timescale of minutes is recommended). 22 | 23 | The `eval_every` setting is only available for the force trainer; when using the energy trainer, validation and reporting to Ray Tune happen once per epoch. 24 | 25 | The `checkpoint_every` setting determines how frequently, in steps, Ray Tune will write a checkpoint. Checkpointing can create a lot of overhead for certain HPO methods, so do not do it too frequently. The default behavior is no checkpointing. 26 | 27 | ## Usage with Slurm 28 | 29 | 1. Make the necessary changes to `run_tune.py` and `slurm/submit-ray-cluster.sbatch` 30 | 31 | Example `run_tune.py` updates 32 | - choose search and scheduler algorithms and set associated parameters (see [Ray Tune docs](https://docs.ray.io/en/master/tune/index.html) for details) 33 | - set the resources to use per individual trial 34 | 35 | Example `slurm/submit-ray-cluster.sbatch` updates 36 | - load modules or set up your conda env 37 | - change the total run time and resources to use 38 | 39 | 2. Submit using `sbatch slurm/submit-ray-cluster.sbatch` 40 | 41 | Slurm scripts taken from https://github.com/NERSC/slurm-ray-cluster 42 | 43 | For usage with other cluster managers or cloud resources please refer to the 44 | [Distributed Ray Docs](https://docs.ray.io/en/master/cluster/index.html#) 45 | 46 | ## Examples 47 | 48 | 1. Asynchronous Successive Halving — `ocp/scripts/hpo/run_tune.py` 49 | 2. Population Based Training — `ocp/scripts/hpo/run_tune_pbt.py` 50 | 51 | ## Testing/Debugging Ray Tune 52 | 53 | - In `run_tune.py` set `ray.init(local_mode=True)` 54 | - run `python path_to/run_tune.py --mode train --config-yml path_to/config --run_dir path_to_run_dir`
and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree. 6 | """ 7 | -------------------------------------------------------------------------------- /open_catalyst/scripts/hpo/slurm/start-head.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export LC_ALL=C.UTF-8 4 | export LANG=C.UTF-8 5 | 6 | echo "starting ray head node" 7 | # Launch the head node 8 | ray start --head --node-ip-address=$1 --port=6379 --redis-password=$2 9 | sleep infinity 10 | -------------------------------------------------------------------------------- /open_catalyst/scripts/hpo/slurm/start-worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export LC_ALL=C.UTF-8 4 | export LANG=C.UTF-8 5 | 6 | echo "starting ray worker node" 7 | ray start --address $1 --redis-password=$2 8 | sleep infinity 9 | -------------------------------------------------------------------------------- /open_catalyst/scripts/hpo/slurm/submit-ray-cluster.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -C gpu 4 | #SBATCH --time=00:10:00 5 | 6 | ### This script works for any number of nodes; Ray will find and manage all resources 7 | #SBATCH --nodes=1 8 | 9 | ### Give all resources to a single Ray task; Ray can manage the resources internally 10 | #SBATCH --ntasks-per-node=1 11 | #SBATCH --gpus-per-task=8 12 | #SBATCH --cpus-per-task=80 13 | 14 | 15 | # Load modules or your own conda environment here 16 | # e.g. conda activate ocp-models 17 | 18 | ################# DO NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING ############### 19 | # This script is a modification of the implementation suggested by gregSchwartz18 here: 20 | # https://github.com/ray-project/ray/issues/826#issuecomment-522116599 21 | redis_password=$(uuidgen) 22 | export redis_password 23 | 24 | nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST) # Getting the node names 25 | nodes_array=( $nodes ) 26 | 27 | node_1=${nodes_array[0]} 28 | ip=$(srun --nodes=1 --ntasks=1 -w $node_1 hostname --ip-address) # making redis-address 29 | port=6379 30 | ip_head=$ip:$port 31 | export ip_head 32 | echo "IP Head: $ip_head" 33 | 34 | echo "STARTING HEAD at $node_1" 35 | srun --nodes=1 --ntasks=1 -w $node_1 start-head.sh $ip $redis_password & 36 | sleep 45 37 | 38 | worker_num=$(($SLURM_JOB_NUM_NODES - 1)) # number of nodes other than the head node 39 | for (( i=1; i<=$worker_num; i++ )) 40 | do 41 | node_i=${nodes_array[$i]} 42 | echo "STARTING WORKER $i at $node_i" 43 | srun --nodes=1 --ntasks=1 -w $node_i start-worker.sh $ip_head $redis_password & 44 | sleep 5 45 | done 46 | ############################################################################################## 47 | 48 | #### call your code below 49 | # e.g. python path_to/run_tune.py --mode train --config-yml path_to/configs/s2ef/200k/forcenet/fn_forceonly.yml --run_dir path_to_run_dir 50 | exit 51 | -------------------------------------------------------------------------------- /open_catalyst/scripts/run_training.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script will run the actual training command with provided 4 | # command line options. It is run by every rank and sets the per-rank 5 | # environment variables needed for pytorch distributed initialization.
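# (Illustrative, added editorially.) For example, a job launched with
# 2 nodes x 4 tasks per node gives WORLD_SIZE=8, RANK 0..7 across the job,
# and LOCAL_RANK 0..3 within each node.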
6 | 7 | args=$@ 8 | id=${SLURM_JOB_NAME}-n${SLURM_NTASKS}-${SLURM_JOB_ID} 9 | 10 | export WORLD_SIZE=$SLURM_NTASKS 11 | export RANK=$SLURM_PROCID 12 | export LOCAL_RANK=$SLURM_LOCALID 13 | 14 | python main.py --mode train \ 15 | --distributed \ 16 | --local_rank $LOCAL_RANK \ 17 | --identifier $id $args 18 | -------------------------------------------------------------------------------- /open_catalyst/scripts/train_cgpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -C gpu 3 | #SBATCH -J ocp-cgpu 4 | #SBATCH --ntasks-per-node=2 5 | #SBATCH --gpus-per-task=1 6 | #SBATCH --cpus-per-task=10 7 | #SBATCH --time 4:00:00 8 | #SBATCH -o logs/slurm-%x-%j.out 9 | 10 | args=$@ 11 | 12 | # Default settings 13 | : "${OCP_CONFIG:=configs/mlperf_hpc.yml}" 14 | 15 | # Setup software 16 | conda activate ocp-dev 17 | module load cuda/11.1.1 18 | 19 | # Distributed config 20 | export MASTER_ADDR=$(hostname) 21 | export MASTER_PORT=29504 22 | export NCCL_DEBUG=WARN 23 | export NCCL_SOCKET_IFNAME=eth 24 | export NCCL_IB_HCA=mlx5_0:1,mlx5_2:1,mlx5_4:1,mlx5_6:1 25 | 26 | set -x 27 | srun -u -l scripts/run_training.sh --config-yml $OCP_CONFIG $args 28 | -------------------------------------------------------------------------------- /open_catalyst/scripts/train_cgpu_shifter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -C gpu 3 | #SBATCH -J ocp-cgpu 4 | #SBATCH --image=sfarrell/mlperf-ocp:latest 5 | #SBATCH --ntasks-per-node=2 6 | #SBATCH --gpus-per-task=1 7 | #SBATCH --cpus-per-task=10 8 | #SBATCH --time 4:00:00 9 | #SBATCH -o logs/slurm-%x-%j.out 10 | 11 | args=$@ 12 | 13 | # Default settings 14 | : "${OCP_CONFIG:=configs/mlperf_hpc.yml}" 15 | 16 | # Distributed config 17 | export MASTER_ADDR=$(hostname) 18 | export MASTER_PORT=29504 19 | export NCCL_DEBUG=WARN 20 | export NCCL_SOCKET_IFNAME=eth 21 | export NCCL_IB_HCA=mlx5_0:1,mlx5_2:1,mlx5_4:1,mlx5_6:1 22 | 23 | set -x 24 | srun -l -u shifter scripts/run_training.sh --config-yml $OCP_CONFIG $args 25 | -------------------------------------------------------------------------------- /open_catalyst/scripts/train_pm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -C gpu 3 | #SBATCH -J ocp-pm 4 | #SBATCH --ntasks-per-node=4 5 | #SBATCH --gpus-per-task=1 6 | #SBATCH --cpus-per-task=32 7 | #SBATCH --time 4:00:00 8 | #SBATCH -o logs/slurm-%x-%j.out 9 | 10 | args=$@ 11 | 12 | # Default settings 13 | : "${OCP_CONFIG:=configs/mlperf_hpc_pm.yml}" 14 | 15 | # Setup software 16 | module purge 17 | source $CONDA_INIT_SCRIPT 18 | conda activate ocp-dev 19 | module load cuda/11.1.1 20 | 21 | # Distributed config 22 | export MASTER_ADDR=$(hostname) 23 | export MASTER_PORT=29504 24 | export NCCL_IB_DISABLE=1 25 | export NCCL_DEBUG=WARN 26 | export NCCL_SOCKET_IFNAME=hsn 27 | 28 | set -x 29 | srun -l -u scripts/run_training.sh --config-yml $OCP_CONFIG $args 30 | -------------------------------------------------------------------------------- /open_catalyst/scripts/train_pm_shifter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -C gpu 3 | #SBATCH -J ocp-pm 4 | #SBATCH -A nstaff_g 5 | #SBATCH -q early_science 6 | #SBATCH --image=sfarrell/mlperf-ocp:latest 7 | #SBATCH --ntasks-per-node=4 8 | #SBATCH --gpus-per-task=1 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --gpu-bind=none 11 | #SBATCH --time 4:00:00 12 | #SBATCH -o 
logs/slurm-%x-%j.out 13 | 14 | args=$@ 15 | 16 | # Default settings 17 | : "${OCP_CONFIG:=configs/mlperf_hpc_pm.yml}" 18 | 19 | # Distributed config 20 | export MASTER_ADDR=$(hostname) 21 | export MASTER_PORT=29504 22 | export NCCL_DEBUG=WARN 23 | export NCCL_SOCKET_IFNAME=hsn 24 | 25 | set -x 26 | srun -l -u shifter scripts/run_training.sh --config-yml $OCP_CONFIG $args 27 | -------------------------------------------------------------------------------- /open_catalyst/scripts/uncompress.py: -------------------------------------------------------------------------------- 1 | """ 2 | Uncompresses downloaded S2EF datasets to be used by the LMDB preprocessing 3 | script - preprocess_ef.py 4 | """ 5 | 6 | import argparse 7 | import glob 8 | import lzma 9 | import multiprocessing as mp 10 | import os 11 | 12 | from tqdm import tqdm 13 | 14 | 15 | def read_lzma(inpfile, outfile): 16 | with open(inpfile, "rb") as f: 17 | contents = lzma.decompress(f.read()) 18 | with open(outfile, "wb") as op: 19 | op.write(contents) 20 | 21 | 22 | def decompress_list_of_files(ip_op_pair): 23 | ip_file, op_file = ip_op_pair 24 | read_lzma(ip_file, op_file) 25 | 26 | 27 | def get_parser(): 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument( 30 | "--ipdir", type=str, help="Path to compressed dataset directory" 31 | ) 32 | parser.add_argument( 33 | "--opdir", type=str, help="Directory path to uncompress files to" 34 | ) 35 | parser.add_argument( 36 | "--num-workers", type=int, help="# of processes to parallelize across" 37 | ) 38 | return parser 39 | 40 | 41 | def main(args): 42 | os.makedirs(args.opdir, exist_ok=True) 43 | 44 | filelist = glob.glob(os.path.join(args.ipdir, "*txt.xz")) + glob.glob( 45 | os.path.join(args.ipdir, "*extxyz.xz") 46 | ) 47 | ip_op_pairs = [] 48 | for i in filelist: 49 | fname_base = os.path.basename(i) 50 | ip_op_pairs.append((i, os.path.join(args.opdir, fname_base[:-3]))) 51 | 52 | pool = mp.Pool(args.num_workers) 53 | list( 54 | tqdm( 55 | pool.imap(decompress_list_of_files, ip_op_pairs), 56 | total=len(ip_op_pairs), 57 | desc=f"Uncompressing {args.ipdir}", 58 | ) 59 | ) 60 | 61 | 62 | if __name__ == "__main__": 63 | parser = get_parser() 64 | args = parser.parse_args() 65 | main(args) 66 | -------------------------------------------------------------------------------- /open_catalyst/setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree. 
6 | """ 7 | 8 | from setuptools import find_packages, setup 9 | 10 | setup( 11 | name="ocp-models", 12 | version="0.0.3", 13 | description="Machine learning models for use in catalysis as part of the Open Catalyst Project", 14 | url="https://github.com/Open-Catalyst-Project/ocp", 15 | packages=find_packages(), 16 | include_package_data=True, 17 | ) 18 | -------------------------------------------------------------------------------- /open_catalyst/submit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | conda activate ocp-models 4 | export NCCL_SOCKET_IFNAME=eth 5 | id=cgpu-005-n64 6 | 7 | set -x 8 | python main.py --config-yml configs/mlperf_hpc.yml \ 9 | --mode train --distributed --submit --amp \ 10 | --identifier $id \ 11 | --num-gpus 8 \ 12 | --num-workers 8 \ 13 | --num-nodes 8 \ 14 | --slurm-timeout 8 15 | -------------------------------------------------------------------------------- /open_catalyst/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree. 6 | """ 7 | -------------------------------------------------------------------------------- /open_catalyst/tests/models/test_dimenet.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree. 6 | """ 7 | 8 | import os 9 | import random 10 | 11 | import numpy as np 12 | import pytest 13 | import torch 14 | from ase.io import read 15 | from torch_geometric.data import Data 16 | 17 | from ocpmodels.common.transforms import RandomRotate 18 | from ocpmodels.datasets import data_list_collater 19 | from ocpmodels.models import DimeNet 20 | from ocpmodels.preprocessing import AtomsToGraphs 21 | 22 | 23 | @pytest.fixture(scope="class") 24 | def load_data(request): 25 | atoms = read( 26 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "atoms.json"), 27 | index=0, 28 | format="json", 29 | ) 30 | a2g = AtomsToGraphs( 31 | max_neigh=200, 32 | radius=6, 33 | r_energy=True, 34 | r_forces=True, 35 | r_distances=True, 36 | ) 37 | data_list = a2g.convert_all([atoms]) 38 | request.cls.data = data_list[0] 39 | 40 | 41 | @pytest.fixture(scope="class") 42 | def load_model(request): 43 | torch.manual_seed(4) 44 | model = DimeNet( 45 | None, 46 | 32, 47 | 1, 48 | cutoff=6.0, 49 | regress_forces=True, 50 | use_pbc=False, 51 | ) 52 | request.cls.model = model 53 | 54 | 55 | @pytest.mark.usefixtures("load_data") 56 | @pytest.mark.usefixtures("load_model") 57 | class TestDimeNet: 58 | def test_rotation_invariance(self): 59 | random.seed(1) 60 | data = self.data 61 | 62 | # Sampling a random rotation within [-180, 180] for all axes. 63 | transform = RandomRotate([-180, 180], [0, 1, 2]) 64 | data_rotated, rot, inv_rot = transform(data.clone()) 65 | assert not np.array_equal(data.pos, data_rotated.pos) 66 | 67 | # Pass it through the model. 68 | batch = data_list_collater([data, data_rotated]) 69 | out = self.model(batch) 70 | 71 | # Compare predicted energies and forces (after inv-rotation). 
72 | energies = out[0].detach() 73 | np.testing.assert_almost_equal(energies[0], energies[1], decimal=5) 74 | 75 | forces = out[1].detach() 76 | np.testing.assert_array_almost_equal( 77 | forces[: forces.shape[0] // 2], 78 | torch.matmul(forces[forces.shape[0] // 2 :], inv_rot), 79 | decimal=5, 80 | ) 81 | 82 | def test_energy_force_shape(self): 83 | data = self.data 84 | 85 | # Pass it through the model. 86 | out = self.model(data_list_collater([data])) 87 | 88 | # Compare shape of predicted energies, forces. 89 | energy = out[0].detach() 90 | np.testing.assert_equal(energy.shape, (1, 1)) 91 | 92 | forces = out[1].detach() 93 | np.testing.assert_equal(forces.shape, (data.pos.shape[0], 3)) 94 | -------------------------------------------------------------------------------- /open_catalyst/tests/models/test_dimenetpp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree. 6 | """ 7 | 8 | import logging 9 | import os 10 | import random 11 | 12 | import numpy as np 13 | import pytest 14 | import torch 15 | from ase.io import read 16 | from torch_geometric.data import Data 17 | 18 | from ocpmodels.common.transforms import RandomRotate 19 | from ocpmodels.datasets import data_list_collater 20 | from ocpmodels.models import DimeNetPlusPlus 21 | from ocpmodels.preprocessing import AtomsToGraphs 22 | 23 | 24 | @pytest.fixture(scope="class") 25 | def load_data(request): 26 | atoms = read( 27 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "atoms.json"), 28 | index=0, 29 | format="json", 30 | ) 31 | a2g = AtomsToGraphs( 32 | max_neigh=200, 33 | radius=6, 34 | r_energy=True, 35 | r_forces=True, 36 | r_distances=True, 37 | ) 38 | data_list = a2g.convert_all([atoms]) 39 | request.cls.data = data_list[0] 40 | 41 | 42 | @pytest.fixture(scope="class") 43 | def load_model(request): 44 | torch.manual_seed(4) 45 | model = DimeNetPlusPlus( 46 | None, 47 | 32, 48 | 1, 49 | cutoff=6.0, 50 | regress_forces=True, 51 | use_pbc=False, 52 | ) 53 | request.cls.model = model 54 | 55 | 56 | @pytest.mark.usefixtures("load_data") 57 | @pytest.mark.usefixtures("load_model") 58 | class TestDimeNetPlusPlus: 59 | def test_rotation_invariance(self): 60 | random.seed(1) 61 | data = self.data 62 | 63 | # Sampling a random rotation within [-180, 180] for all axes. 64 | transform = RandomRotate([-180, 180], [0, 1, 2]) 65 | data_rotated, rot, inv_rot = transform(data.clone()) 66 | assert not np.array_equal(data.pos, data_rotated.pos) 67 | 68 | # Pass it through the model. 69 | batch = data_list_collater([data, data_rotated]) 70 | out = self.model(batch) 71 | 72 | # Compare predicted energies and forces (after inv-rotation). 73 | energies = out[0].detach() 74 | np.testing.assert_almost_equal(energies[0], energies[1], decimal=5) 75 | 76 | forces = out[1].detach() 77 | logging.info(forces) 78 | np.testing.assert_array_almost_equal( 79 | forces[: forces.shape[0] // 2], 80 | torch.matmul(forces[forces.shape[0] // 2 :], inv_rot), 81 | decimal=5, 82 | ) 83 | 84 | def test_energy_force_shape(self): 85 | data = self.data 86 | 87 | # Pass it through the model. 88 | out = self.model(data_list_collater([data])) 89 | 90 | # Compare shape of predicted energies, forces.
91 | energy = out[0].detach() 92 | np.testing.assert_equal(energy.shape, (1, 1)) 93 | 94 | forces = out[1].detach() 95 | np.testing.assert_equal(forces.shape, (data.pos.shape[0], 3)) 96 | -------------------------------------------------------------------------------- /open_catalyst/tests/models/test_forcenet.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree. 6 | """ 7 | 8 | import os 9 | import random 10 | 11 | import numpy as np 12 | import pytest 13 | import torch 14 | from ase.io import read 15 | from torch_geometric.data import Data 16 | 17 | from ocpmodels.common.transforms import RandomRotate 18 | from ocpmodels.datasets import data_list_collater 19 | from ocpmodels.models import ForceNet 20 | from ocpmodels.preprocessing import AtomsToGraphs 21 | 22 | 23 | @pytest.fixture(scope="class") 24 | def load_data(request): 25 | atoms = read( 26 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "atoms.json"), 27 | index=0, 28 | format="json", 29 | ) 30 | a2g = AtomsToGraphs( 31 | max_neigh=200, 32 | radius=6, 33 | r_energy=True, 34 | r_forces=True, 35 | r_distances=True, 36 | ) 37 | data_list = a2g.convert_all([atoms]) 38 | request.cls.data = data_list[0] 39 | 40 | 41 | @pytest.fixture(scope="class") 42 | def load_model(request): 43 | model = ForceNet( 44 | None, 45 | 32, 46 | 1, 47 | cutoff=6.0, 48 | ) 49 | request.cls.model = model 50 | 51 | 52 | @pytest.mark.usefixtures("load_data") 53 | @pytest.mark.usefixtures("load_model") 54 | class TestForceNet: 55 | def test_energy_force_shape(self): 56 | data = self.data 57 | 58 | # Pass it through the model. 59 | out = self.model(data_list_collater([data])) 60 | 61 | # Compare shape of predicted energies, forces. 62 | energy = out[0].detach() 63 | np.testing.assert_equal(energy.shape, (1, 1)) 64 | 65 | forces = out[1].detach() 66 | np.testing.assert_equal(forces.shape, (data.pos.shape[0], 3)) 67 | -------------------------------------------------------------------------------- /open_catalyst/tests/models/test_schnet.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree. 
6 | """ 7 | 8 | import os 9 | import random 10 | 11 | import numpy as np 12 | import pytest 13 | import torch 14 | from ase.io import read 15 | from torch_geometric.data import Batch, Data 16 | 17 | from ocpmodels.common.transforms import RandomRotate 18 | from ocpmodels.datasets import data_list_collater 19 | from ocpmodels.models import SchNet 20 | from ocpmodels.preprocessing import AtomsToGraphs 21 | 22 | 23 | @pytest.fixture(scope="class") 24 | def load_data(request): 25 | atoms = read( 26 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "atoms.json"), 27 | index=0, 28 | format="json", 29 | ) 30 | a2g = AtomsToGraphs( 31 | max_neigh=200, 32 | radius=6, 33 | r_energy=True, 34 | r_forces=True, 35 | r_distances=True, 36 | ) 37 | data_list = a2g.convert_all([atoms]) 38 | request.cls.data = data_list[0] 39 | 40 | 41 | @pytest.fixture(scope="class") 42 | def load_model(request): 43 | torch.manual_seed(4) 44 | model = SchNet(None, 32, 1, cutoff=6.0, regress_forces=True, use_pbc=True) 45 | request.cls.model = model 46 | 47 | 48 | @pytest.mark.usefixtures("load_data") 49 | @pytest.mark.usefixtures("load_model") 50 | class TestSchNet: 51 | def test_rotation_invariance(self): 52 | random.seed(1) 53 | data = self.data 54 | 55 | # Sampling a random rotation within [-180, 180] for all axes. 56 | transform = RandomRotate([-180, 180], [0, 1, 2]) 57 | data_rotated, rot, inv_rot = transform(data.clone()) 58 | assert not np.array_equal(data.pos, data_rotated.pos) 59 | 60 | # Pass it through the model. 61 | batch = data_list_collater([data, data_rotated]) 62 | out = self.model(batch) 63 | 64 | # Compare predicted energies and forces (after inv-rotation). 65 | energies = out[0].detach() 66 | np.testing.assert_almost_equal(energies[0], energies[1], decimal=5) 67 | 68 | forces = out[1].detach() 69 | np.testing.assert_array_almost_equal( 70 | forces[: forces.shape[0] // 2], 71 | torch.matmul(forces[forces.shape[0] // 2 :], inv_rot), 72 | decimal=5, 73 | ) 74 | 75 | def test_energy_force_shape(self): 76 | data = self.data 77 | 78 | # Pass it through the model. 79 | out = self.model(data_list_collater([data])) 80 | 81 | # Compare shape of predicted energies, forces. 82 | energy = out[0].detach() 83 | np.testing.assert_equal(energy.shape, (1, 1)) 84 | 85 | forces = out[1].detach() 86 | np.testing.assert_equal(forces.shape, (data.pos.shape[0], 3)) 87 | -------------------------------------------------------------------------------- /open_catalyst/tests/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree. 6 | """ 7 | -------------------------------------------------------------------------------- /open_catalyst/tests/preprocessing/test_pbc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | This source code is licensed under the MIT license found in the 5 | LICENSE file in the root directory of this source tree. 
6 | """ 7 | 8 | import os 9 | 10 | import ase 11 | import numpy as np 12 | import pytest 13 | from ase.io import read 14 | from pymatgen.io.ase import AseAtomsAdaptor 15 | 16 | from ocpmodels.common.utils import get_pbc_distances 17 | from ocpmodels.datasets import data_list_collater 18 | from ocpmodels.preprocessing import AtomsToGraphs 19 | 20 | 21 | @pytest.fixture(scope="class") 22 | def load_data(request): 23 | atoms = read( 24 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "atoms.json"), 25 | index=0, 26 | format="json", 27 | ) 28 | a2g = AtomsToGraphs( 29 | max_neigh=12, 30 | radius=6, 31 | r_energy=True, 32 | r_forces=True, 33 | r_distances=True, 34 | ) 35 | data_list = a2g.convert_all([atoms]) 36 | request.cls.data = data_list[0] 37 | 38 | 39 | @pytest.mark.usefixtures("load_data") 40 | class TestPBC: 41 | def test_pbc_distances(self): 42 | data = self.data 43 | batch = data_list_collater([data] * 5) 44 | out = get_pbc_distances( 45 | batch.pos, 46 | batch.edge_index, 47 | batch.cell, 48 | batch.cell_offsets, 49 | batch.neighbors, 50 | ) 51 | edge_index, pbc_distances = out["edge_index"], out["distances"] 52 | 53 | np.testing.assert_array_equal( 54 | batch.edge_index, 55 | edge_index, 56 | ) 57 | np.testing.assert_array_almost_equal(batch.distances, pbc_distances) 58 | -------------------------------------------------------------------------------- /openfold/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2023 NVIDIA CORPORATION 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:23.04-py3 16 | 17 | FROM ${FROM_IMAGE_NAME} 18 | 19 | ENV DEBIAN_FRONTEND=noninteractive 20 | 21 | # Install pip requirements: 22 | RUN pip install \ 23 | biopython==1.79 \ 24 | Pympler==1.0.1 \ 25 | dacite==1.8.0 \ 26 | "git+https://github.com/mlcommons/logging.git@2.1.0" \ 27 | "git+https://github.com/NVIDIA/mlperf-common.git" 28 | 29 | # Build and install Kalign from source: 30 | RUN wget -q -P /workspace/downloads https://github.com/TimoLassmann/kalign/archive/refs/tags/v3.3.5.tar.gz \ 31 | && tar -xzf /workspace/downloads/v3.3.5.tar.gz --directory /workspace \ 32 | && rm -r /workspace/downloads \ 33 | && ls /workspace \ 34 | && cd /workspace/kalign-3.3.5 \ 35 | && mkdir build \ 36 | && cd build \ 37 | && cmake .. \ 38 | && make -j \ 39 | && make install \ 40 | && rm -r /workspace/kalign-3.3.5 41 | 42 | # Copy OpenFold source code into the docker image: 43 | COPY . /workspace/openfold 44 | WORKDIR /workspace/openfold 45 | 46 | # Install OpenFold source code package in editable mode: 47 | RUN pip install -e . 48 | -------------------------------------------------------------------------------- /openfold/NOTICE: -------------------------------------------------------------------------------- 1 | This repository defines the reference implementation for the MLPerf HPC OpenFold benchmark. 
2 | 3 | This repository includes software from https://github.com/deepmind/alphafold licensed under the Apache-2.0 License. 4 | 5 | This repository includes software from https://github.com/aqlaboratory/openfold licensed under the Apache-2.0 License. 6 | -------------------------------------------------------------------------------- /openfold/openfold/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/hpc/2c627d457004eff77a014205b3151ed48a6fa149/openfold/openfold/__init__.py -------------------------------------------------------------------------------- /openfold/openfold/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/hpc/2c627d457004eff77a014205b3151ed48a6fa149/openfold/openfold/data/__init__.py -------------------------------------------------------------------------------- /openfold/openfold/data/alignments.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 NVIDIA CORPORATION 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import json 16 | from pathlib import Path 17 | from typing import Dict 18 | 19 | 20 | def load_alignments_super_index( 21 | alignments_super_index_filepath: Path, 22 | verbose: bool = False, 23 | pprefix: str = "", 24 | ) -> Dict[str, dict]: 25 | if verbose: 26 | print(f"{pprefix}Loading {repr(alignments_super_index_filepath)}...") 27 | with open(alignments_super_index_filepath) as f: 28 | alignments_super_index = json.load(f) 29 | if verbose: 30 | print( 31 | f"{pprefix}alignments_super_index ({len(alignments_super_index)})" 32 | f" loaded from {repr(alignments_super_index_filepath)} successfully!" 33 | ) 34 | return alignments_super_index 35 | 36 | 37 | def load_alignments( 38 | alignments_super_index: Dict[str, dict], 39 | alignments_dirpath: Path, 40 | key: str, 41 | ) -> dict: 42 | alignments_index = alignments_super_index[key] 43 | alignments_db_path = alignments_dirpath / alignments_index["db"] 44 | alignments = {} 45 | with open(alignments_db_path, "rb") as f: 46 | for file_index in alignments_index["files"]: 47 | filename, start, size = file_index 48 | f.seek(start) 49 | content = f.read(size).decode("utf-8") 50 | alignments[filename] = content 51 | return alignments 52 | -------------------------------------------------------------------------------- /openfold/openfold/data/resources/README.md: -------------------------------------------------------------------------------- 1 | # Resources 2 | 3 | ## 1. 
`stereo_chemical_props.txt` 4 | 5 | source: https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt 6 | -------------------------------------------------------------------------------- /openfold/openfold/data/resources/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/hpc/2c627d457004eff77a014205b3151ed48a6fa149/openfold/openfold/data/resources/__init__.py -------------------------------------------------------------------------------- /openfold/openfold/data/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/hpc/2c627d457004eff77a014205b3151ed48a6fa149/openfold/openfold/data/tools/__init__.py -------------------------------------------------------------------------------- /openfold/openfold/log_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 NVIDIA CORPORATION 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import json 16 | from pathlib import Path 17 | from typing import List 18 | 19 | import pandas as pd 20 | 21 | 22 | def save_logs(logs: List[dict], outpath: Path, append: bool) -> None: 23 | outpath.parent.mkdir(parents=True, exist_ok=True) 24 | lines = [] 25 | for log in logs: 26 | line = json.dumps(log) 27 | lines.append(line) 28 | outstr = "\n".join(lines) + "\n" 29 | mode = "a" if append else "w" 30 | with open(outpath, mode) as f: 31 | f.write(outstr) 32 | 33 | 34 | def read_logs( 35 | filepath: Path, 36 | drop_overridden_iterations: bool = True, 37 | ) -> pd.DataFrame: 38 | with open(filepath) as f: 39 | logs = f.read().strip().split("\n") 40 | logs = [json.loads(log) for log in logs] 41 | logs_df = pd.DataFrame(logs) 42 | if drop_overridden_iterations: 43 | logs_df = logs_df.drop_duplicates("iteration", keep="last") 44 | logs_df = logs_df.reset_index(drop=True).copy() 45 | return logs_df 46 | -------------------------------------------------------------------------------- /openfold/openfold/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/hpc/2c627d457004eff77a014205b3151ed48a6fa149/openfold/openfold/model/__init__.py -------------------------------------------------------------------------------- /openfold/openfold/model/backbone_update.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 DeepMind Technologies Limited 2 | # Copyright 2022 AlQuraishi Laboratory 3 | # Copyright 2023 NVIDIA CORPORATION 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
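`load_alignments` above implies the on-disk layout of the alignments super index: each key maps to the name of a packed database file plus `[filename, start, size]` triples locating individual alignment files inside it. A sketch of a single entry follows; the field names come straight from the loader, but the key, file names, and offsets are invented for illustration:

```python
# Hypothetical shape of one alignments_super_index.json entry (values made up):
alignments_super_index = {
    "101m_A": {
        "db": "alignments_0.db",  # packed file under alignments_dirpath
        "files": [
            # [filename, start offset in bytes, size in bytes]
            ["uniref90_hits.a3m", 0, 1024],
            ["mgnify_hits.a3m", 1024, 2048],
        ],
    },
}
```

Given such an entry, `load_alignments` seeks to each offset, reads `size` bytes, and returns a `{filename: contents}` dict for the requested key.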
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import torch 18 | import torch.nn as nn 19 | 20 | from openfold.model.linear import Linear 21 | 22 | 23 | class BackboneUpdate(nn.Module): 24 | """Backbone Update module. 25 | 26 | Supplementary '1.8.3 Backbone update': Algorithm 23. 27 | 28 | Args: 29 | c_s: Single representation dimension (channels). 30 | 31 | """ 32 | 33 | def __init__(self, c_s: int) -> None: 34 | super(BackboneUpdate, self).__init__() 35 | self.linear = Linear(c_s, 6, bias=True, init="final") 36 | 37 | def forward(self, s: torch.Tensor) -> torch.Tensor: 38 | return self.linear(s) 39 | -------------------------------------------------------------------------------- /openfold/openfold/model/dropout.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 DeepMind Technologies Limited 2 | # Copyright 2022 AlQuraishi Laboratory 3 | # Copyright 2023 NVIDIA CORPORATION 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from typing import Tuple, Union 18 | 19 | import torch 20 | import torch.nn as nn 21 | import torch.nn.functional as F 22 | 23 | 24 | class Dropout(nn.Module): 25 | """Dropout module. 26 | 27 | Implementation of dropout with the ability to share the dropout mask 28 | along a particular dimension. 29 | 30 | If not in training mode, this module computes the identity function. 31 | 32 | Supplementary '1.11.6 Dropout details'. 33 | 34 | Args: 35 | p: Dropout rate (probability of an element to be zeroed). 36 | share_dim: Dimension(s) along which the dropout mask is shared. 37 | inplace: If set to `True`, will do this operation in-place. 
38 | 39 | """ 40 | 41 | def __init__( 42 | self, 43 | p: float, 44 | share_dim: Union[int, Tuple[int, ...]] = (), 45 | inplace: bool = False, 46 | ) -> None: 47 | super(Dropout, self).__init__() 48 | assert 0.0 <= p <= 1.0 49 | self.p = p 50 | if type(share_dim) == int: 51 | share_dim = (share_dim,) 52 | else: 53 | assert isinstance(share_dim, tuple) 54 | self.share_dim = share_dim 55 | self.inplace = inplace 56 | 57 | def forward(self, x: torch.Tensor) -> torch.Tensor: 58 | shape = list(x.shape) 59 | for d in self.share_dim: 60 | shape[d] = 1 61 | mask = x.new_ones(shape) 62 | mask = F.dropout( 63 | input=mask, 64 | p=self.p, 65 | training=self.training, 66 | inplace=self.inplace, 67 | ) 68 | x *= mask 69 | return x 70 | 71 | 72 | class DropoutRowwise(Dropout): 73 | """Dropout Rowwise module.""" 74 | 75 | def __init__(self, p: float) -> None: 76 | super(DropoutRowwise, self).__init__(p=p, share_dim=-3) 77 | 78 | 79 | class DropoutColumnwise(Dropout): 80 | """Dropout Columnwise module.""" 81 | 82 | def __init__(self, p: float) -> None: 83 | super(DropoutColumnwise, self).__init__(p=p, share_dim=-2) 84 | -------------------------------------------------------------------------------- /openfold/openfold/model/extra_msa_embedder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 DeepMind Technologies Limited 2 | # Copyright 2022 AlQuraishi Laboratory 3 | # Copyright 2023 NVIDIA CORPORATION 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import torch 18 | import torch.nn as nn 19 | 20 | from openfold.model.linear import Linear 21 | 22 | 23 | class ExtraMSAEmbedder(nn.Module): 24 | """Extra MSA Embedder module. 25 | 26 | Embeds the "extra_msa_feat" feature. 27 | 28 | Supplementary '1.4 AlphaFold Inference': Algorithm 2, line 15. 29 | 30 | Args: 31 | emsa_dim: Input `extra_msa_feat` dimension (channels). 32 | c_e: Output extra MSA representation dimension (channels). 33 | 34 | """ 35 | 36 | def __init__( 37 | self, 38 | emsa_dim: int, 39 | c_e: int, 40 | ) -> None: 41 | super(ExtraMSAEmbedder, self).__init__() 42 | self.linear = Linear(emsa_dim, c_e, bias=True, init="default") 43 | 44 | def forward( 45 | self, 46 | extra_msa_feat: torch.Tensor, 47 | ) -> torch.Tensor: 48 | """Extra MSA Embedder forward pass. 49 | 50 | Args: 51 | extra_msa_feat: [batch, N_extra_seq, N_res, emsa_dim] 52 | 53 | Returns: 54 | extra_msa_embedding: [batch, N_extra_seq, N_res, c_e] 55 | 56 | """ 57 | return self.linear(extra_msa_feat) 58 | -------------------------------------------------------------------------------- /openfold/openfold/model/layer_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 DeepMind Technologies Limited 2 | # Copyright 2022 AlQuraishi Laboratory 3 | # Copyright 2023 NVIDIA CORPORATION 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
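The `share_dim` mechanism in the Dropout module above works by sampling the mask on a shape whose shared dimensions are collapsed to 1, so broadcasting reuses one mask across those dimensions. A quick check of the rowwise variant (share_dim=-3), assuming a pair-style [batch, N_res, N_res, c_z] input:

```python
import torch

from openfold.model.dropout import DropoutRowwise

drop = DropoutRowwise(p=0.5)
drop.train()  # in eval mode the module is the identity

z = torch.ones(1, 4, 4, 8)  # [batch, N_res, N_res, c_z]
out = drop(z)

# The mask was sampled with shape [1, 1, 4, 8], so every row (dim -3)
# sees the same dropout pattern, scaled by 1/(1-p) where elements are kept:
assert torch.equal(out[0, 0], out[0, 1])
```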
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import torch 18 | import torch.nn as nn 19 | import torch.nn.functional as F 20 | 21 | 22 | class LayerNorm(nn.Module): 23 | """Layer Normalization module. 24 | 25 | Supplementary '1.11.4 Parameters initialization': Layer normalization. 26 | 27 | Args: 28 | in_channels: Last dimension of the input tensor. 29 | eps: A value added to the denominator for numerical stability. 30 | 31 | """ 32 | 33 | def __init__( 34 | self, 35 | in_channels: int, 36 | eps: float = 1e-5, 37 | ) -> None: 38 | super(LayerNorm, self).__init__() 39 | self.normalized_shape = (in_channels,) 40 | self.eps = eps 41 | self.weight = nn.Parameter(torch.ones(in_channels)) 42 | self.bias = nn.Parameter(torch.zeros(in_channels)) 43 | 44 | def forward(self, x: torch.Tensor) -> torch.Tensor: 45 | return F.layer_norm( 46 | input=x, 47 | normalized_shape=self.normalized_shape, 48 | weight=self.weight, 49 | bias=self.bias, 50 | eps=self.eps, 51 | ) 52 | -------------------------------------------------------------------------------- /openfold/openfold/model/msa_column_attention.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 DeepMind Technologies Limited 2 | # Copyright 2022 AlQuraishi Laboratory 3 | # Copyright 2023 NVIDIA CORPORATION 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from typing import Optional 18 | 19 | import torch 20 | import torch.nn as nn 21 | 22 | from openfold.model.attention import SelfAttentionWithGate 23 | from openfold.model.layer_norm import LayerNorm 24 | 25 | 26 | class MSAColumnAttention(nn.Module): 27 | """MSA Column Attention module. 28 | 29 | Supplementary '1.6.2 MSA column-wise gated self-attention': Algorithm 8. 30 | 31 | Args: 32 | c_m: MSA representation dimension (channels). 33 | c_hidden: Per-head hidden dimension (channels). 34 | num_heads: Number of attention heads. 35 | inf: Safe infinity value. 36 | chunk_size: Optional chunk size for a batch-like dimension. 37 | 38 | """ 39 | 40 | def __init__( 41 | self, 42 | c_m: int, 43 | c_hidden: int, 44 | num_heads: int, 45 | inf: float, 46 | chunk_size: Optional[int], 47 | ) -> None: 48 | super(MSAColumnAttention, self).__init__() 49 | self.layer_norm_m = LayerNorm(c_m) 50 | self.mha = SelfAttentionWithGate( 51 | c_qkv=c_m, 52 | c_hidden=c_hidden, 53 | num_heads=num_heads, 54 | inf=inf, 55 | chunk_size=chunk_size, 56 | ) 57 | 58 | def forward( 59 | self, 60 | m: torch.Tensor, 61 | mask: torch.Tensor, 62 | ) -> torch.Tensor: 63 | """MSA Column Attention forward pass. 
64 | 65 | Args: 66 | m: [batch, N_seq, N_res, c_m] MSA representation 67 | mask: [batch, N_seq, N_res] MSA mask 68 | 69 | Returns: 70 | m_update: [batch, N_seq, N_res, c_m] MSA representation update 71 | 72 | """ 73 | m = m.transpose(-2, -3) 74 | # m: [batch, N_res, N_seq, c_m] 75 | 76 | mask = mask.transpose(-1, -2) 77 | # mask: [batch, N_res, N_seq] 78 | 79 | mask = mask.unsqueeze(-2).unsqueeze(-3) 80 | # mask: [batch, N_res, 1, 1, N_seq] 81 | 82 | m = self.layer_norm_m(m) 83 | m = self.mha( 84 | input_qkv=m, 85 | mask=mask, 86 | bias=None, 87 | ) 88 | # m: [batch, N_res, N_seq, c_m] 89 | 90 | m = m.transpose(-2, -3) 91 | # m: [batch, N_seq, N_res, c_m] 92 | 93 | return m 94 | -------------------------------------------------------------------------------- /openfold/openfold/model/msa_transition.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 DeepMind Technologies Limited 2 | # Copyright 2022 AlQuraishi Laboratory 3 | # Copyright 2023 NVIDIA CORPORATION 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import torch 18 | import torch.nn as nn 19 | 20 | from openfold.model.layer_norm import LayerNorm 21 | from openfold.model.linear import Linear 22 | 23 | 24 | class MSATransition(nn.Module): 25 | """MSA Transition module. 26 | 27 | Supplementary '1.6.3 MSA transition': Algorithm 9. 28 | 29 | Args: 30 | c_m: MSA (or Extra MSA) representation dimension (channels). 31 | n: `c_m` multiplier to obtain hidden dimension (channels). 32 | 33 | """ 34 | 35 | def __init__( 36 | self, 37 | c_m: int, 38 | n: int, 39 | ) -> None: 40 | super(MSATransition, self).__init__() 41 | self.layer_norm = LayerNorm(c_m) 42 | self.linear_1 = Linear(c_m, n * c_m, bias=True, init="relu") 43 | self.linear_2 = Linear(n * c_m, c_m, bias=True, init="final") 44 | 45 | def forward( 46 | self, 47 | m: torch.Tensor, 48 | mask: torch.Tensor, 49 | ) -> torch.Tensor: 50 | """MSA Transition forward pass. 51 | 52 | Args: 53 | m: [batch, N_seq, N_res, c_m] MSA representation 54 | mask: [batch, N_seq, N_res] MSA mask 55 | 56 | Returns: 57 | m_update: [batch, N_seq, N_res, c_m] MSA representation update 58 | 59 | """ 60 | # DeepMind forgets to apply the MSA mask here. 61 | m = self.layer_norm(m) 62 | m = self.linear_1(m) 63 | m = torch.relu(m) 64 | m = self.linear_2(m) 65 | return m 66 | -------------------------------------------------------------------------------- /openfold/openfold/model/pair_transition.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 DeepMind Technologies Limited 2 | # Copyright 2022 AlQuraishi Laboratory 3 | # Copyright 2023 NVIDIA CORPORATION 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
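Because the column attention above attends over the sequence axis by transposing it into the row position and back, a shape round-trip is the quickest sanity check. A minimal sketch, assuming the `openfold` package from this repository is installed; the channel and head values are arbitrary:

```python
import torch

from openfold.model.msa_column_attention import MSAColumnAttention

attn = MSAColumnAttention(c_m=32, c_hidden=8, num_heads=4, inf=1e9, chunk_size=None)

m = torch.randn(2, 5, 7, 32)  # [batch, N_seq, N_res, c_m]
mask = torch.ones(2, 5, 7)    # [batch, N_seq, N_res]

m_update = attn(m, mask)
assert m_update.shape == (2, 5, 7, 32)  # the MSA layout is preserved
```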
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import torch 18 | import torch.nn as nn 19 | 20 | from openfold.model.layer_norm import LayerNorm 21 | from openfold.model.linear import Linear 22 | 23 | 24 | class PairTransition(nn.Module): 25 | """Pair Transition module. 26 | 27 | Supplementary '1.6.7 Transition in the pair stack': Algorithm 15. 28 | 29 | Args: 30 | c_z: Pair or template representation dimension (channels). 31 | n: `c_z` multiplier to obtain hidden dimension (channels). 32 | 33 | """ 34 | 35 | def __init__( 36 | self, 37 | c_z: int, 38 | n: int, 39 | ) -> None: 40 | super(PairTransition, self).__init__() 41 | self.layer_norm = LayerNorm(c_z) 42 | self.linear_1 = Linear(c_z, n * c_z, bias=True, init="relu") 43 | self.linear_2 = Linear(n * c_z, c_z, bias=True, init="final") 44 | 45 | def forward( 46 | self, 47 | z: torch.Tensor, 48 | mask: torch.Tensor, 49 | ) -> torch.Tensor: 50 | """Pair Transition forward pass. 51 | 52 | Args: 53 | z: [batch, N_res, N_res, c_z] pair representation 54 | mask: [batch, N_res, N_res] pair mask 55 | 56 | Returns: 57 | z_update: [batch, N_res, N_res, c_z] pair representation update 58 | 59 | """ 60 | # DeepMind forgets to apply the pair mask here. 61 | z = self.layer_norm(z) 62 | z = self.linear_1(z) 63 | z = torch.relu(z) 64 | z = self.linear_2(z) 65 | return z 66 | -------------------------------------------------------------------------------- /openfold/openfold/model/single_transition.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 DeepMind Technologies Limited 2 | # Copyright 2022 AlQuraishi Laboratory 3 | # Copyright 2023 NVIDIA CORPORATION 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import torch 18 | import torch.nn as nn 19 | 20 | from openfold.model.layer_norm import LayerNorm 21 | from openfold.model.linear import Linear 22 | 23 | 24 | class SingleTransition(nn.Module): 25 | """Single Transition module. 26 | 27 | Supplementary '1.8 Structure module': Algorithm 20, lines 8-9. 28 | 29 | Args: 30 | c_s: Single representation dimension (channels). 31 | dropout_rate: Dropout rate.
32 | 33 | """ 34 | 35 | def __init__( 36 | self, 37 | c_s: int, 38 | dropout_rate: float, 39 | ) -> None: 40 | super(SingleTransition, self).__init__() 41 | self.linear_1 = Linear(c_s, c_s, bias=True, init="relu") 42 | self.linear_2 = Linear(c_s, c_s, bias=True, init="relu") 43 | self.linear_3 = Linear(c_s, c_s, bias=True, init="final") 44 | self.dropout = nn.Dropout(dropout_rate) 45 | self.layer_norm = LayerNorm(c_s) 46 | 47 | def forward(self, s: torch.Tensor) -> torch.Tensor: 48 | s = s + self.linear_3(torch.relu(self.linear_2(torch.relu(self.linear_1(s))))) 49 | s = self.layer_norm(self.dropout(s)) 50 | return s 51 | -------------------------------------------------------------------------------- /openfold/openfold/model/template_angle_embedder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 DeepMind Technologies Limited 2 | # Copyright 2022 AlQuraishi Laboratory 3 | # Copyright 2023 NVIDIA CORPORATION 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import torch 18 | import torch.nn as nn 19 | 20 | from openfold.model.linear import Linear 21 | 22 | 23 | class TemplateAngleEmbedder(nn.Module): 24 | """Template Angle Embedder module. 25 | 26 | Embeds the "template_angle_feat" feature. 27 | 28 | Supplementary '1.4 AlphaFold Inference': Algorithm 2, line 7. 29 | 30 | Args: 31 | ta_dim: Input `template_angle_feat` dimension (channels). 32 | c_m: Output MSA representation dimension (channels). 33 | 34 | """ 35 | 36 | def __init__( 37 | self, 38 | ta_dim: int, 39 | c_m: int, 40 | ) -> None: 41 | super(TemplateAngleEmbedder, self).__init__() 42 | self.linear_1 = Linear(ta_dim, c_m, bias=True, init="relu") 43 | self.linear_2 = Linear(c_m, c_m, bias=True, init="relu") 44 | 45 | def forward( 46 | self, 47 | template_angle_feat: torch.Tensor, 48 | ) -> torch.Tensor: 49 | """Template Angle Embedder forward pass. 50 | 51 | Args: 52 | template_angle_feat: [batch, N_templ, N_res, ta_dim] 53 | 54 | Returns: 55 | template_angle_embedding: [batch, N_templ, N_res, c_m] 56 | 57 | """ 58 | return self.linear_2(torch.relu(self.linear_1(template_angle_feat))) 59 | -------------------------------------------------------------------------------- /openfold/openfold/model/template_pair_embedder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 DeepMind Technologies Limited 2 | # Copyright 2022 AlQuraishi Laboratory 3 | # Copyright 2023 NVIDIA CORPORATION 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import torch 18 | import torch.nn as nn 19 | 20 | from openfold.model.linear import Linear 21 | 22 | 23 | class TemplatePairEmbedder(nn.Module): 24 | """Template Pair Embedder module. 25 | 26 | Embeds the "template_pair_feat" feature. 27 | 28 | Supplementary '1.4 AlphaFold Inference': Algorithm 2, line 9. 29 | 30 | Args: 31 | tp_dim: Input `template_pair_feat` dimension (channels). 32 | c_t: Output template representation dimension (channels). 33 | 34 | """ 35 | 36 | def __init__( 37 | self, 38 | tp_dim: int, 39 | c_t: int, 40 | ) -> None: 41 | super(TemplatePairEmbedder, self).__init__() 42 | self.tp_dim = tp_dim 43 | self.c_t = c_t 44 | self.linear = Linear(tp_dim, c_t, bias=True, init="relu") 45 | 46 | def forward( 47 | self, 48 | template_pair_feat: torch.Tensor, 49 | ) -> torch.Tensor: 50 | """Template Pair Embedder forward pass. 51 | 52 | Args: 53 | template_pair_feat: [batch, N_res, N_res, tp_dim] 54 | 55 | Returns: 56 | template_pair_embedding: [batch, N_res, N_res, c_t] 57 | 58 | """ 59 | return self.linear(template_pair_feat) 60 | -------------------------------------------------------------------------------- /openfold/openfold/numpy_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 NVIDIA CORPORATION 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Callable 16 | 17 | import numpy as np 18 | 19 | from openfold.helpers import map_tree_leaves 20 | 21 | NUMPY_SEED_MODULUS = 0xFFFF_FFFF + 1 22 | 23 | 24 | def map_array_tree(fn: Callable, tree: dict) -> dict: 25 | """Maps array tree using given function.""" 26 | return map_tree_leaves(fn=fn, tree=tree, leaf_type=np.ndarray) 27 | -------------------------------------------------------------------------------- /openfold/openfold/swa.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 NVIDIA CORPORATION 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
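`map_array_tree` above is a thin wrapper over `map_tree_leaves` that applies a function to every `np.ndarray` leaf of a nested dict. A usage sketch, under the assumption (implied by the `leaf_type` argument) that non-array values are passed through and nested dicts are recursed into:

```python
import numpy as np

from openfold.numpy_utils import map_array_tree

features = {
    "aatype": np.arange(4),
    "msa": {"rows": np.zeros((2, 4))},
    "name": "some_chain",  # non-array leaf, expected to pass through untouched
}

as_float32 = map_array_tree(fn=lambda a: a.astype(np.float32), tree=features)
assert as_float32["msa"]["rows"].dtype == np.float32
```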
14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | from openfold.model.alphafold import AlphaFold 19 | 20 | 21 | class AlphaFoldSWA(nn.Module): 22 | """AlphaFold SWA (Stochastic Weight Averaging) module wrapper.""" 23 | 24 | def __init__(self, alphafold: AlphaFold, enabled: bool, decay_rate: float) -> None: 25 | super(AlphaFoldSWA, self).__init__() 26 | if enabled: 27 | self.averaged_model = torch.optim.swa_utils.AveragedModel( 28 | model=alphafold, 29 | avg_fn=swa_avg_fn(decay_rate=decay_rate), 30 | ) 31 | self.enabled = True 32 | else: 33 | self.averaged_model = None 34 | self.enabled = False 35 | 36 | def update(self, alphafold: AlphaFold) -> None: 37 | if self.enabled: 38 | self.averaged_model.update_parameters(model=alphafold) 39 | 40 | def forward(self, batch): 41 | if not self.enabled: 42 | raise RuntimeError("AlphaFoldSWA is not enabled") 43 | return self.averaged_model(batch) 44 | 45 | 46 | class swa_avg_fn: 47 | """Averaging function for EMA with configurable decay rate 48 | (Supplementary '1.11.7 Evaluator setup').""" 49 | 50 | def __init__(self, decay_rate: float) -> None: 51 | self._decay_rate = decay_rate 52 | 53 | def __call__( 54 | self, 55 | averaged_model_parameter: torch.Tensor, 56 | model_parameter: torch.Tensor, 57 | num_averaged: torch.Tensor, 58 | ) -> torch.Tensor: 59 | # for decay_rate = 0.999: 60 | # return averaged_model_parameter * 0.999 + model_parameter * 0.001 61 | # avg * 0.999 + m * 0.001 62 | # 999*avg/1000 + m/1000 63 | # (999*avg + avg - avg)/1000 + m/1000 64 | # (1000*avg - avg)/1000 + m/1000 65 | # 1000*avg/1000 - avg/1000 + m/1000 66 | # avg + (m - avg)/1000 67 | # avg + (m - avg)*0.001 68 | return averaged_model_parameter + ( 69 | model_parameter - averaged_model_parameter 70 | ) * (1.0 - self._decay_rate) 71 | -------------------------------------------------------------------------------- /openfold/openfold/torch_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 NVIDIA CORPORATION 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
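The comment block inside `swa_avg_fn.__call__` above derives that the update `avg + (m - avg) * (1 - decay)` is just a rearrangement of the textbook EMA `avg * decay + m * (1 - decay)`. A short numerical confirmation of that algebra:

```python
import torch

avg, m, decay = torch.tensor(1.0), torch.tensor(3.0), 0.999

ema_textbook = avg * decay + m * (1.0 - decay)   # 0.999 + 0.003 = 1.002
ema_rewritten = avg + (m - avg) * (1.0 - decay)  # 1.0 + 2 * 0.001 = 1.002
assert torch.allclose(ema_textbook, ema_rewritten)
```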
14 | 15 | import os 16 | from typing import Callable, List 17 | 18 | import torch 19 | 20 | from openfold.helpers import map_tree_leaves 21 | 22 | TORCH_SEED_MODULUS = 0xFFFF_FFFF_FFFF_FFFF + 1 23 | 24 | 25 | def enable_tf32() -> None: 26 | os.environ["TORCH_ALLOW_TF32_CUBLAS_OVERRIDE"] = "1" 27 | torch.backends.cuda.matmul.allow_tf32 = True 28 | torch.backends.cudnn.allow_tf32 = True 29 | 30 | 31 | def disable_tf32() -> None: 32 | os.environ["TORCH_ALLOW_TF32_CUBLAS_OVERRIDE"] = "0" 33 | torch.backends.cuda.matmul.allow_tf32 = False 34 | torch.backends.cudnn.allow_tf32 = False 35 | 36 | 37 | def is_autocast_fp16_enabled() -> bool: 38 | return ( 39 | torch.is_autocast_enabled() and torch.get_autocast_gpu_dtype() == torch.float16 40 | ) 41 | 42 | 43 | def map_tensor_tree(fn: Callable, tree: dict) -> dict: 44 | """Maps tensor tree using given function.""" 45 | return map_tree_leaves(fn=fn, tree=tree, leaf_type=torch.Tensor) 46 | 47 | 48 | def collate(samples: List[dict]) -> dict: 49 | """Converts list of samples into a batch dict.""" 50 | assert isinstance(samples, list) 51 | assert len(samples) > 0 52 | sample0 = samples[0] 53 | assert isinstance(sample0, dict) 54 | batch = {} 55 | for key in list(sample0.keys()): 56 | batch[key] = [sample[key] for sample in samples] 57 | if isinstance(sample0[key], torch.Tensor): 58 | batch[key] = torch.stack(batch[key], dim=0) 59 | return batch 60 | -------------------------------------------------------------------------------- /openfold/scripts/activate_local_openfold_venv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2023 NVIDIA CORPORATION 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # Usage: source scripts/activate_local_openfold_venv.sh /path/to/openfold-venv 18 | # 19 | # Exit: conda deactivate 20 | 21 | # Setup text effects: 22 | GREEN=$(tput setaf 2) 23 | BOLD=$(tput bold) 24 | NORMAL=$(tput sgr0) 25 | 26 | # Read input argument: 27 | PREFIX_PATH=$1 28 | 29 | # Activate conda environment: 30 | source $PREFIX_PATH/conda/etc/profile.d/conda.sh && \ 31 | conda activate openfold-venv && \ 32 | echo -e "${GREEN}${BOLD}openfold-venv activated!${NORMAL}" 33 | -------------------------------------------------------------------------------- /openfold/scripts/build_local_openfold_venv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2023 NVIDIA CORPORATION 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
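`collate` above stacks tensor-valued fields along a new leading batch dimension and leaves every other field as a plain per-sample list. A small example of the resulting batch dict:

```python
import torch

from openfold.torch_utils import collate

samples = [
    {"x": torch.zeros(3), "id": "sample0"},
    {"x": torch.ones(3), "id": "sample1"},
]

batch = collate(samples)
assert batch["x"].shape == (2, 3)             # tensors stacked: [batch, ...]
assert batch["id"] == ["sample0", "sample1"]  # non-tensors stay a list
```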
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # Usage: bash scripts/build_local_openfold_venv.sh /path/to/openfold-venv 18 | 19 | set -e # immediately exit on first error 20 | 21 | # Setup text effects: 22 | RED=$(tput setaf 1) 23 | GREEN=$(tput setaf 2) 24 | BOLD=$(tput bold) 25 | NORMAL=$(tput sgr0) 26 | 27 | # Read input argument: 28 | PREFIX_PATH=$1 29 | if [ -z $PREFIX_PATH ]; then 30 | echo "${BOLD}${RED}Input error:${NORMAL} missing path!" 31 | echo "Please, specify venv location!" 32 | exit 1 33 | fi 34 | 35 | # Check if prefix path already exists: 36 | if [ -f $PREFIX_PATH ] || [ -d $PREFIX_PATH ] ; then 37 | echo "${BOLD}${RED}Build error:${NORMAL} ${BOLD}$PREFIX_PATH${NORMAL} already exists!" 38 | echo "Remove ${BOLD}$PREFIX_PATH${NORMAL} manually or set different location." 39 | exit 1 40 | fi 41 | 42 | echo "Building ${GREEN}${BOLD}$PREFIX_PATH${NORMAL}..." 43 | 44 | # Install conda to specified prefix path: 45 | wget -4 https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ 46 | && bash Miniconda3-latest-Linux-x86_64.sh -b -p $PREFIX_PATH/conda \ 47 | && rm Miniconda3-latest-Linux-x86_64.sh 48 | 49 | # Create conda environment: 50 | $PREFIX_PATH/conda/bin/conda create --name=openfold-venv -y python==3.8.* 51 | 52 | # Activate conda environment: 53 | source scripts/activate_local_openfold_venv.sh $PREFIX_PATH 54 | 55 | # Install requirements: 56 | echo "Installing requirements..." 57 | conda install -y \ 58 | pytorch::pytorch==2.0.* \ 59 | conda-forge::numpy==1.22.2 \ 60 | conda-forge::pandas==1.5.2 \ 61 | conda-forge::scipy==1.10.1 \ 62 | conda-forge::tqdm==4.65.0 \ 63 | conda-forge::psutil==5.9.4 \ 64 | conda-forge::biopython==1.79 \ 65 | conda-forge::Pympler==1.0.1 \ 66 | bioconda::kalign3==3.3.* 67 | 68 | pip install dacite==1.8.0 \ 69 | "git+https://github.com/mlcommons/logging.git@2.1.0" \ 70 | "git+https://github.com/NVIDIA/mlperf-common.git" 71 | 72 | # Install OpenFold source code package in editable mode: 73 | pip install -e . 74 | 75 | echo "${GREEN}${BOLD}$0 finished successfully!${NORMAL}" 76 | -------------------------------------------------------------------------------- /openfold/scripts/deactivate_local_openfold_venv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2023 NVIDIA CORPORATION 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | # Usage: source scripts/deactivate_local_openfold_venv.sh 18 | 19 | # Setup text effects: 20 | CYAN=$(tput setaf 6) 21 | BOLD=$(tput bold) 22 | NORMAL=$(tput sgr0) 23 | 24 | # Deactivate conda environment: 25 | conda deactivate && \ 26 | echo -e "${CYAN}${BOLD}openfold-venv deactivated!${NORMAL}" 27 | -------------------------------------------------------------------------------- /openfold/scripts/download_open_protein_set.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2021 DeepMind Technologies Limited 4 | # Copyright 2022 OpenFold Consortium 5 | # Copyright 2023 NVIDIA CORPORATION 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | # Usage: bash download_open_protein_set.sh /path/to/data/open_protein_set/original 20 | 21 | set -e 22 | 23 | if [[ $# -eq 0 ]]; then 24 | echo "Error: download directory must be provided as an input argument." 25 | exit 1 26 | fi 27 | 28 | if ! command -v aws &> /dev/null ; then 29 | echo "Error: AWS CLI could not be found. Check https://aws.amazon.com/cli/ and install AWS CLI." 30 | exit 1 31 | fi 32 | 33 | DOWNLOAD_DIR="${1}/" 34 | mkdir -p "${DOWNLOAD_DIR}" 35 | 36 | # download root files: 37 | aws s3 cp --no-sign-request s3://openfold/LICENSE "${DOWNLOAD_DIR}" 38 | aws s3 cp --no-sign-request s3://openfold/duplicate_pdb_chains.txt "${DOWNLOAD_DIR}" 39 | 40 | # download pdb directory: 41 | mkdir -p "${DOWNLOAD_DIR}/pdb" 42 | aws s3 cp --no-sign-request s3://openfold/pdb "${DOWNLOAD_DIR}/pdb" --recursive 43 | -------------------------------------------------------------------------------- /openfold/scripts/download_pdb_mmcif.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2021 DeepMind Technologies Limited 4 | # Copyright 2023 NVIDIA CORPORATION 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # Usage: bash download_pdb_mmcif.sh /path/to/data/pdb_mmcif/original 19 | 20 | set -e 21 | 22 | if [[ $# -eq 0 ]]; then 23 | echo "Error: download directory must be provided as an input argument." 24 | exit 1 25 | fi 26 | 27 | if ! command -v aria2c &> /dev/null ; then 28 | echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)." 29 | exit 1 30 | fi 31 | 32 | if ! command -v rsync &> /dev/null ; then 33 | echo "Error: rsync could not be found. Please install rsync." 
34 | exit 1 35 | fi 36 | 37 | DOWNLOAD_DIR="$1" 38 | DOWNLOAD_RAW_DIR="${DOWNLOAD_DIR}/raw" 39 | 40 | echo "Running rsync to fetch all mmCIF files (note that the rsync progress estimate might be inaccurate)..." 41 | echo "If the download speed is too slow, try changing the mirror to:" 42 | echo " * rsync.ebi.ac.uk::pub/databases/pdb/data/structures/divided/mmCIF/ (Europe)" 43 | echo " * ftp.pdbj.org::ftp_data/structures/divided/mmCIF/ (Asia)" 44 | echo "or see https://www.wwpdb.org/ftp/pdb-ftp-sites for more download options." 45 | mkdir -p "${DOWNLOAD_RAW_DIR}" 46 | rsync --recursive --links --perms --times --compress --info=progress2 --delete --port=33444 \ 47 | rsync.rcsb.org::ftp_data/structures/divided/mmCIF/ \ 48 | "${DOWNLOAD_RAW_DIR}" 49 | 50 | aria2c "ftp://ftp.wwpdb.org/pub/pdb/data/status/obsolete.dat" --dir="${DOWNLOAD_DIR}" 51 | 52 | aria2c "https://cdn.rcsb.org/resources/sequence/clusters/clusters-by-entity-40.txt" --dir="${DOWNLOAD_DIR}" 53 | -------------------------------------------------------------------------------- /openfold/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 NVIDIA CORPORATION 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from setuptools import find_packages, setup 16 | 17 | setup( 18 | name="openfold", 19 | version="1.0.0", 20 | packages=find_packages(), 21 | include_package_data=True, 22 | package_data={ 23 | "openfold": [ 24 | "data/resources/stereo_chemical_props.txt", 25 | ], 26 | }, 27 | ) 28 | --------------------------------------------------------------------------------