├── assets ├── vis.jpg └── model.jpg ├── MolBART ├── megatron_molbart │ ├── Megatron-LM-v1.1.5-3D_parallelism │ │ ├── megatron_lm.egg-info │ │ │ ├── not-zip-safe │ │ │ ├── dependency_links.txt │ │ │ ├── top_level.txt │ │ │ ├── requires.txt │ │ │ └── SOURCES.txt │ │ ├── build │ │ │ └── lib │ │ │ │ └── megatron │ │ │ │ ├── mpu │ │ │ │ ├── tests │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── commons.py │ │ │ │ │ ├── test_data.py │ │ │ │ │ └── test_initialize.py │ │ │ │ ├── __init__.py │ │ │ │ └── utils.py │ │ │ │ ├── data │ │ │ │ ├── __init__.py │ │ │ │ └── Makefile │ │ │ │ ├── tokenizer │ │ │ │ └── __init__.py │ │ │ │ ├── model │ │ │ │ ├── __init__.py │ │ │ │ ├── fused_bias_gelu.py │ │ │ │ └── utils.py │ │ │ │ ├── fp16 │ │ │ │ └── __init__.py │ │ │ │ ├── module.py │ │ │ │ ├── package_info.py │ │ │ │ ├── __init__.py │ │ │ │ └── deprecated_data_utils │ │ │ │ └── corpora.py │ │ ├── changes.md │ │ ├── requirements.txt │ │ ├── MANIFEST.in │ │ ├── images │ │ │ ├── cases.png │ │ │ ├── scaling-dp.png │ │ │ ├── scaling-mp.png │ │ │ ├── Makefile │ │ │ └── tables.tex │ │ ├── dist │ │ │ ├── megatron_lm-1.1.5-py3.6.egg │ │ │ └── megatron_lm-1.1.5-py3.7.egg │ │ ├── megatron │ │ │ ├── data │ │ │ │ ├── test │ │ │ │ │ └── test_preprocess_data.sh │ │ │ │ ├── Makefile │ │ │ │ └── __init__.py │ │ │ ├── mpu │ │ │ │ ├── tests │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── commons.py │ │ │ │ │ ├── test_data.py │ │ │ │ │ └── test_initialize.py │ │ │ │ ├── __init__.py │ │ │ │ └── utils.py │ │ │ ├── tokenizer │ │ │ │ └── __init__.py │ │ │ ├── model │ │ │ │ ├── __init__.py │ │ │ │ ├── fused_bias_gelu.py │ │ │ │ └── utils.py │ │ │ ├── fp16 │ │ │ │ └── __init__.py │ │ │ ├── module.py │ │ │ ├── package_info.py │ │ │ ├── __init__.py │ │ │ ├── deprecated_data_utils │ │ │ │ ├── scripts │ │ │ │ │ └── presplit_sentences_json.py │ │ │ │ └── corpora.py │ │ │ └── fused_kernels │ │ │ │ ├── scaled_upper_triang_masked_softmax.cpp │ │ │ │ ├── scaled_masked_softmax.cpp │ │ │ │ ├── scaled_upper_triang_masked_softmax_cuda.cu 
│ │ │ │ └── scaled_masked_softmax_cuda.cu │ │ ├── examples │ │ │ ├── ds_config.json │ │ │ ├── generate_text.sh │ │ │ ├── ds_zero_stage_2_config.json │ │ │ ├── merge_mp_bert.sh │ │ │ ├── pretrain_bert.sh │ │ │ ├── pretrain_gpt2.sh │ │ │ ├── evaluate_zeroshot_gpt2.sh │ │ │ ├── pretrain_bert_distributed.sh │ │ │ ├── pretrain_gpt2_distributed.sh │ │ │ ├── finetune_mnli_distributed.sh │ │ │ ├── finetune_race_distributed.sh │ │ │ ├── ds_pretrain_gpt2.sh │ │ │ └── ds_pretrain_gpt2_pipe.sh │ │ ├── tools │ │ │ ├── create_doc_index.py │ │ │ ├── linter.py │ │ │ └── openwebtext │ │ │ │ ├── merge_jsons.py │ │ │ │ ├── README.md │ │ │ │ ├── remove_group_duplicates.py │ │ │ │ ├── group_duplicates_url.py │ │ │ │ └── find_duplicates.py │ │ ├── tasks │ │ │ ├── race │ │ │ │ └── finetune.py │ │ │ ├── glue │ │ │ │ ├── data.py │ │ │ │ ├── finetune.py │ │ │ │ └── mnli.py │ │ │ ├── zeroshot_gpt2 │ │ │ │ └── detokenizer.py │ │ │ ├── main.py │ │ │ └── data_utils.py │ │ └── setup.py │ ├── __init__.py │ ├── ds_config.json │ └── util.py ├── __init__.py ├── utils │ └── __init__.py ├── eval_megatron_retrieval_controlled.sh └── eval_megatron_retrieval.sh ├── inference ├── cheminformatics │ ├── utils │ │ └── fpscores.pkl.gz │ ├── __init__.py │ └── common │ │ ├── cuchemcommon │ │ ├── __init__.py │ │ ├── data │ │ │ ├── helper │ │ │ │ └── __init__.py │ │ │ ├── generative_wf.py │ │ │ ├── __init__.py │ │ │ └── cluster_wf.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── singleton.py │ │ │ ├── sysinfo.py │ │ │ └── logger.py │ │ ├── context.py │ │ └── fingerprint.py │ │ └── grpc │ │ └── generativesampler.proto └── utils_inference │ ├── __init__.py │ └── LICENSE_RATIONALE ├── guacamol └── guacamol │ ├── __init__.py │ ├── guacamol │ ├── utils │ │ ├── __init__.py │ │ ├── math.py │ │ ├── helpers.py │ │ ├── fingerprints.py │ │ ├── descriptors.py │ │ └── sampling_helpers.py │ ├── __init__.py │ ├── distribution_matching_generator.py │ ├── LICENSE_GUACAMOL │ ├── goal_directed_generator.py │ ├── 
goal_directed_score_contributions.py │ └── assess_goal_directed_generation.py │ └── guacamol_baseline │ └── LICENSE_GUACAMOL_BASELINE ├── download_scripts ├── download_models.sh ├── download_results_reproduce.sh ├── download_data_exp.sh ├── download_data_ret_precompute.sh ├── download_data_cov.sh └── download_data_guacamol.sh └── LICENSE /assets/vis.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/RetMol/HEAD/assets/vis.jpg -------------------------------------------------------------------------------- /assets/model.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/RetMol/HEAD/assets/model.jpg -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron_lm.egg-info/not-zip-safe: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/changes.md: -------------------------------------------------------------------------------- 1 | PRETEND THESE ARE CODE CHANGES 2 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron_lm.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron_lm.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | megatron 2 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import indexed_dataset 2 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/requirements.txt: -------------------------------------------------------------------------------- 1 | pybind11 2 | torch 3 | six 4 | regex 5 | numpy 6 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include megatron/data/Makefile 2 | include megatron/data/helpers.cpp 3 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron_lm.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | pybind11 2 | torch 3 | six 4 | regex 5 | numpy 6 | -------------------------------------------------------------------------------- /inference/cheminformatics/utils/fpscores.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/RetMol/HEAD/inference/cheminformatics/utils/fpscores.pkl.gz -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/images/cases.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVlabs/RetMol/HEAD/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/images/cases.png -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/images/scaling-dp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/RetMol/HEAD/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/images/scaling-dp.png -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/images/scaling-mp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/RetMol/HEAD/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/images/scaling-mp.png -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/dist/megatron_lm-1.1.5-py3.6.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/RetMol/HEAD/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/dist/megatron_lm-1.1.5-py3.6.egg -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/dist/megatron_lm-1.1.5-py3.7.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/RetMol/HEAD/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/dist/megatron_lm-1.1.5-py3.7.egg -------------------------------------------------------------------------------- /MolBART/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Copyright (c) 2023, NVIDIA CORPORATION. 
All rights reserved. 3 | # 4 | # This work is licensed under the NSCL license 5 | # for RetMol. To view a copy of this license, see the LICENSE file. 6 | # --------------------------------------------------------------- -------------------------------------------------------------------------------- /MolBART/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # This work is licensed under the NSCL license 5 | # for RetMol. To view a copy of this license, see the LICENSE file. 6 | # --------------------------------------------------------------- -------------------------------------------------------------------------------- /guacamol/guacamol/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # This work is licensed under the NSCL license 5 | # for RetMol. To view a copy of this license, see the LICENSE file. 6 | # --------------------------------------------------------------- -------------------------------------------------------------------------------- /MolBART/megatron_molbart/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # This work is licensed under the NSCL license 5 | # for RetMol. To view a copy of this license, see the LICENSE file. 
6 | # --------------------------------------------------------------- -------------------------------------------------------------------------------- /inference/cheminformatics/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # This work is licensed under the NSCL license 5 | # for RetMol. To view a copy of this license, see the LICENSE file. 6 | # --------------------------------------------------------------- -------------------------------------------------------------------------------- /inference/utils_inference/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # This work is licensed under the NSCL license 5 | # for RetMol. To view a copy of this license, see the LICENSE file. 
6 | # --------------------------------------------------------------- -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | 
# Taken from the following link as is from: 3 | # https://github.com/BenevolentAI/guacamol/blob/master/guacamol/utils/__init__.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_GUACAMOL). 7 | # --------------------------------------------------------------- -------------------------------------------------------------------------------- /inference/cheminformatics/common/cuchemcommon/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/NVIDIA/cheminformatics/blob/master/common/cuchemcommon/__init__.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_CHEMINFORMATICS). 7 | # --------------------------------------------------------------- -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/BenevolentAI/guacamol/blob/master/guacamol/__init__.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_GUACAMOL). 
7 | # --------------------------------------------------------------- 8 | 9 | __version__ = "0.5.5" 10 | -------------------------------------------------------------------------------- /inference/cheminformatics/common/cuchemcommon/data/helper/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/NVIDIA/cheminformatics/blob/master/common/cuchemcommon/data/helper/__init__.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_CHEMINFORMATICS). 7 | # --------------------------------------------------------------- -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 256, 3 | "train_micro_batch_size_per_gpu": 4, 4 | "steps_per_print": 10, 5 | "gradient_clipping": 1.0, 6 | "fp16": { 7 | "enabled": true, 8 | 9 | "loss_scale": 0, 10 | "loss_scale_window": 1000, 11 | "hysteresis": 2, 12 | "min_loss_scale": 1 13 | }, 14 | "wall_clock_breakdown": true, 15 | "zero_allow_untested_optimizer": false 16 | } 17 | -------------------------------------------------------------------------------- /download_scripts/download_models.sh: -------------------------------------------------------------------------------- 1 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1O5u8b_n93HOrsjN1aezq6NhZojh-6dEe' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1O5u8b_n93HOrsjN1aezq6NhZojh-6dEe" -O models.zip && rm -rf /tmp/cookies.txt 2 | 3 | unzip 
models.zip -d ../ 4 | 5 | rm -r ../__MACOSX 6 | rm models.zip -------------------------------------------------------------------------------- /inference/cheminformatics/common/cuchemcommon/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/NVIDIA/cheminformatics/blob/master/common/cuchemcommon/utils/__init__.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_CHEMINFORMATICS). 7 | # --------------------------------------------------------------- 8 | 9 | from cuchemcommon.utils.singleton import Singleton -------------------------------------------------------------------------------- /download_scripts/download_results_reproduce.sh: -------------------------------------------------------------------------------- 1 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=16USnJttlMES1uPtRjJ7WNJ3UcXoTUhV1' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=16USnJttlMES1uPtRjJ7WNJ3UcXoTUhV1" -O results_reproduce.zip && rm -rf /tmp/cookies.txt 2 | 3 | unzip results_reproduce.zip -d ../results_reproduce 4 | 5 | rm results_reproduce.zip -------------------------------------------------------------------------------- /download_scripts/download_data_exp.sh: -------------------------------------------------------------------------------- 1 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Si5_yHdCGZNHQov99hPp8rOZx4DC6BY_' -O- | sed -rn 
's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1Si5_yHdCGZNHQov99hPp8rOZx4DC6BY_" -O data_exp.zip && rm -rf /tmp/cookies.txt 2 | 3 | mkdir -p ../data 4 | 5 | unzip data_exp.zip -d ../data/ 6 | 7 | rm -r ../data/__MACOSX 8 | rm data_exp.zip -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/images/Makefile: -------------------------------------------------------------------------------- 1 | default: cases.png scaling-mp.png scaling-dp.png 2 | 3 | # for some reason the size option to convert in scaling.tex doesn't work, manually do it after 4 | cases.png scaling-mp.png scaling-dp.png: tables.tex 5 | latex --shell-escape $< 6 | convert tables-1.png -resize 650 cases.png 7 | convert tables-2.png -resize 600 scaling-mp.png 8 | convert tables-3.png -resize 350 scaling-dp.png 9 | 10 | clean: 11 | rm -rf *.aux *.log *.dvi *.ps 12 | rm -rf tables-*.png 13 | -------------------------------------------------------------------------------- /download_scripts/download_data_ret_precompute.sh: -------------------------------------------------------------------------------- 1 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1nN1KEDPNuANJqZJwAOPMKwdHxMa6J7F3' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1nN1KEDPNuANJqZJwAOPMKwdHxMa6J7F3" -O data-retrieval-precompute.zip && rm -rf /tmp/cookies.txt 2 | 3 | unzip data-retrieval-precompute.zip -d ../data/ 4 | 5 | rm data-retrieval-precompute.zip -------------------------------------------------------------------------------- /download_scripts/download_data_cov.sh: -------------------------------------------------------------------------------- 1 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet 
--save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1J5KVdJc7SMPCSF0Y8feoYH1DyHI-YF7B' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1J5KVdJc7SMPCSF0Y8feoYH1DyHI-YF7B" -O data_cov.zip && rm -rf /tmp/cookies.txt 2 | 3 | mkdir -p ../data 4 | 5 | unzip data_cov.zip -d ../data/cov 6 | 7 | mv ../data/cov/data_cov/* ../data/cov/ 8 | rm -r ../data/cov/data_cov 9 | rm data_cov.zip 10 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
-------------------------------------------------------------------------------- /download_scripts/download_data_guacamol.sh: -------------------------------------------------------------------------------- 1 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1xQqKU0jMqiPCTUl_6yB-mxKqMFPco5zT' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1xQqKU0jMqiPCTUl_6yB-mxKqMFPco5zT" -O data_guacamol.zip && rm -rf /tmp/cookies.txt 2 | 3 | mkdir -p ../data/guacamol 4 | 5 | unzip data_guacamol.zip -d ../data/guacamol/ 6 | 7 | mv ../data/guacamol/data_guacamol/* ../data/guacamol/ 8 | mv ../data/guacamol/retrieval_database_guacamol ../data/guacamol/retrieval_database 9 | 10 | rm -r ../data/guacamol/data_guacamol 11 | rm -r ../data/guacamol/__MACOSX 12 | rm data_guacamol.zip -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from . 
import indexed_dataset 17 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | from .tokenizer import build_tokenizer 18 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | from .tokenizer import build_tokenizer 18 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/generate_text.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHECKPOINT_PATH=checkpoints/gpt2_345m 4 | VOCAB_FILE=gpt2-vocab.json 5 | MERGE_FILE=gpt2-merges.txt 6 | 7 | python tools/generate_samples_gpt2.py \ 8 | --model-parallel-size 1 \ 9 | --num-layers 24 \ 10 | --hidden-size 1024 \ 11 | --load $CHECKPOINT_PATH \ 12 | --num-attention-heads 16 \ 13 | --max-position-embeddings 1024 \ 14 | --tokenizer-type GPT2BPETokenizer \ 15 | --fp16 \ 16 | --batch-size 2 \ 17 | --seq-length 1024 \ 18 | --out-seq-length 1024 \ 19 | --temperature 1.0 \ 20 | --vocab-file $VOCAB_FILE \ 21 | --merge-file $MERGE_FILE \ 22 | --genfile unconditional_samples.json \ 23 | --num-samples 2 \ 24 | --top_p 0.9 \ 25 | --recompute 26 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/utils/math.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/BenevolentAI/guacamol/blob/master/guacamol/utils/math.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_GUACAMOL). 7 | # --------------------------------------------------------------- 8 | 9 | from typing import List 10 | 11 | import numpy as np 12 | 13 | 14 | def arithmetic_mean(values: List[float]) -> float: 15 | """ 16 | Computes the arithmetic mean of a list of values. 17 | """ 18 | return sum(values) / len(values) 19 | 20 | 21 | def geometric_mean(values: List[float]) -> float: 22 | """ 23 | Computes the geometric mean of a list of values. 
24 | """ 25 | a = np.array(values) 26 | return a.prod() ** (1.0 / len(a)) 27 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_zero_stage_2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 2048, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 1, 5 | "zero_optimization": { 6 | "stage": 2, 7 | "allgather_partitions": true, 8 | "reduce_scatter": true, 9 | "allgather_bucket_size": 50000000, 10 | "reduce_bucket_size": 50000000, 11 | "overlap_comm": true 12 | }, 13 | "optimizer": { 14 | "type": "Adam", 15 | "params": { 16 | "lr": 0.00015, 17 | "max_grad_norm": 1.0, 18 | "betas": [0.9, 0.95] 19 | } 20 | }, 21 | "gradient_clipping": 1.0, 22 | "fp16": { 23 | "enabled": true, 24 | 25 | "loss_scale": 0, 26 | "loss_scale_window": 1000, 27 | "hysteresis": 2, 28 | "min_loss_scale": 1 29 | }, 30 | "wall_clock_breakdown": true, 31 | "zero_allow_untested_optimizer": false 32 | } 33 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/utils/helpers.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/BenevolentAI/guacamol/blob/master/guacamol/utils/helpers.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_GUACAMOL). 7 | # --------------------------------------------------------------- 8 | 9 | import logging 10 | 11 | 12 | def setup_default_logger(): 13 | """ 14 | Call this function in your main function to initialize a basic logger. 15 | 16 | To have more control on the format or level, call `logging.basicConfig()` directly instead. 
17 | 18 | If you don't initialize any logger, log entries from the guacamol package will not appear anywhere. 19 | """ 20 | logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO) 21 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/merge_mp_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_PARALLEL_SIZE=2 4 | 5 | VOCAB_FILE=bert-vocab.txt 6 | CHECKPOINT_PATH=checkpoints/bert_345m 7 | 8 | WORLD_SIZE=$MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \ 9 | --model-type BERT \ 10 | --model-parallel-size $MODEL_PARALLEL_SIZE \ 11 | --tokenizer-type BertWordPieceLowerCase \ 12 | --vocab-file $VOCAB_FILE \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --seq-length 512 \ 17 | --max-position-embeddings 512 \ 18 | --load $CHECKPOINT_PATH 19 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/distribution_matching_generator.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/BenevolentAI/guacamol/blob/master/guacamol/distribution_matching_generator.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_GUACAMOL). 7 | # --------------------------------------------------------------- 8 | 9 | from abc import ABCMeta, abstractmethod 10 | from typing import List 11 | 12 | 13 | class DistributionMatchingGenerator(metaclass=ABCMeta): 14 | """ 15 | Interface for molecule generators. 16 | """ 17 | 18 | @abstractmethod 19 | def generate(self, number_samples: int) -> List[str]: 20 | """ 21 | Samples SMILES strings from a molecule generator. 
22 | 23 | Args: 24 | number_samples: number of molecules to generate 25 | 26 | Returns: 27 | A list of SMILES strings. 28 | """ 29 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .distributed import * 17 | from .bert_model import BertModel 18 | from .realm_model import ICTBertModel 19 | from .gpt2_model import GPT2Model, GPT2ModelPipe 20 | from .utils import get_params_for_weight_decay_optimization 21 | from .language_model import get_language_model 22 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
class Singleton(type):
    """Metaclass that guarantees at most one instance per class.

    Example Usage:
        class MySingleton(metaclass=Singleton):
            pass
    """

    # Maps each class object to its unique, lazily created instance.
    _instances = {}

    def __call__(cls, *args, **kwargs):
        # Construct on first use only; afterwards always hand back the
        # cached object so every call site shares the same instance.
        instance = cls._instances.get(cls)
        if instance is None:
            instance = super().__call__(*args, **kwargs)
            cls._instances[cls] = instance
        return instance
7 | # --------------------------------------------------------------- 8 | 9 | import logging 10 | from typing import List 11 | 12 | from cuchemcommon.data.helper.chembldata import ChEmblData 13 | from cuchemcommon.utils.singleton import Singleton 14 | 15 | from . import GenerativeWfDao 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class ChemblGenerativeWfDao(GenerativeWfDao, metaclass=Singleton): 21 | 22 | def __init__(self, fp_type): 23 | self.chem_data = ChEmblData(fp_type) 24 | 25 | def fetch_id_from_chembl(self, id: List): 26 | logger.debug('Fetch ChEMBL ID using molregno...') 27 | return self.chem_data.fetch_id_from_chembl(id) 28 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/tools/create_doc_index.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('../') 3 | 4 | from megatron.indexer import IndexBuilder 5 | from megatron.initialize import initialize_megatron 6 | 7 | 8 | def main(): 9 | """Create a BlockData data structure by running an IndexBuilder over an ICT Dataset 10 | - Include all args needed for initial model specification 11 | 12 | Other key args: 13 | --block-data-path: path to write to 14 | --ict-load or --realm-load: path to checkpoint with which to embed 15 | --data-path and --titles-data-path: paths for dataset 16 | --indexer-log-interval: reporting interval 17 | --indexer-batch-size: size specific for indexer jobs 18 | 19 | Check README.md for example script 20 | """ 21 | 22 | initialize_megatron(extra_args_provider=None, 23 | args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) 24 | index_builder = IndexBuilder() 25 | index_builder.build_and_save_index() 26 | 27 | 28 | if __name__ == "__main__": 29 | main() 30 | 31 | -------------------------------------------------------------------------------- 
/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/pretrain_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANK=0 4 | WORLD_SIZE=1 5 | DATA_PATH=_text_sentence 6 | CHECKPOINT_PATH= 7 | 8 | python pretrain_bert.py \ 9 | --num-layers 24 \ 10 | --hidden-size 1024 \ 11 | --num-attention-heads 16 \ 12 | --batch-size 4 \ 13 | --seq-length 512 \ 14 | --max-position-embeddings 512 \ 15 | --train-iters 2000000 \ 16 | --save $CHECKPOINT_PATH \ 17 | --load $CHECKPOINT_PATH \ 18 | --data-path $DATA_PATH \ 19 | --vocab-file bert-vocab.txt \ 20 | --data-impl mmap \ 21 | --split 949,50,1 \ 22 | --distributed-backend nccl \ 23 | --lr 0.0001 \ 24 | --min-lr 0.00001 \ 25 | --lr-decay-style linear \ 26 | --lr-decay-iters 990000 \ 27 | --weight-decay 1e-2 \ 28 | --clip-grad 1.0 \ 29 | --warmup .01 \ 30 | --log-interval 100 \ 31 | --save-interval 10000 \ 32 | --eval-interval 1000 \ 33 | --eval-iters 10 \ 34 | --fp16 35 | 36 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/fp16/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from .fp16util import ( 16 | BN_convert_float, 17 | network_to_half, 18 | prep_param_lists, 19 | model_grads_to_master_grads, 20 | master_params_to_model_params, 21 | tofp16, 22 | to_python_float, 23 | clip_grad_norm, 24 | convert_module, 25 | convert_network, 26 | FP16Model, 27 | ) 28 | 29 | from .fp16 import * 30 | from .loss_scaler import * 31 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/fp16/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from .fp16util import ( 16 | BN_convert_float, 17 | network_to_half, 18 | prep_param_lists, 19 | model_grads_to_master_grads, 20 | master_params_to_model_params, 21 | tofp16, 22 | to_python_float, 23 | clip_grad_norm, 24 | convert_module, 25 | convert_network, 26 | FP16Model, 27 | ) 28 | 29 | from .fp16 import * 30 | from .loss_scaler import * 31 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/LICENSE_GUACAMOL: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 BenevolentAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol_baseline/LICENSE_GUACAMOL_BASELINE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 BenevolentAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /inference/utils_inference/LICENSE_RATIONALE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Wengong Jin, Regina Barzilay, Tommi Jaakkola 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/pretrain_gpt2.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | DATA_PATH=_text_document 9 | CHECKPOINT_PATH= 10 | 11 | 12 | python pretrain_gpt2.py \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --batch-size 8 \ 17 | --seq-length 1024 \ 18 | --max-position-embeddings 1024 \ 19 | --train-iters 500000 \ 20 | --lr-decay-iters 320000 \ 21 | --save $CHECKPOINT_PATH \ 22 | --load $CHECKPOINT_PATH \ 23 | --data-path $DATA_PATH \ 24 | --vocab-file gpt2-vocab.json \ 25 | --merge-file gpt2-merges.txt \ 26 | --data-impl mmap \ 27 | --split 949,50,1 \ 28 | --distributed-backend nccl \ 29 | --lr 0.00015 \ 30 | --min-lr 1.0e-5 \ 31 | --lr-decay-style cosine \ 32 | --weight-decay 1e-2 \ 33 | --clip-grad 1.0 \ 34 | --warmup .01 \ 35 | --checkpoint-activations \ 36 | --log-interval 100 \ 37 | --save-interval 10000 \ 38 | --eval-interval 1000 \ 39 | --eval-iters 10 \ 40 | --fp16 41 | 42 | 43 | set +x 44 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/module.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Megatron Module""" 17 | 18 | import torch 19 | 20 | 21 | class MegatronModule(torch.nn.Module): 22 | """Megatron specific extentions of torch Module.""" 23 | 24 | def __init__(self): 25 | super(MegatronModule, self).__init__() 26 | 27 | def state_dict_for_save_checkpoint(self, destination=None, prefix='', 28 | keep_vars=False): 29 | """Use this function to override the state dict for 30 | saving checkpoints.""" 31 | return self.state_dict(destination, prefix, keep_vars) 32 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/module.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Megatron Module""" 17 | 18 | import torch 19 | 20 | 21 | class MegatronModule(torch.nn.Module): 22 | """Megatron specific extentions of torch Module.""" 23 | 24 | def __init__(self): 25 | super(MegatronModule, self).__init__() 26 | 27 | def state_dict_for_save_checkpoint(self, destination=None, prefix='', 28 | keep_vars=False): 29 | """Use this function to override the state dict for 30 | saving checkpoints.""" 31 | return self.state_dict(destination, prefix, keep_vars) 32 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/evaluate_zeroshot_gpt2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TASK="LAMBADA" 12 | 13 | VALID_DATA= 14 | VOCAB_FILE=gpt2-vocab.json 15 | MERGE_FILE=gpt2-merges.txt 16 | CHECKPOINT=checkpoints/gpt2_345m 17 | 18 | 19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 20 | --task $TASK \ 21 | --valid-data $VALID_DATA \ 22 | --tokenizer-type GPT2BPETokenizer \ 23 | --strict-lambada \ 24 | --vocab-file $VOCAB_FILE \ 25 | --merge-file $MERGE_FILE \ 26 | --load $CHECKPOINT \ 27 | --model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --batch-size 8 \ 32 | --checkpoint-activations \ 33 | --seq-length 1024 \ 34 | --max-position-embeddings 1024 \ 35 | --log-interval 10 \ 36 | --fp16 \ 37 | --no-load-optim \ 38 | --no-load-rng 39 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/package_info.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA 
CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | MAJOR = 1 17 | MINOR = 1.5 18 | 19 | # Use the following formatting: (major, minor) 20 | VERSION = (MAJOR, MINOR) 21 | 22 | __version__ = '.'.join(map(str, VERSION)) 23 | __package_name__ = 'megatron-lm' 24 | __contact_names__ = 'NVIDIA INC' 25 | __url__ = 'https://github.com/NVIDIA/Megatron-LM' 26 | __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases' 27 | __description__ = 'Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism.' 28 | __license__ = 'See https://github.com/NVIDIA/Megatron-LM/blob/master/LICENSE' 29 | __keywords__ = 'deep learning, Megatron, gpu, NLP, nvidia, pytorch, torch, language' 30 | 31 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/package_info.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
def recursively_lint_files():
    """Recursively lint all python files in chosen subdirectories of megatron-lm.

    Runs ``autopep8 --aggressive --in-place`` over every ``*.py`` file found
    directly in the repository root and under a fixed list of subdirectories.
    Prints an install hint and returns early if autopep8 is not installed.
    """

    try:
        import autopep8  # noqa: F401 -- imported only to verify availability
    except ModuleNotFoundError:
        print("Please first install autopep8 via `pip install autopep8`")
        return

    # get all python file paths from top level directory
    # (the repository root is the parent of the directory holding this script)
    file_dir = str(pathlib.Path(__file__).parent.absolute())
    working_dir = osp.join(file_dir, os.pardir)
    # Use endswith('.py'): the previous `".py" in fname` test also matched
    # .pyc/.pyx files and names like "foo.py.orig".
    all_py_paths = set(os.path.join(working_dir, fname)
                       for fname in os.listdir(working_dir)
                       if fname.endswith('.py'))

    # get all python file paths from chosen subdirectories
    check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks']
    for sub_dir in check_dirs:
        for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)):
            all_py_paths.update(osp.join(path, fname)
                                for fname in fnames if fname.endswith('.py'))

    print("Linting the following: ")
    for py_path in all_py_paths:
        print(py_path)
        # Pass an argv list: the previous single command string without
        # shell=True raised FileNotFoundError (the whole string was treated
        # as the executable name). A list also avoids shell quoting issues
        # in file paths.
        command = ['autopep8', '--max-line-length', '100',
                   '--aggressive', '--in-place', py_path]
        subprocess.check_call(command)


if __name__ == "__main__":
    recursively_lint_files()
# ---------------------------------------------------------------
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# This file has been modified from MolecularAI/MolBART
#
# Source:
# https://github.com/MolecularAI/MolBART/blob/master/molbart/util.py
#
# The license for the original version of this file can be
# found in this directory (LICENSE_MOLBART).
# The modifications to this file are subject to the same license.
# ---------------------------------------------------------------

# coding=utf-8

import os

# Project root; all model asset paths below are resolved relative to it.
# Raises KeyError at import time if PROJECT_HOME is unset -- intentional
# fail-fast rather than producing dangling paths later.
project_home = os.environ['PROJECT_HOME']

root = project_home
DEFAULT_VOCAB_PATH = os.path.join(root, 'models/megamolbart/bart_vocab.txt')
CHECKPOINTS_DIR = os.path.join(root, 'models/megamolbart/checkpoints')

# Tokenization and vocabulary
DEFAULT_MAX_SEQ_LEN = 512
DEFAULT_CHEM_TOKEN_START = 272
DEFAULT_BEGIN_TOKEN = "^"
DEFAULT_END_TOKEN = "&"
# NOTE(review): the following three tokens are empty strings here, while
# upstream MolBART uses distinct sentinel tokens (angle-bracket style);
# the brackets may have been stripped in transit -- confirm intended values.
DEFAULT_PAD_TOKEN = ""
DEFAULT_UNK_TOKEN = "?"
DEFAULT_MASK_TOKEN = ""
DEFAULT_SEP_TOKEN = ""
DEFAULT_MASK_PROB = 0.15
DEFAULT_SHOW_MASK_TOKEN_PROB = 1.0
DEFAULT_MASK_SCHEME = "span"
DEFAULT_SPAN_LAMBDA = 3.0
# SMILES tokenization pattern (brackets, two-letter elements, bonds, ring
# closures, digits). Raw string: the previous non-raw literal relied on
# Python preserving invalid escape sequences such as \[ and \( -- a
# DeprecationWarning since 3.6 and a SyntaxWarning in 3.12. The pattern
# value is unchanged ("\\\\" in the old literal is "\\" here).
REGEX = r"\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9]"

# Model parameters
DEFAULT_D_MODEL = 256
DEFAULT_NUM_LAYERS = 4
DEFAULT_NUM_HEADS = 8
/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import torch 16 | 17 | from .package_info import ( 18 | __description__, 19 | __contact_names__, 20 | __url__, 21 | __download_url__, 22 | __keywords__, 23 | __license__, 24 | __package_name__, 25 | __version__, 26 | ) 27 | 28 | from .global_vars import get_args 29 | from .global_vars import get_tokenizer 30 | from .global_vars import get_tensorboard_writer 31 | from .global_vars import get_adlr_autoresume 32 | from .global_vars import get_timers 33 | from .initialize import initialize_megatron 34 | 35 | def print_rank_0(message): 36 | """If distributed is initialized print only on rank 0.""" 37 | if torch.distributed.is_initialized(): 38 | if torch.distributed.get_rank() == 0: 39 | print(message, flush=True) 40 | else: 41 | print(message, flush=True) 42 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import torch 16 | 17 | from .package_info import ( 18 | __description__, 19 | __contact_names__, 20 | __url__, 21 | __download_url__, 22 | __keywords__, 23 | __license__, 24 | __package_name__, 25 | __version__, 26 | ) 27 | 28 | from .global_vars import get_args 29 | from .global_vars import get_tokenizer 30 | from .global_vars import get_tensorboard_writer 31 | from .global_vars import get_adlr_autoresume 32 | from .global_vars import get_timers 33 | from .initialize import initialize_megatron 34 | 35 | def print_rank_0(message): 36 | """If distributed is initialized print only on rank 0.""" 37 | if torch.distributed.is_initialized(): 38 | if torch.distributed.get_rank() == 0: 39 | print(message, flush=True) 40 | else: 41 | print(message, flush=True) 42 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/scripts/presplit_sentences_json.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """ 17 | Usage: 18 | python scripts/presplit_sentences_json.py 19 | """ 20 | 21 | import sys 22 | import json 23 | 24 | import nltk 25 | 26 | nltk.download('punkt') 27 | 28 | input_file = sys.argv[1] 29 | output_file = sys.argv[2] 30 | 31 | line_seperator = "\n" 32 | 33 | with open(input_file, 'r') as ifile: 34 | with open(output_file, "w") as ofile: 35 | for doc in ifile.readlines(): 36 | parsed = json.loads(doc) 37 | sent_list = [] 38 | for line in parsed['text'].split('\n'): 39 | if line != '\n': 40 | sent_list.extend(nltk.tokenize.sent_tokenize(line)) 41 | parsed['text'] = line_seperator.join(sent_list) 42 | ofile.write(json.dumps(parsed) + '\n') 43 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/pretrain_gpt2_distributed.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | DATA_PATH=_text_document 14 | CHECKPOINT_PATH= 15 | 16 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 19 | pretrain_gpt2.py \ 20 | --model-parallel-size 1 \ 21 | --num-layers 24 \ 22 | --hidden-size 1024 \ 23 | --num-attention-heads 16 \ 24 | --batch-size 8 \ 25 | --seq-length 1024 \ 26 | --max-position-embeddings 1024 \ 27 | --train-iters 500000 \ 28 | --lr-decay-iters 320000 \ 29 | --save $CHECKPOINT_PATH \ 30 | --load $CHECKPOINT_PATH \ 31 | --data-path $DATA_PATH \ 32 | --vocab-file gpt2-vocab.json \ 33 | --merge-file gpt2-merges.txt \ 34 | --data-impl mmap \ 35 | --split 949,50,1 \ 36 | --distributed-backend nccl \ 37 | --lr 0.00015 \ 38 | --lr-decay-style cosine \ 39 | --min-lr 1.0e-5 \ 40 | --weight-decay 1e-2 \ 41 | --clip-grad 1.0 \ 42 | --warmup .01 \ 43 | --checkpoint-activations \ 44 | --log-interval 100 \ 45 | --save-interval 10000 \ 46 | --eval-interval 1000 \ 47 | --eval-iters 10 \ 48 | --fp16 49 | 50 | 51 | 52 | set +x 53 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/images/tables.tex: -------------------------------------------------------------------------------- 1 | \documentclass[multi,convert]{standalone} 2 | \usepackage{multirow} 3 | \standaloneenv{tabular} 4 | 5 | \begin{document} 6 | 7 | \begin{tabular}{cccccc} 8 | Case & Hidden Size & Attention Heads & Layers & Parameters (billions) & Model Parallel Partitions \\ 9 | \hline 10 | 1B & 1920 & 15 & 24 & 1.16 & 1 \\ 11 | 2B & 2304 & 18 & 30 & 2.03 & 2 \\ 12 | 4B & 3072 & 24 & 36 & 4.24 & 4 \\ 13 | 8B 
& 4096 & 32 & 42 & 8.67 & 8 \\ 14 | \end{tabular} 15 | 16 | \begin{tabular}{cc|ccc|ccc} 17 | & & \multicolumn{3}{c|}{\textbf{DGX-2 (V100) batch size 8}} & \multicolumn{3}{c}{\textbf{DGX-A100 batch size 16}} \\ 18 | \hline 19 | \multirow{2}{*}{Case} & Number of & Iteration & \multirow{2}{*}{Scaling} & TeraFLOPs & Iteration & \multirow{2}{*}{Scaling} & TeraFLOPs \\ 20 | & GPUs & Time (ms) & & per GPU & Time (ms) & & per GPU \\ 21 | \hline 22 | 1B & 1 & 1121 & 100.0\% & 71.9 & 1076 & 100\% & 149.8 \\ 23 | 2B & 2 & 1093 & 89.6\% & 64.2 & 1026 & 91.7\% & 136.8 \\ 24 | 4B & 4 & 1238 & 82.5\% & 58.5 & 1162 & 84.5\% & 124.7 \\ 25 | 8B & 8 & 1407 & 74.3\% & 52.2 & 1343 & 74.7\% & 109.3 \\ 26 | \end{tabular} 27 | 28 | \begin{tabular}{cc|ccc} 29 | & & \multicolumn{3}{c}{\textbf{DGX-A100 batch size 2048}} \\ 30 | \hline 31 | \multirow{2}{*}{Case} & Number of & Iteration & \multirow{2}{*}{Scaling} & TeraFLOPs \\ 32 | & GPUs & Time (ms) & & per GPU \\ 33 | \hline 34 | 1B & 128 & 1153 & 93.3\% & 139.8 \\ 35 | 2B & 256 & 1101 & 85.5\% & 127.5 \\ 36 | 4B & 512 & 1242 & 79.0\% & 116.7 \\ 37 | 8B & 1024 & 1380 & 72.7\% & 106.5 \\ 38 | \end{tabular} 39 | 40 | \end{document} 41 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/finetune_mnli_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/glue_data/MNLI/train.tsv" 12 | VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \ 13 | data/glue_data/MNLI/dev_mismatched.tsv" 14 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 15 | VOCAB_FILE=bert-vocab.txt 16 | CHECKPOINT_PATH=checkpoints/bert_345m_mnli 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task 
MNLI \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA \ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 5 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --batch-size 8 \ 32 | --checkpoint-activations \ 33 | --lr 5.0e-5 \ 34 | --lr-decay-style linear \ 35 | --warmup 0.065 \ 36 | --seq-length 512 \ 37 | --max-position-embeddings 512 \ 38 | --save-interval 500000 \ 39 | --save $CHECKPOINT_PATH \ 40 | --log-interval 10 \ 41 | --eval-interval 100 \ 42 | --eval-iters 50 \ 43 | --weight-decay 1.0e-1 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/finetune_race_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/RACE/train/middle" 12 | VALID_DATA="data/RACE/dev/middle \ 13 | data/RACE/dev/high" 14 | VOCAB_FILE=bert-vocab.txt 15 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 16 | CHECKPOINT_PATH=checkpoints/bert_345m_race 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task RACE \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA \ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 3 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --batch-size 4 \ 32 | --checkpoint-activations \ 33 | --lr 1.0e-5 \ 34 | --lr-decay-style linear \ 35 | --warmup 0.06 \ 36 | --seq-length 512 
\ 37 | --max-position-embeddings 512 \ 38 | --save-interval 100000 \ 39 | --save $CHECKPOINT_PATH \ 40 | --log-interval 10 \ 41 | --eval-interval 100 \ 42 | --eval-iters 50 \ 43 | --weight-decay 1.0e-1 \ 44 | --clip-grad 1.0 \ 45 | --hidden-dropout 0.1 \ 46 | --attention-dropout 0.1 \ 47 | --fp16 48 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/tools/openwebtext/merge_jsons.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | import glob 18 | import sys 19 | import json 20 | import argparse 21 | 22 | if __name__ == '__main__': 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--json_path", type=str, default=".", 26 | help="path where all the json files are located") 27 | 28 | parser.add_argument("--output_file", type=str, default="merged_output.json", 29 | help="filename where the merged json should go") 30 | 31 | args = parser.parse_args() 32 | 33 | json_path = args.json_path 34 | out_file = args.output_file 35 | 36 | json_files = glob.glob(json_path + '/*.json') 37 | 38 | counter = 0 39 | 40 | with open(out_file, 'w') as outfile: 41 | for fname in json_files: 42 | counter += 1 43 | 44 | if counter % 1024 == 0: 45 | print("Merging at ", counter, flush=True) 46 | 47 | with open(fname, 'r') as infile: 48 | for row in infile: 49 | each_row = json.loads(row) 50 | outfile.write(row) 51 | 52 | 53 | print("Merged file", out_file, flush=True) 54 | 55 | 56 | -------------------------------------------------------------------------------- /inference/cheminformatics/common/cuchemcommon/data/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/NVIDIA/cheminformatics/blob/master/common/cuchemcommon/data/__init__.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_CHEMINFORMATICS). 7 | # --------------------------------------------------------------- 8 | 9 | 10 | from typing import List 11 | 12 | 13 | class ClusterWfDAO(object): 14 | """ 15 | Base class for all DAO for fetching data for Clustering Workflows 16 | """ 17 | 18 | def meta_df(self): 19 | """ 20 | Returns df with dtype set for structure without any column filter. 
21 | """ 22 | return NotImplemented 23 | 24 | def fetch_molecular_embedding(self, n_molecules: int, cache_directory: str = None): 25 | """ 26 | Fetch molecular properties from database/cache into a dask array. 27 | """ 28 | return NotImplemented 29 | 30 | def fetch_molecular_embedding_by_id(self, molecule_id: List): 31 | """ 32 | Fetch molecular properties from database for the given id. Id depends on 33 | the backend databse. For chemble DB it should be molregid. 34 | """ 35 | return NotImplemented 36 | 37 | def fetch_id_from_smile(self, new_molecules: List): 38 | """ 39 | Fetch molecular details for a list of molecules. The values in the list 40 | of molecules depends on database/service used. For e.g. it could be 41 | ChemblId or molreg_id for Chemble database. 42 | """ 43 | return NotImplemented 44 | 45 | 46 | class GenerativeWfDao(object): 47 | 48 | def fetch_id_from_chembl(self, id: List): 49 | """ 50 | Fetch molecular details for a list of molecules. The values in the list 51 | of molecules depends on database/service used. For e.g. it could be 52 | ChemblId or molreg_id for Chemble database. 53 | """ 54 | return NotImplemented 55 | -------------------------------------------------------------------------------- /inference/cheminformatics/common/cuchemcommon/context.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/NVIDIA/cheminformatics/blob/master/common/cuchemcommon/context.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_CHEMINFORMATICS). 
7 | # --------------------------------------------------------------- 8 | 9 | import logging 10 | import os 11 | from configparser import RawConfigParser 12 | from io import StringIO 13 | 14 | from cuchemcommon.utils.singleton import Singleton 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | CONFIG_FILE = '.env' 19 | 20 | 21 | class Context(metaclass=Singleton): 22 | 23 | def __init__(self): 24 | 25 | self.dask_client = None 26 | self.compute_type = 'gpu' 27 | self.is_benchmark = False 28 | self.benchmark_file = None 29 | self.cache_directory = None 30 | self.n_molecule = None 31 | self.batch_size = 10000 32 | 33 | self.config = {} 34 | if os.path.exists(CONFIG_FILE): 35 | logger.info('Reading properties from %s...', CONFIG_FILE) 36 | self.config = self._load_properties_file(CONFIG_FILE) 37 | else: 38 | logger.warn('Could not locate %s', CONFIG_FILE) 39 | 40 | def _load_properties_file(self, properties_file): 41 | """ 42 | Reads a properties file using ConfigParser. 43 | 44 | :param propertiesFile/configFile: 45 | """ 46 | config_file = open(properties_file, 'r') 47 | config_content = StringIO('[root]\n' + config_file.read()) 48 | config = RawConfigParser() 49 | config.read_file(config_content) 50 | 51 | return config._sections['root'] 52 | 53 | def get_config(self, config_name, default=None): 54 | """ 55 | Returns values from local configuration. 56 | """ 57 | try: 58 | return self.config[config_name] 59 | except KeyError: 60 | logger.warn('%s not found, returing default.', config_name) 61 | return default 62 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/goal_directed_generator.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # This file has been modified from guacamol benchmark. 
5 | # 6 | # Source: 7 | # https://github.com/BenevolentAI/guacamol/blob/master/guacamol/goal_directed_generator.py 8 | # 9 | # The license for the original version of this file can be 10 | # found in this directory (LICENSE_GUACAMOL). 11 | # The modifications to this file are subject to the same license. 12 | # --------------------------------------------------------------- 13 | import os 14 | import sys 15 | from abc import ABCMeta, abstractmethod 16 | from typing import List, Optional 17 | import pytorch_lightning as pl 18 | from guacamol.guacamol.scoring_function import ScoringFunction 19 | 20 | project_home = os.environ['PROJECT_HOME'] 21 | sys.path.insert(1, project_home + '/inference') 22 | from inference import MegaMolBART 23 | 24 | 25 | class GoalDirectedGenerator(metaclass=ABCMeta): 26 | """ 27 | Interface for goal-directed molecule generators. 28 | """ 29 | 30 | def __init__(self, model_path, ret_data_path, 31 | model_ckpt_itr=50000, max_mol_len=200): 32 | ''' 33 | my defined initialization 34 | ''' 35 | self.wf = MegaMolBART(model_path=model_path, model_ckpt_itr=model_ckpt_itr, decoder_max_seq_len=max_mol_len) 36 | self.ret_dataset = None 37 | 38 | 39 | @abstractmethod 40 | def generate_optimized_molecules(self, scoring_function: ScoringFunction, number_molecules: int, 41 | starting_population: Optional[List[str]], benchmark_name: str): 42 | """ 43 | Given an objective function, generate molecules that score as high as possible. 44 | 45 | Args: 46 | scoring_function: scoring function 47 | number_molecules: number of molecules to generate 48 | starting_population: molecules to start the optimization from (optional) 49 | benchmark_name: benchmark name 50 | 51 | Returns: 52 | A list of SMILES strings for the generated molecules. 
53 | """ 54 | pl.utilities.seed.seed_everything(1234) 55 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/utils/fingerprints.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/BenevolentAI/guacamol/blob/master/guacamol/utils/fingerprints.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_GUACAMOL). 7 | # --------------------------------------------------------------- 8 | 9 | from rdkit.Chem import AllChem, Mol 10 | from rdkit.Chem.AtomPairs.Sheridan import GetBPFingerprint, GetBTFingerprint 11 | from rdkit.Chem.Pharm2D import Generate, Gobbi_Pharm2D 12 | 13 | 14 | class _FingerprintCalculator: 15 | """ 16 | Calculate the fingerprint while avoiding a series of if-else. 17 | See recipe 8.21 of the book "Python Cookbook". 18 | 19 | To support a new type of fingerprint, just add a function "get_fpname(self, mol)". 
20 | """ 21 | 22 | def get_fingerprint(self, mol: Mol, fp_type: str): 23 | method_name = 'get_' + fp_type 24 | method = getattr(self, method_name) 25 | if method is None: 26 | raise Exception(f'{fp_type} is not a supported fingerprint type.') 27 | return method(mol) 28 | 29 | def get_AP(self, mol: Mol): 30 | return AllChem.GetAtomPairFingerprint(mol, maxLength=10) 31 | 32 | def get_PHCO(self, mol: Mol): 33 | return Generate.Gen2DFingerprint(mol, Gobbi_Pharm2D.factory) 34 | 35 | def get_BPF(self, mol: Mol): 36 | return GetBPFingerprint(mol) 37 | 38 | def get_BTF(self, mol: Mol): 39 | return GetBTFingerprint(mol) 40 | 41 | def get_PATH(self, mol: Mol): 42 | return AllChem.RDKFingerprint(mol) 43 | 44 | def get_ECFP4(self, mol: Mol): 45 | return AllChem.GetMorganFingerprint(mol, 2) 46 | 47 | def get_ECFP6(self, mol: Mol): 48 | return AllChem.GetMorganFingerprint(mol, 3) 49 | 50 | def get_FCFP4(self, mol: Mol): 51 | return AllChem.GetMorganFingerprint(mol, 2, useFeatures=True) 52 | 53 | def get_FCFP6(self, mol: Mol): 54 | return AllChem.GetMorganFingerprint(mol, 3, useFeatures=True) 55 | 56 | 57 | def get_fingerprint(mol: Mol, fp_type: str): 58 | return _FingerprintCalculator().get_fingerprint(mol=mol, fp_type=fp_type) 59 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/tools/openwebtext/README.md: -------------------------------------------------------------------------------- 1 | The following steps show how to prepare training dataset to train the mode. 2 | 3 | # Libraries to install 4 | 5 | ``` 6 | pip install ftfy langdetect numpy torch pandas nltk sentencepiece boto3 tqdm regex bs4 newspaper3k htmlmin tldextract 7 | git clone https://github.com/mattilyra/LSH 8 | cd LSH 9 | python setup.py install 10 | ``` 11 | 12 | # Download the dataset 13 | 14 | 1. Download the deduplicated URLs from [jcpeterson](https://mega.nz/#F!EZZD0YwJ!9_PlEQzdMVLaNdKv_ICNVQ!cc4RgQQZ) 15 | 2. 
Remove blacklisted URLs. 16 | ``` 17 | python blacklist_urls.py 18 | ``` 19 | 3. Download the content from the clean urls with [openwebtext's utilities](https://github.com/eukaryote31/openwebtext/blob/master/download.py). 20 | 21 | 4. Merge the contents into one loose json file with 1 json per newline of the format `{'text': text, 'url': unique_url}`. It is important for the url to be unique. 22 | 23 | # Prepare the data for GPT-2 training: 24 | 25 | 1. Perform ftfy, English detection and remove documents with less than 128 tokens. This step can be sharded and run on shards. 26 | ``` 27 | python cleanup_dataset.py 28 | ``` 29 | 2. Using LSH, find possible duplicates and store them in a file for later processing. This step can NOT be sharded and usually takes 12 to 24 hours for OpenWebText dataset. 30 | ``` 31 | python find_duplicates.py 32 | ``` 33 | 3. Based on the similarity measure defined inside function `is_similar` (default: 0.9), group urls that are similar. Basically, for each group, we should keep only one url and remove the rest. 34 | ``` 35 | python group_duplicate_urls.py 36 | ``` 37 | 4. Remove similar documents that were detected in the last step. 38 | ``` 39 | python remove_group_duplicates.py 40 | ``` 41 | 42 | 5. Shuffle the dataset. 43 | ``` 44 | shuf -o train_data.json 45 | ``` 46 | 47 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/corpora.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """several datasets with preset arguments""" 16 | from .datasets import json_dataset, csv_dataset 17 | import os 18 | 19 | 20 | class wikipedia(json_dataset): 21 | """ 22 | dataset for wikipedia with arguments configured for convenience 23 | 24 | command line usage: `--train-data wikipedia` 25 | """ 26 | PATH = 'data/wikipedia/wikidump_lines.json' 27 | assert_str = "make sure to set PATH for wikipedia data_utils/corpora.py" 28 | 29 | def __init__(self, **kwargs): 30 | assert os.path.exists(wikipedia.PATH), \ 31 | wikipedia.assert_str 32 | if not kwargs: 33 | kwargs = {} 34 | kwargs['text_key'] = 'text' 35 | kwargs['loose_json'] = True 36 | super(wikipedia, self).__init__(wikipedia.PATH, **kwargs) 37 | 38 | 39 | class webtext(json_dataset): 40 | """ 41 | dataset for webtext with arguments configured for convenience 42 | 43 | command line usage: `--train-data webtext` 44 | """ 45 | PATH = 'data/webtext/data.json' 46 | assert_str = "make sure to set PATH for webtext data_utils/corpora.py" 47 | 48 | def __init__(self, **kwargs): 49 | assert os.path.exists(webtext.PATH), \ 50 | webtext.assert_str 51 | if not kwargs: 52 | kwargs = {} 53 | kwargs['text_key'] = 'text' 54 | kwargs['loose_json'] = True 55 | super(webtext, self).__init__(webtext.PATH, **kwargs) 56 | 57 | 58 | NAMED_CORPORA = { 59 | 'wikipedia': wikipedia, 60 | 'webtext': webtext, 61 | } 62 | -------------------------------------------------------------------------------- 
/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/deprecated_data_utils/corpora.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """several datasets with preset arguments""" 16 | from .datasets import json_dataset, csv_dataset 17 | import os 18 | 19 | 20 | class wikipedia(json_dataset): 21 | """ 22 | dataset for wikipedia with arguments configured for convenience 23 | 24 | command line usage: `--train-data wikipedia` 25 | """ 26 | PATH = 'data/wikipedia/wikidump_lines.json' 27 | assert_str = "make sure to set PATH for wikipedia data_utils/corpora.py" 28 | 29 | def __init__(self, **kwargs): 30 | assert os.path.exists(wikipedia.PATH), \ 31 | wikipedia.assert_str 32 | if not kwargs: 33 | kwargs = {} 34 | kwargs['text_key'] = 'text' 35 | kwargs['loose_json'] = True 36 | super(wikipedia, self).__init__(wikipedia.PATH, **kwargs) 37 | 38 | 39 | class webtext(json_dataset): 40 | """ 41 | dataset for webtext with arguments configured for convenience 42 | 43 | command line usage: `--train-data webtext` 44 | """ 45 | PATH = 'data/webtext/data.json' 46 | assert_str = "make sure to set PATH for webtext data_utils/corpora.py" 47 | 48 | def __init__(self, **kwargs): 49 | assert os.path.exists(webtext.PATH), \ 50 | webtext.assert_str 51 | if not kwargs: 52 | 
kwargs = {} 53 | kwargs['text_key'] = 'text' 54 | kwargs['loose_json'] = True 55 | super(webtext, self).__init__(webtext.PATH, **kwargs) 56 | 57 | 58 | NAMED_CORPORA = { 59 | 'wikipedia': wikipedia, 60 | 'webtext': webtext, 61 | } 62 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/tasks/race/finetune.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Race.""" 17 | 18 | from megatron import get_args 19 | from megatron import print_rank_0 20 | from megatron import get_tokenizer 21 | from megatron.model.multiple_choice import MultipleChoice 22 | from tasks.eval_utils import accuracy_func_provider 23 | from tasks.finetune_utils import finetune 24 | from tasks.race.data import RaceDataset 25 | 26 | 27 | def train_valid_datasets_provider(): 28 | """Provide train and validation datasets.""" 29 | args = get_args() 30 | tokenizer = get_tokenizer() 31 | 32 | train_dataset = RaceDataset('training', args.train_data, 33 | tokenizer, args.seq_length) 34 | valid_dataset = RaceDataset('validation', args.valid_data, 35 | tokenizer, args.seq_length) 36 | 37 | return train_dataset, valid_dataset 38 | 39 | 40 | def model_provider(): 41 | """Build the model.""" 42 | 43 | print_rank_0('building multichoice model for RACE ...') 44 | 45 | return MultipleChoice(num_tokentypes=2) 46 | 47 | 48 | def metrics_func_provider(): 49 | """Privde metrics callback function.""" 50 | args = get_args() 51 | tokenizer = get_tokenizer() 52 | 53 | def single_dataset_provider(datapath): 54 | name = datapath.split('RACE')[-1].strip('/').replace('/', '-') 55 | return RaceDataset(name, [datapath], tokenizer, args.seq_length) 56 | 57 | return accuracy_func_provider(single_dataset_provider) 58 | 59 | 60 | def main(): 61 | 62 | finetune(train_valid_datasets_provider, model_provider, 63 | end_of_epoch_callback_provider=metrics_func_provider) 64 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/utils/descriptors.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/BenevolentAI/guacamol/blob/master/guacamol/utils/descriptors.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this 
directory (LICENSE_GUACAMOL). 7 | # --------------------------------------------------------------- 8 | 9 | from rdkit import Chem 10 | from rdkit.Chem import Descriptors, Mol, rdMolDescriptors 11 | 12 | 13 | def logP(mol: Mol) -> float: 14 | return Descriptors.MolLogP(mol) 15 | 16 | 17 | def qed(mol: Mol) -> float: 18 | return Descriptors.qed(mol) 19 | 20 | 21 | def tpsa(mol: Mol) -> float: 22 | return Descriptors.TPSA(mol) 23 | 24 | 25 | def bertz(mol: Mol) -> float: 26 | return Descriptors.BertzCT(mol) 27 | 28 | 29 | def mol_weight(mol: Mol) -> float: 30 | return Descriptors.MolWt(mol) 31 | 32 | 33 | def num_H_donors(mol: Mol) -> int: 34 | return Descriptors.NumHDonors(mol) 35 | 36 | 37 | def num_H_acceptors(mol: Mol) -> int: 38 | return Descriptors.NumHAcceptors(mol) 39 | 40 | 41 | def num_rotatable_bonds(mol: Mol) -> int: 42 | return Descriptors.NumRotatableBonds(mol) 43 | 44 | 45 | def num_rings(mol: Mol) -> int: 46 | return rdMolDescriptors.CalcNumRings(mol) 47 | 48 | 49 | def num_aromatic_rings(mol: Mol) -> int: 50 | return rdMolDescriptors.CalcNumAromaticRings(mol) 51 | 52 | 53 | def num_atoms(mol: Mol) -> int: 54 | """ 55 | Returns the total number of atoms, H included 56 | """ 57 | mol = Chem.AddHs(mol) 58 | return mol.GetNumAtoms() 59 | 60 | 61 | class AtomCounter: 62 | 63 | def __init__(self, element: str) -> None: 64 | """ 65 | Args: 66 | element: element to count within a molecule 67 | """ 68 | self.element = element 69 | 70 | def __call__(self, mol: Mol) -> int: 71 | """ 72 | Count the number of atoms of a given type. 73 | 74 | Args: 75 | mol: molecule 76 | 77 | Returns: 78 | The number of atoms of the given type. 
79 | """ 80 | # if the molecule contains H atoms, they may be implicit, so add them 81 | if self.element == 'H': 82 | mol = Chem.AddHs(mol) 83 | 84 | return sum(1 for a in mol.GetAtoms() if a.GetSymbol() == self.element) 85 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | torch._C._jit_set_profiling_mode(False) 19 | torch._C._jit_set_profiling_executor(False) 20 | torch._C._jit_override_can_fuse_on_cpu(True) 21 | torch._C._jit_override_can_fuse_on_gpu(True) 22 | 23 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 24 | # 1/sqrt(2*pi)-> 0.3989423 25 | # 1/sqrt(2) -> 0.70710678 26 | # sqrt(2/pi) -> 0.79788456 27 | # this function is tanh approximation of gelu 28 | # actual gelu is: 29 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 30 | 31 | @torch.jit.script 32 | def bias_gelu(bias, y): 33 | x = bias + y 34 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 35 | 36 | # gradient of tanh approximation of gelu 37 | # gradient of actual gelu is: 38 | # 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 39 | @torch.jit.script 40 | def bias_gelu_back(g, bias, y): 41 | x = bias + y 42 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 43 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 44 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 45 | return ff*g 46 | 47 | class GeLUFunction(torch.autograd.Function): 48 | @staticmethod 49 | # bias is an optional argument 50 | def forward(ctx, input, bias): 51 | ctx.save_for_backward(input, bias) 52 | return bias_gelu(bias, input) 53 | 54 | @staticmethod 55 | def backward(ctx, grad_output): 56 | input, bias = ctx.saved_tensors 57 | tmp = bias_gelu_back(grad_output, bias, input) 58 | return tmp, tmp 59 | 60 | bias_gelu_impl = GeLUFunction.apply 61 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
import torch

# Turn off the JIT profiling executor and allow fusion so TorchScript can
# fuse the elementwise math below into a single kernel.
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)

# Constants appearing below:
#   1/sqrt(2*pi) -> 0.3989423
#   1/sqrt(2)    -> 0.70710678
#   sqrt(2/pi)   -> 0.79788456
# The exact GELU would be x * 0.5 * (1.0 + torch.erf(x * 0.70710678));
# these scripted functions use the cheaper tanh approximation instead.


@torch.jit.script
def bias_gelu(bias, y):
    """Fused bias-add followed by tanh-approximated GELU (no autograd)."""
    x = bias + y
    inner = 0.79788456 * x * (1 + 0.044715 * x * x)
    return x * 0.5 * (1.0 + torch.tanh(inner))


@torch.jit.script
def bias_gelu_back(g, bias, y):
    """Gradient of bias_gelu w.r.t. its summed input, times incoming grad g."""
    x = bias + y
    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
    # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
    sech_term = (1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)
    ff = 0.5 * x * sech_term + 0.5 * (1 + tanh_out)
    return ff * g


class GeLUFunction(torch.autograd.Function):
    """Autograd-visible entry point for the fused bias+GELU kernels."""

    @staticmethod
    def forward(ctx, input, bias):
        # Stash both operands; backward needs them to recompute the tanh.
        ctx.save_for_backward(input, bias)
        return bias_gelu(bias, input)

    @staticmethod
    def backward(ctx, grad_output):
        saved_input, saved_bias = ctx.saved_tensors
        grad = bias_gelu_back(grad_output, saved_bias, saved_input)
        # x = input + bias, so the gradients w.r.t. input and bias coincide.
        return grad, grad


bias_gelu_impl = GeLUFunction.apply
def get_machine_config():
    """Return a dict describing this machine's CPU and GPU(s).

    Returns:
        dict with a 'cpu' section (core counts, MHz frequencies, GB memory)
        and a 'gpu' section (device names and per-device memory in GB).
    """
    gib = 1024.0 ** 3

    # CPU topology, clocks, and RAM.
    n_physical = psutil.cpu_count(logical=False)
    n_logical = psutil.cpu_count(logical=True)
    freq = psutil.cpu_freq()
    vmem = psutil.virtual_memory()

    # GPU enumeration via NVML.
    nv.nvmlInit()
    driver_version = nv.nvmlSystemGetDriverVersion()  # NOTE(review): queried but never returned; kept for parity
    gpu_devices = []
    gpu_mems = []
    for idx in range(nv.nvmlDeviceGetCount()):
        handle = nv.nvmlDeviceGetHandleByIndex(idx)
        gpu_devices.append(nv.nvmlDeviceGetName(handle).decode("utf-8"))
        gpu_mems.append(nv.nvmlDeviceGetMemoryInfo(handle).total / gib)

    return {'cpu': {'physical_cores': n_physical, 'logical_cores': n_logical,
                    'min_freq_MHz': freq.min, 'max_freq_MHz': freq.max, 'cur_freq_MHz': freq.current,
                    'total_mem_GB': vmem.total / gib, 'avail_mem_GB': vmem.available / gib},
            'gpu': {'devices': gpu_devices, 'mem_GB': gpu_mems}}


def print_machine_config(config):
    """Format a config dict (from get_machine_config) as a one-line summary."""
    cpu = config['cpu']
    cpu_summary = (f"{int(round(cpu['max_freq_MHz'], 0))} MHz CPU with "
                   f"{cpu['physical_cores']} cores, "
                   f"{int(round(cpu['total_mem_GB'], 0))} GB RAM")

    # Group identical (device name, rounded GB) pairs so repeated GPUs
    # collapse into a single "N x <name>" entry.
    device_counts = Counter(
        (name, int(round(mem, 0)))
        for name, mem in zip(config['gpu']['devices'], config['gpu']['mem_GB'])
    )
    # NOTE(review): the memory value is part of the grouping key but is not
    # displayed, and multiple distinct GPU models concatenate with no
    # separator — preserved as-is to keep the original output format.
    gpu_summary = ''
    for (name, _mem), count in device_counts.items():
        gpu_summary += f'{count} x {name} GPU(s)'

    return ', '.join([cpu_summary, gpu_summary])
15 | 16 | """Model parallel utility interface.""" 17 | 18 | from .cross_entropy import vocab_parallel_cross_entropy 19 | 20 | from .data import broadcast_data 21 | 22 | from .grads import clip_grad_norm 23 | 24 | from .initialize import is_unitialized 25 | from .initialize import destroy_model_parallel 26 | from .initialize import get_data_parallel_group 27 | from .initialize import get_data_parallel_rank 28 | from .initialize import get_data_parallel_world_size 29 | from .initialize import get_model_parallel_group 30 | from .initialize import get_model_parallel_rank, set_model_parallel_rank 31 | from .initialize import get_model_parallel_src_rank 32 | from .initialize import get_model_parallel_world_size, set_model_parallel_world_size 33 | from .initialize import get_topology 34 | from .initialize import get_pipe_parallel_group 35 | from .initialize import get_pipe_parallel_rank 36 | from .initialize import get_pipe_parallel_world_size 37 | from .initialize import get_io_parallel_group 38 | from .initialize import initialize_model_parallel 39 | from .initialize import model_parallel_is_initialized 40 | 41 | from .layers import LayerNorm 42 | from .layers import ColumnParallelLinear 43 | from .layers import RowParallelLinear 44 | from .layers import VocabParallelEmbedding 45 | 46 | from .mappings import copy_to_model_parallel_region 47 | from .mappings import gather_from_model_parallel_region 48 | from .mappings import reduce_from_model_parallel_region 49 | from .mappings import scatter_to_model_parallel_region 50 | 51 | from .random import checkpoint 52 | from .random import get_cuda_rng_tracker 53 | from .random import init_checkpointed_activations_memory_buffer 54 | from .random import model_parallel_cuda_manual_seed 55 | from .random import reset_checkpointed_activations_memory_buffer 56 | 57 | from .utils import divide 58 | from .utils import split_tensor_along_last_dim 59 | -------------------------------------------------------------------------------- 
/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Model parallel utility interface.""" 17 | 18 | from .cross_entropy import vocab_parallel_cross_entropy 19 | 20 | from .data import broadcast_data 21 | 22 | from .grads import clip_grad_norm 23 | 24 | from .initialize import is_unitialized 25 | from .initialize import destroy_model_parallel 26 | from .initialize import get_data_parallel_group 27 | from .initialize import get_data_parallel_rank 28 | from .initialize import get_data_parallel_world_size 29 | from .initialize import get_model_parallel_group 30 | from .initialize import get_model_parallel_rank, set_model_parallel_rank 31 | from .initialize import get_model_parallel_src_rank 32 | from .initialize import get_model_parallel_world_size, set_model_parallel_world_size 33 | from .initialize import get_topology 34 | from .initialize import get_pipe_parallel_group 35 | from .initialize import get_pipe_parallel_rank 36 | from .initialize import get_pipe_parallel_world_size 37 | from .initialize import get_io_parallel_group 38 | from .initialize import initialize_model_parallel 39 | from .initialize import model_parallel_is_initialized 40 | 41 | from .layers 
import LayerNorm 42 | from .layers import ColumnParallelLinear 43 | from .layers import RowParallelLinear 44 | from .layers import VocabParallelEmbedding 45 | 46 | from .mappings import copy_to_model_parallel_region 47 | from .mappings import gather_from_model_parallel_region 48 | from .mappings import reduce_from_model_parallel_region 49 | from .mappings import scatter_to_model_parallel_region 50 | 51 | from .random import checkpoint 52 | from .random import get_cuda_rng_tracker 53 | from .random import init_checkpointed_activations_memory_buffer 54 | from .random import model_parallel_cuda_manual_seed 55 | from .random import reset_checkpointed_activations_memory_buffer 56 | 57 | from .utils import divide 58 | from .utils import split_tensor_along_last_dim 59 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | namespace multihead_attn { 22 | namespace fused_softmax { 23 | namespace scaled_upper_triang_masked_softmax { 24 | 25 | torch::Tensor fwd_cuda( 26 | torch::Tensor const& input, 27 | float scale_factor); 28 | 29 | torch::Tensor bwd_cuda( 30 | torch::Tensor const& output_grads, 31 | torch::Tensor const& softmax_results, 32 | float scale_factor); 33 | 34 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor) { 35 | AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); 36 | AT_ASSERTM(input.scalar_type() == at::ScalarType::Half, 37 | "Only HALF is supported"); 38 | 39 | return fwd_cuda(input, scale_factor); 40 | } 41 | 42 | torch::Tensor bwd( 43 | torch::Tensor const& output_grads, 44 | torch::Tensor const& softmax_results, 45 | float scale_factor) { 46 | 47 | AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); 48 | AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); 49 | 50 | AT_ASSERTM(output_grads.scalar_type() == at::ScalarType::Half, 51 | "Only HALF is supported"); 52 | AT_ASSERTM(softmax_results.scalar_type() == at::ScalarType::Half, 53 | "Only HALF is supported"); 54 | 55 | return bwd_cuda(output_grads, softmax_results, scale_factor); 56 | } 57 | 58 | } // end namespace scaled_upper_triang_masked_softmax 59 | } // end namespace fused_softmax 60 | } // end namespace multihead_attn 61 | 62 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 63 | m.def("forward", 64 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd, 65 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 66 | m.def("backward", 67 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd, 68 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 69 | } 70 | -------------------------------------------------------------------------------- 
/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/fused_kernels/scaled_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | namespace multihead_attn { 22 | namespace fused_softmax { 23 | namespace scaled_masked_softmax { 24 | 25 | torch::Tensor fwd_cuda( 26 | torch::Tensor const& input, 27 | torch::Tensor const& mask, 28 | float scale_factor); 29 | 30 | torch::Tensor bwd_cuda( 31 | torch::Tensor const& output_grads, 32 | torch::Tensor const& softmax_results, 33 | float scale_factor); 34 | 35 | torch::Tensor fwd( 36 | torch::Tensor const& input, 37 | torch::Tensor const& mask, 38 | float scale_factor) { 39 | AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); 40 | AT_ASSERTM(input.scalar_type() == at::ScalarType::Half, 41 | "Only HALF is supported"); 42 | AT_ASSERTM(mask.dim() == 4, "expected 4D tensor"); 43 | 44 | return fwd_cuda(input, mask, scale_factor); 45 | } 46 | 47 | torch::Tensor bwd( 48 | torch::Tensor const& output_grads, 49 | torch::Tensor const& softmax_results, 50 | float scale_factor) { 51 | 52 | AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor"); 53 | AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor"); 54 | 55 | 
AT_ASSERTM(output_grads.scalar_type() == at::ScalarType::Half, 56 | "Only HALF is supported"); 57 | AT_ASSERTM(softmax_results.scalar_type() == at::ScalarType::Half, 58 | "Only HALF is supported"); 59 | 60 | return bwd_cuda(output_grads, softmax_results, scale_factor); 61 | } 62 | 63 | } // end namespace scaled_masked_softmax 64 | } // end namespace fused_softmax 65 | } // end namespace multihead_attn 66 | 67 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 68 | m.def("forward", 69 | &multihead_attn::fused_softmax::scaled_masked_softmax::fwd, 70 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 71 | m.def("backward", 72 | &multihead_attn::fused_softmax::scaled_masked_softmax::bwd, 73 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 74 | } 75 | -------------------------------------------------------------------------------- /inference/cheminformatics/common/cuchemcommon/data/cluster_wf.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/NVIDIA/cheminformatics/blob/master/common/cuchemcommon/data/cluster_wf.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_CHEMINFORMATICS). 7 | # --------------------------------------------------------------- 8 | 9 | import logging 10 | import math 11 | import os 12 | from typing import List 13 | 14 | import cudf 15 | import dask 16 | import dask_cudf 17 | from cuchemcommon.context import Context 18 | from cuchemcommon.data.helper.chembldata import BATCH_SIZE, ChEmblData 19 | from cuchemcommon.utils.singleton import Singleton 20 | 21 | from . 
logger = logging.getLogger(__name__)

FINGER_PRINT_FILES = 'filter_*.h5'


class ChemblClusterWfDao(ClusterWfDAO, metaclass=Singleton):
    """Cluster-workflow DAO backed by ChEMBL fingerprint data."""

    def __init__(self, fp_type):
        self.chem_data = ChEmblData(fp_type)

    def meta_df(self):
        """Return the meta (schema) dataframe for molecular embeddings."""
        # BUG FIX: previously constructed a throwaway ChEmblData() with no
        # fp_type argument, bypassing the configured fingerprint type (and
        # failing if the argument is required); reuse the __init__ instance.
        return self.chem_data._meta_df()

    def fetch_molecular_embedding(self,
                                  n_molecules: int,
                                  cache_directory: str = None):
        """Load up to n_molecules embeddings, preferring the HDF5 cache.

        Args:
            n_molecules: number of molecules to read; <= 0 means all.
            cache_directory: directory with filter_*.h5 files; when falsy,
                fall back to querying the database.
        """
        context = Context()
        if cache_directory:
            hdf_path = os.path.join(cache_directory, FINGER_PRINT_FILES)
            logger.info('Reading %d rows from %s...', n_molecules, hdf_path)
            mol_df = dask.dataframe.read_hdf(hdf_path, 'fingerprints')

            if n_molecules > 0:
                # Only pull as many partitions as the requested rows need.
                npartitions = math.ceil(n_molecules / BATCH_SIZE)
                mol_df = mol_df.head(n_molecules, compute=False, npartitions=npartitions)
        else:
            logger.info('Reading molecules from database...')
            mol_df = self.chem_data.fetch_mol_embedding(num_recs=n_molecules,
                                                        batch_size=context.batch_size)

        return mol_df

    def fetch_molecular_embedding_by_id(self, molecule_id: List):
        """Fetch embeddings for specific molregnos as a single-partition dask_cudf frame."""
        context = Context()
        meta = self.chem_data._meta_df()
        fp_df = self.chem_data._fetch_mol_embedding(molregnos=molecule_id,
                                                    batch_size=context.batch_size) \
            .astype(meta.dtypes)

        fp_df = cudf.from_pandas(fp_df)
        fp_df = dask_cudf.from_cudf(fp_df, npartitions=1).reset_index()
        return fp_df

    def fetch_id_from_chembl(self, new_molecules: List):
        """Map molregnos of new molecules to their ChEMBL IDs."""
        logger.debug('Fetch ChEMBL ID using molregno...')
        return self.chem_data.fetch_id_from_chembl(new_molecules)
megatron/arguments.py 6 | megatron/checkpointing.py 7 | megatron/global_vars.py 8 | megatron/indexer.py 9 | megatron/initialize.py 10 | megatron/learning_rates.py 11 | megatron/memory.py 12 | megatron/module.py 13 | megatron/package_info.py 14 | megatron/text_generation_utils.py 15 | megatron/training.py 16 | megatron/utils.py 17 | megatron/data/Makefile 18 | megatron/data/__init__.py 19 | megatron/data/bert_dataset.py 20 | megatron/data/dataset_utils.py 21 | megatron/data/gpt2_dataset.py 22 | megatron/data/helpers.cpp 23 | megatron/data/ict_dataset.py 24 | megatron/data/indexed_dataset.py 25 | megatron/data/realm_dataset_utils.py 26 | megatron/data/realm_index.py 27 | megatron/data/samplers.py 28 | megatron/deprecated_data_utils/__init__.py 29 | megatron/deprecated_data_utils/configure_data.py 30 | megatron/deprecated_data_utils/corpora.py 31 | megatron/deprecated_data_utils/datasets.py 32 | megatron/deprecated_data_utils/file_utils.py 33 | megatron/deprecated_data_utils/lazy_loader.py 34 | megatron/deprecated_data_utils/samplers.py 35 | megatron/deprecated_data_utils/tf_dl.py 36 | megatron/deprecated_data_utils/tokenization.py 37 | megatron/deprecated_data_utils/tokenization_gpt2.py 38 | megatron/deprecated_data_utils/wordpiece.py 39 | megatron/fp16/__init__.py 40 | megatron/fp16/fp16.py 41 | megatron/fp16/fp16util.py 42 | megatron/fp16/loss_scaler.py 43 | megatron/fused_kernels/__init__.py 44 | megatron/model/__init__.py 45 | megatron/model/bert_model.py 46 | megatron/model/classification.py 47 | megatron/model/distributed.py 48 | megatron/model/fused_bias_gelu.py 49 | megatron/model/fused_softmax.py 50 | megatron/model/gpt2_model.py 51 | megatron/model/language_model.py 52 | megatron/model/multiple_choice.py 53 | megatron/model/realm_model.py 54 | megatron/model/transformer.py 55 | megatron/model/utils.py 56 | megatron/mpu/__init__.py 57 | megatron/mpu/cross_entropy.py 58 | megatron/mpu/data.py 59 | megatron/mpu/grads.py 60 | megatron/mpu/initialize.py 61 | 
class ScoreContributionSpecification:
    """Specifies how to calculate the score of a goal-directed benchmark.

    The global score is a weighted average of several top-x scores; this
    class records which top-x values to consider and their weights.
    """

    def __init__(self, contributions: List[Tuple[int, float]]) -> None:
        """
        Args:
            contributions: List of tuples (top_count, weight) for the score contributions
        """
        self.contributions = contributions

    @property
    def top_counts(self) -> List[int]:
        return [count for count, _ in self.contributions]

    @property
    def weights(self) -> List[float]:
        return [weight for _, weight in self.contributions]


def uniform_specification(*top_counts: int) -> ScoreContributionSpecification:
    """Create a specification in which every top-x contribution has equal weight.

    Args:
        top_counts: each value x adds a top-x contribution with weight 1.0
    """
    return ScoreContributionSpecification(
        contributions=[(count, 1.0) for count in top_counts])


def compute_global_score(contribution_specification: ScoreContributionSpecification,
                         scores: List[float]) -> Tuple[float, Dict[str, float]]:
    """Compute the weighted global score and the individual top-x averages.

    Args:
        contribution_specification: which top-x terms to average, with weights
        scores: all scores; must contain at least max(top_counts) entries

    Returns:
        Tuple of (global score, dict mapping 'top_x' to the mean of the best x scores)
    """
    ranked = sorted(scores, reverse=True)

    weighted_sum = 0.0
    top_x_dict: Dict[str, float] = {}
    for top_count, weight in contribution_specification.contributions:
        mean_of_top = sum(ranked[:top_count]) / top_count
        top_x_dict[f'top_{top_count}'] = mean_of_top
        weighted_sum += mean_of_top * weight

    return weighted_sum / sum(contribution_specification.weights), top_x_dict
15 | 16 | """GLUE dataset.""" 17 | 18 | from abc import ABC 19 | from abc import abstractmethod 20 | 21 | from torch.utils.data import Dataset 22 | 23 | from megatron import print_rank_0 24 | from tasks.data_utils import build_sample 25 | from tasks.data_utils import build_tokens_types_paddings_from_text 26 | 27 | 28 | class GLUEAbstractDataset(ABC, Dataset): 29 | """GLUE base dataset class.""" 30 | 31 | def __init__(self, task_name, dataset_name, datapaths, 32 | tokenizer, max_seq_length): 33 | # Store inputs. 34 | self.task_name = task_name 35 | self.dataset_name = dataset_name 36 | self.tokenizer = tokenizer 37 | self.max_seq_length = max_seq_length 38 | print_rank_0(' > building {} dataset for {}:'.format(self.task_name, 39 | self.dataset_name)) 40 | # Process the files. 41 | string = ' > paths:' 42 | for path in datapaths: 43 | string += ' ' + path 44 | print_rank_0(string) 45 | self.samples = [] 46 | for datapath in datapaths: 47 | self.samples.extend(self.process_samples_from_single_path(datapath)) 48 | print_rank_0(' >> total number of samples: {}'.format( 49 | len(self.samples))) 50 | 51 | def __len__(self): 52 | return len(self.samples) 53 | 54 | def __getitem__(self, idx): 55 | raw_sample = self.samples[idx] 56 | ids, types, paddings = build_tokens_types_paddings_from_text( 57 | raw_sample['text_a'], raw_sample['text_b'], 58 | self.tokenizer, self.max_seq_length) 59 | sample = build_sample(ids, types, paddings, 60 | raw_sample['label'], raw_sample['uid']) 61 | return sample 62 | 63 | @abstractmethod 64 | def process_samples_from_single_path(self, datapath): 65 | """Abstract method that takes a single path / filename and 66 | returns a list of dataset samples, each sample being a dict of 67 | {'text_a': string, 'text_b': string, 'label': int, 'uid': int} 68 | """ 69 | pass 70 | -------------------------------------------------------------------------------- 
import json
import time
import sys


if __name__ == '__main__':

    # argv: <url groups file> <input jsonl> <output jsonl>
    url_filename = sys.argv[1]
    data_filename = sys.argv[2]
    output_filename = sys.argv[3]

    # Every URL after the first entry of each group is a duplicate to drop.
    urls = set()
    with open(url_filename, 'r') as f:
        for line in f:
            for this_urls in json.loads(line).values():
                urls.update(this_urls[1:])
    print('will be removing {} urls'.format(len(urls)), flush=True)

    written_docs = 0
    removed_docs = 0
    removed_chars = 0
    start_time = time.time()
    with open(output_filename, 'wb') as fout, open(data_filename, 'r') as fin:
        for line in fin:
            try:
                myjson = json.loads(line)
                if myjson['url'] in urls:
                    # Duplicate group member: skip it, tracking stats.
                    print('removing', myjson)
                    removed_docs += 1
                    removed_chars += len(myjson['text'])
                    continue
                fout.write(json.dumps(myjson, ensure_ascii=False).encode('utf-8'))
                fout.write('\n'.encode('utf-8'))
                written_docs += 1
                if written_docs % 10000 == 0:
                    print(' [PROCESSED] time (s): {:.2f} | written: {} '
                          '| removed: {} (char: {})'.format(
                              time.time() - start_time,
                              written_docs, removed_docs, removed_chars))
            except Exception as e:
                # Malformed line or missing key: report and keep going.
                print('[SKIPPING]', line, e)

    print(' [PROCESSED] time (s): {:.2f} | written: {} '
          '| removed: {} (char: {})'.format(
              time.time() - start_time,
              written_docs, removed_docs, removed_chars))
    print('done :-)')
15 | 16 | """Detokenization.""" 17 | 18 | import re 19 | 20 | 21 | def ptb_detokenizer(string): 22 | string = string.replace(" '", "'") 23 | string = string.replace(" \n", "\n") 24 | string = string.replace("\n ", "\n") 25 | string = string.replace(" n't", "n't") 26 | string = string.replace(" N ", "1 ") 27 | string = string.replace("$ 1", "$1") 28 | string = string.replace("# 1", "#1") 29 | return string 30 | 31 | 32 | def wikitext_detokenizer(string): 33 | # contractions 34 | string = string.replace("s '", "s'") 35 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 36 | # number separators 37 | string = string.replace(" @-@ ", "-") 38 | string = string.replace(" @,@ ", ",") 39 | string = string.replace(" @.@ ", ".") 40 | # punctuation 41 | string = string.replace(" : ", ": ") 42 | string = string.replace(" ; ", "; ") 43 | string = string.replace(" . ", ". ") 44 | string = string.replace(" ! ", "! ") 45 | string = string.replace(" ? ", "? ") 46 | string = string.replace(" , ", ", ") 47 | # double brackets 48 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 49 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 50 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 51 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 52 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 53 | # miscellaneous 54 | string = string.replace("= = = =", "====") 55 | string = string.replace("= = =", "===") 56 | string = string.replace("= =", "==") 57 | string = string.replace(" " + chr(176) + " ", chr(176)) 58 | string = string.replace(" \n", "\n") 59 | string = string.replace("\n ", "\n") 60 | string = string.replace(" N ", " 1 ") 61 | string = string.replace(" 's", "'s") 62 | 63 | return string 64 | 65 | 66 | def lambada_detokenizer(string): 67 | return string 68 | 69 | 70 | _DETOKENIZERS = { 71 | 'ptb': ptb_detokenizer, 72 | 'wiki': wikitext_detokenizer, 73 | 'lambada': lambada_detokenizer, 74 | } 75 | 76 | 77 | def get_detokenizer(path): 78 | for 
import argparse
import os
import random

import numpy
import torch

import mpu


class IdentityLayer(torch.nn.Module):
    """Trivial module whose forward() simply returns its weight parameter."""

    def __init__(self, size, scale=1.0):
        super(IdentityLayer, self).__init__()
        self.weight = torch.nn.Parameter(scale * torch.randn(size))

    def forward(self):
        # No input: the "output" is the parameter itself.
        return self.weight


def set_random_seed(seed):
    """Seed python, numpy, torch and the model-parallel cuda RNG
    for reproducibility."""
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    mpu.model_parallel_cuda_manual_seed(seed)


def initialize_distributed(backend='nccl'):
    """Initialize torch.distributed from the usual launcher environment."""
    # --local_rank is supplied by torch.distributed.launch; may be absent.
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', type=int, default=None,
                        help='local rank passed from distributed launcher')
    local_rank = parser.parse_args().local_rank

    # Rank and world size come from the environment.
    rank = int(os.getenv('RANK', '0'))
    world_size = int(os.getenv("WORLD_SIZE", '1'))

    print('> initializing torch.distributed with local rank: {}, '
          'rank: {}, world size: {}'.format(local_rank, rank, world_size))

    # Default: round-robin over visible GPUs; prefer the launcher-provided
    # local rank when present.
    device = rank % torch.cuda.device_count()
    if local_rank is not None:
        device = local_rank
    torch.cuda.set_device(device)

    # TCP rendezvous via MASTER_ADDR / MASTER_PORT.
    init_method = 'tcp://{}:{}'.format(os.getenv('MASTER_ADDR', 'localhost'),
                                       os.getenv('MASTER_PORT', '6000'))
    torch.distributed.init_process_group(
        backend=backend,
        world_size=world_size,
        rank=rank,
        init_method=init_method)


def print_separator(message):
    """Rank 0 prints *message* centered inside a dashed rule; all ranks sync."""
    torch.distributed.barrier()
    pad = '-' * ((78 - len(message)) // 2)
    banner = '\n' + pad + ' {} '.format(message) + pad
    if torch.distributed.get_rank() == 0:
        print(banner, flush=True)
    torch.distributed.barrier()
"""Main tasks functionality."""

import os
import sys
# Make the parent directory importable so `megatron` resolves when this
# script is run directly from the tasks/ directory.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
                                             os.path.pardir)))

from megatron import get_args
from megatron.initialize import initialize_megatron


def get_tasks_args(parser):
    """Provide extra arguments required for tasks.

    Adds the task-specific command-line options to *parser* and returns it,
    as expected by initialize_megatron(extra_args_provider=...).
    """
    group = parser.add_argument_group(title='tasks')

    group.add_argument('--task', type=str, required=True,
                       help='Task name.')
    # Fixed typo "finetunning" -> "finetuning" in the two help strings below.
    group.add_argument('--epochs', type=int, default=None,
                       help='Number of finetuning epochs. Zero results in '
                       'evaluation only.')
    group.add_argument('--pretrained-checkpoint', type=str, default=None,
                       help='Pretrained checkpoint used for finetuning.')
    # Fixed missing space ("...incomplete) inthe data loader").
    group.add_argument('--keep-last', action='store_true',
                       help='Keep the last batch (maybe incomplete) in '
                       'the data loader')
    group.add_argument('--train-data', nargs='+', default=None,
                       help='Whitespace separated paths or corpora names '
                       'for training.')
    group.add_argument('--valid-data', nargs='*', default=None,
                       help='path(s) to the validation data.')
    group.add_argument('--overlapping-eval', type=int, default=32,
                       help='Sliding window for overlapping evaluation.')
    group.add_argument('--strict-lambada', action='store_true',
                       help='Use more difficult formulation of lambada.')

    return parser


if __name__ == '__main__':

    initialize_megatron(extra_args_provider=get_tasks_args)

    args = get_args()
    # Import lazily so only the selected task's dependencies are loaded.
    if args.task == 'RACE':
        from race.finetune import main
    elif args.task in ['MNLI', 'QQP']:
        from glue.finetune import main
    elif args.task in ['LAMBADA', 'WIKITEXT103']:
        from zeroshot_gpt2.evaluate import main
    else:
        raise NotImplementedError('Task {} is not implemented.'.format(
            args.task))

    main()
import argparse
import os
import random

import numpy
import torch

import mpu


class IdentityLayer(torch.nn.Module):
    """Module exposing a single learnable weight; forward() returns it."""

    def __init__(self, size, scale=1.0):
        super(IdentityLayer, self).__init__()
        self.weight = torch.nn.Parameter(scale * torch.randn(size))

    def forward(self):
        return self.weight


def set_random_seed(seed):
    """Seed every RNG (python, numpy, torch, model-parallel cuda)."""
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    mpu.model_parallel_cuda_manual_seed(seed)


def initialize_distributed(backend='nccl'):
    """Initialize torch.distributed using launcher-provided environment."""
    # Pick up --local_rank if the distributed launcher passed it.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--local_rank', type=int, default=None,
                            help='local rank passed from distributed launcher')
    local_rank = arg_parser.parse_args().local_rank

    rank = int(os.getenv('RANK', '0'))
    world_size = int(os.getenv("WORLD_SIZE", '1'))

    print(f'> initializing torch.distributed with local rank: {local_rank}, '
          f'rank: {rank}, world size: {world_size}')

    # Device id: round-robin unless the launcher told us which GPU to use.
    device = rank % torch.cuda.device_count()
    if local_rank is not None:
        device = local_rank
    torch.cuda.set_device(device)

    # TCP rendezvous address from MASTER_ADDR / MASTER_PORT.
    host = os.getenv('MASTER_ADDR', 'localhost')
    port = os.getenv('MASTER_PORT', '6000')
    torch.distributed.init_process_group(
        backend=backend,
        world_size=world_size,
        rank=rank,
        init_method=f'tcp://{host}:{port}')


def print_separator(message):
    """Print *message* framed by dashes on rank 0; barrier on all ranks."""
    torch.distributed.barrier()
    side = '-' * ((78 - len(message)) // 2)
    if torch.distributed.get_rank() == 0:
        print('\n' + side + f' {message} ' + side, flush=True)
    torch.distributed.barrier()
import torch


def ensure_divisibility(numerator, denominator):
    """Assert that *numerator* is an exact multiple of *denominator*."""
    assert numerator % denominator == 0, '{} is not divisible by {}'.format(
        numerator, denominator)


def divide(numerator, denominator):
    """Integer-divide *numerator* by *denominator*, asserting divisibility."""
    ensure_divisibility(numerator, denominator)
    return numerator // denominator


def split_tensor_along_last_dim(tensor, num_partitions,
                                contiguous_split_chunks=False):
    """Split *tensor* into *num_partitions* equal chunks along its last dim.

    Arguments:
        tensor: input tensor.
        num_partitions: number of partitions to split the tensor into.
        contiguous_split_chunks: if True, make each chunk contiguous
            in memory.
    """
    dim = tensor.dim() - 1
    chunk_size = divide(tensor.size()[dim], num_partitions)
    chunks = torch.split(tensor, chunk_size, dim=dim)
    # torch.split returns views; copy them only when the caller asks.
    if contiguous_split_chunks:
        return tuple(chunk.contiguous() for chunk in chunks)

    return chunks


class VocabUtility:
    """Split the vocabulary into `world_size` chunks and return the first
    and last index of the chunk owned by `rank`. Ranges are half-open:
    [first, last)."""

    @staticmethod
    def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
                                                  rank, world_size):
        start = rank * per_partition_vocab_size
        return start, start + per_partition_vocab_size

    @staticmethod
    def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
        per_partition_vocab_size = divide(global_vocab_size, world_size)
        return VocabUtility.vocab_range_from_per_partition_vocab_size(
            per_partition_vocab_size, rank, world_size)
import torch


def ensure_divisibility(numerator, denominator):
    """Raise AssertionError unless *denominator* divides *numerator* exactly."""
    assert numerator % denominator == 0, '{} is not divisible by {}'.format(
        numerator, denominator)


def divide(numerator, denominator):
    """Return numerator // denominator after checking exact divisibility."""
    ensure_divisibility(numerator, denominator)
    return numerator // denominator


def split_tensor_along_last_dim(tensor, num_partitions,
                                contiguous_split_chunks=False):
    """Split *tensor* along its final dimension into equal partitions.

    Arguments:
        tensor: input tensor.
        num_partitions: number of partitions to split the tensor into.
        contiguous_split_chunks: if True, each returned chunk is made
            contiguous in memory.
    """
    last_dim = tensor.dim() - 1
    partition_size = divide(tensor.size()[last_dim], num_partitions)
    pieces = torch.split(tensor, partition_size, dim=last_dim)
    if contiguous_split_chunks:
        # torch.split yields views; copy them into contiguous storage.
        pieces = tuple(piece.contiguous() for piece in pieces)
    return pieces


class VocabUtility:
    """Partition the vocabulary across `world_size` ranks.

    Each rank owns the half-open index range [first, last) returned by
    the helpers below.
    """

    @staticmethod
    def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
                                                  rank, world_size):
        first = rank * per_partition_vocab_size
        last = first + per_partition_vocab_size
        return first, last

    @staticmethod
    def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
        per_partition_vocab_size = divide(global_vocab_size, world_size)
        return VocabUtility.vocab_range_from_per_partition_vocab_size(
            per_partition_vocab_size, rank, world_size)
"""Utilities for models."""

import math

import torch

from .transformer import LayerNorm


def init_method_normal(sigma):
    """Return an initializer that draws from N(0, sigma)."""
    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)

    return init_


def scaled_init_method_normal(sigma, num_layers):
    """Return an initializer drawing from N(0, sigma / sqrt(2 * num_layers))."""
    std = sigma / math.sqrt(2.0 * num_layers)

    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=std)

    return init_


def get_linear_layer(rows, columns, init_method):
    """Linear layer with custom weight initialization and zeroed bias."""
    layer = torch.nn.Linear(rows, columns)
    init_method(layer.weight)
    with torch.no_grad():
        layer.bias.zero_()
    return layer


@torch.jit.script
def gelu_impl(x):
    """OpenAI's gelu implementation (tanh approximation)."""
    return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
                                       (1.0 + 0.044715 * x * x)))


def openai_gelu(x):
    return gelu_impl(x)


@torch.jit.script
def erf_gelu(x):
    # Python equivalent of torch.nn.functional.gelu(), typed so the
    # ONNX exporter can handle it.
    return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)
                      + torch.ones_like(x).to(dtype=x.dtype))


def get_params_for_weight_decay_optimization(module):
    """Divide params into with-weight-decay and without-weight-decay groups.

    LayerNorm parameters and all biases get no weight decay; every other
    parameter does.
    """
    decay = {'params': []}
    no_decay = {'params': [], 'weight_decay': 0.0}
    for submodule in module.modules():
        params = submodule._parameters
        if isinstance(submodule, LayerNorm):
            no_decay['params'].extend(
                p for p in params.values() if p is not None)
        else:
            for name, p in params.items():
                if p is None:
                    continue
                target = no_decay if name == 'bias' else decay
                target['params'].append(p)

    return decay, no_decay
"""Utilities for models."""

import math

import torch

from .transformer import LayerNorm


def init_method_normal(sigma):
    """Build an in-place initializer sampling from N(0, sigma)."""
    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)

    return init_


def scaled_init_method_normal(sigma, num_layers):
    """Build an initializer sampling from N(0, sigma / sqrt(2 * num_layers))."""
    std = sigma / math.sqrt(2.0 * num_layers)

    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=std)

    return init_


def get_linear_layer(rows, columns, init_method):
    """Create a torch.nn.Linear, apply *init_method* to its weight and
    zero its bias."""
    layer = torch.nn.Linear(rows, columns)
    init_method(layer.weight)
    with torch.no_grad():
        layer.bias.zero_()
    return layer


@torch.jit.script
def gelu_impl(x):
    """OpenAI's gelu implementation."""
    return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
                                       (1.0 + 0.044715 * x * x)))


def openai_gelu(x):
    return gelu_impl(x)


@torch.jit.script
def erf_gelu(x):
    # Exact (erf-based) gelu, equivalent to torch.nn.functional.gelu();
    # written out so the ONNX exporter can trace it.
    return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)
                      + torch.ones_like(x).to(dtype=x.dtype))


def get_params_for_weight_decay_optimization(module):
    """Split parameters into weight-decay and no-weight-decay groups.

    LayerNorm parameters and all biases are exempt from weight decay.
    """
    with_decay = {'params': []}
    without_decay = {'params': [], 'weight_decay': 0.0}
    for sub in module.modules():
        if isinstance(sub, LayerNorm):
            without_decay['params'].extend(
                [p for p in sub._parameters.values() if p is not None])
            continue
        for name, param in sub._parameters.items():
            if param is None:
                continue
            if name == 'bias':
                without_decay['params'].append(param)
            else:
                with_decay['params'].append(param)

    return with_decay, without_decay
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "THC/THC.h" 23 | #include 24 | #include 25 | #include "scaled_upper_triang_masked_softmax.h" 26 | 27 | namespace multihead_attn { 28 | namespace fused_softmax { 29 | namespace scaled_upper_triang_masked_softmax { 30 | 31 | torch::Tensor fwd_cuda( 32 | torch::Tensor const& input, 33 | float scale_factor) 34 | { 35 | // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] 36 | const int attn_batches = input.size(0); 37 | const int seq_len = input.size(1); 38 | TORCH_INTERNAL_ASSERT(seq_len <= 2048); 39 | 40 | // Output 41 | auto act_options = input.options().requires_grad(false); 42 | torch::Tensor softmax_results = 43 | torch::empty({attn_batches, seq_len, seq_len}, act_options); 44 | 45 | // Softmax Intermediate Result Ptr 46 | void* input_ptr = static_cast(input.data_ptr()); 47 | void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); 48 | 49 | dispatch_scaled_upper_triang_masked_softmax_forward( 50 | reinterpret_cast(softmax_results_ptr), 51 | reinterpret_cast(input_ptr), 52 | scale_factor, 53 | seq_len, 54 | seq_len, 55 | attn_batches); 56 | return softmax_results; 57 | } 58 | 59 | torch::Tensor bwd_cuda( 60 | torch::Tensor const& output_grads_, 61 | torch::Tensor const& softmax_results_, 62 | float scale_factor) { 63 | 64 | auto output_grads = output_grads_.contiguous(); 65 | auto softmax_results = softmax_results_.contiguous(); 66 | 67 | //output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] 68 | const int attn_batches = output_grads.size(0); 69 | const int seq_len = output_grads.size(1); 70 | TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2)); 71 | 72 | void* output_grads_ptr = static_cast(output_grads.data_ptr()); 73 | 74 | //Softmax Grad 75 | dispatch_scaled_upper_triang_masked_softmax_backward( 76 | reinterpret_cast(output_grads_ptr), 77 | reinterpret_cast(output_grads_ptr), 78 | 
reinterpret_cast(softmax_results.data_ptr()), 79 | scale_factor, 80 | seq_len, 81 | seq_len, 82 | attn_batches); 83 | 84 | //backward pass is completely in-place 85 | return output_grads; 86 | } 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/mpu/tests/test_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from commons import print_separator 17 | from commons import initialize_distributed 18 | from mpu import data as data_utils 19 | import mpu 20 | import torch 21 | import functools 22 | import operator 23 | import sys 24 | sys.path.append("../..") 25 | 26 | 27 | def test_boradcast_data(model_parallel_size): 28 | 29 | if torch.distributed.get_rank() == 0: 30 | print('> testing boradcast_data with model parallel size {} ...'. 
def test_boradcast_data(model_parallel_size):
    """Exercise mpu.data broadcast helpers at the given model-parallel size."""

    if torch.distributed.get_rank() == 0:
        print('> testing boradcast_data with model parallel size {} ...'.
              format(model_parallel_size))

    mpu.initialize_model_parallel(model_parallel_size)
    torch.manual_seed(1234 + mpu.get_data_parallel_rank())
    model_parallel_size = mpu.get_model_parallel_world_size()

    # Expected int64 tensor shapes, keyed by name.
    expected_shapes = {'key1': [7, 11],
                       'key2': [8, 2, 1],
                       'key3': [13],
                       'key4': [5, 1, 2],
                       'key5': [5, 12]}
    keys = list(expected_shapes.keys())

    data = {}
    data_t = {}
    for key, shape in expected_shapes.items():
        data[key] = torch.LongTensor(size=shape).random_(0, 1000)
        data_t[key] = data[key].clone()
    # Extra float entry deliberately excluded from the broadcast key list.
    data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
    data_t['keyX'] = data['keyX'].clone()
    # Only model-parallel rank 0 holds the data to be broadcast.
    if mpu.get_model_parallel_rank() != 0:
        data = None

    data_utils._check_data_types(keys, data_t, torch.int64)
    key_size, key_numel, total_numel = \
        data_utils._build_key_size_numel_dictionaries(keys, data)

    expected_total = 0
    for key in keys:
        assert key_size[key] == expected_shapes[key]
        numel = functools.reduce(operator.mul, expected_shapes[key], 1)
        assert key_numel[key] == numel
        expected_total += numel
    assert total_numel == expected_total

    data_b = data_utils.broadcast_data(keys, data, torch.int64)
    for key in keys:
        reference = data_t[key].cuda()
        assert data_b[key].sub(reference).abs().max() == 0

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


if __name__ == '__main__':

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    # Double the model-parallel size each round, up to the world size.
    model_parallel_size = 1
    while model_parallel_size <= world_size:
        print_separator('test test boradcast data')
        test_boradcast_data(model_parallel_size)
        model_parallel_size *= 2
import sys
# Extend the path BEFORE the project imports below: the original appended
# "../.." after `from mpu import ...`, which defeats the purpose of the hack.
sys.path.append("../..")

import functools
import operator

import torch

from commons import print_separator
from commons import initialize_distributed
from mpu import data as data_utils
import mpu


def test_boradcast_data(model_parallel_size):
    """Check mpu.data size/numel bookkeeping and tensor broadcast for the
    given model-parallel size."""

    if torch.distributed.get_rank() == 0:
        print('> testing boradcast_data with model parallel size {} ...'.
              format(model_parallel_size))

    mpu.initialize_model_parallel(model_parallel_size)
    torch.manual_seed(1234 + mpu.get_data_parallel_rank())
    model_parallel_size = mpu.get_model_parallel_world_size()

    # Expected int64 tensor shapes, keyed by name.
    key_size_t = {'key1': [7, 11],
                  'key2': [8, 2, 1],
                  'key3': [13],
                  'key4': [5, 1, 2],
                  'key5': [5, 12]}
    keys = list(key_size_t.keys())

    data = {}
    data_t = {}
    for key in key_size_t:
        data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
        data_t[key] = data[key].clone()
    # Extra float entry deliberately excluded from the broadcast key list.
    data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
    data_t['keyX'] = data['keyX'].clone()
    # Only model-parallel rank 0 holds the data to be broadcast.
    if mpu.get_model_parallel_rank() != 0:
        data = None

    data_utils._check_data_types(keys, data_t, torch.int64)
    key_size, key_numel, \
        total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
    for key in keys:
        assert key_size[key] == key_size_t[key]
    total_numel_t = 0
    for key in keys:
        target_size = functools.reduce(operator.mul, key_size_t[key], 1)
        assert key_numel[key] == target_size
        total_numel_t += target_size
    assert total_numel == total_numel_t

    data_b = data_utils.broadcast_data(keys, data, torch.int64)
    for key in keys:
        tensor = data_t[key].cuda()
        assert data_b[key].sub(tensor).abs().max() == 0

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


if __name__ == '__main__':

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    # Double the model-parallel size each round, up to the world size.
    model_parallel_size = 1
    while model_parallel_size <= world_size:
        print_separator('test test boradcast data')
        test_boradcast_data(model_parallel_size)
        model_parallel_size *= 2
"""GLUE finetuning/evaluation."""

from megatron import get_args
from megatron import print_rank_0
from megatron import get_tokenizer
from megatron.model.classification import Classification
from tasks.eval_utils import accuracy_func_provider
from tasks.finetune_utils import finetune


def glue_classification(num_classes, Dataset,
                        name_from_datapath_func):
    """Finetune/evaluate a GLUE classification task for a concrete Dataset."""

    def train_valid_datasets_provider():
        """Build train and validation datasets."""
        args = get_args()
        tokenizer = get_tokenizer()

        return (Dataset('training', args.train_data,
                        tokenizer, args.seq_length),
                Dataset('validation', args.valid_data,
                        tokenizer, args.seq_length))

    def model_provider():
        """Build the classification model."""
        args = get_args()

        print_rank_0('building classification model for {} ...'.format(
            args.task))

        return Classification(num_classes=num_classes, num_tokentypes=2)

    def metrics_func_provider():
        """Provide the accuracy metrics callback."""
        def single_dataset_provider(datapath):
            args = get_args()
            tokenizer = get_tokenizer()

            name = name_from_datapath_func(datapath)
            return Dataset(name, [datapath], tokenizer, args.seq_length)
        return accuracy_func_provider(single_dataset_provider)

    # Finetune/evaluate.
    finetune(train_valid_datasets_provider, model_provider,
             end_of_epoch_callback_provider=metrics_func_provider)


def main():
    args = get_args()

    # Each task binds its class count, Dataset class, and a helper that
    # derives a display name from a data file path.
    if args.task == 'MNLI':

        num_classes = 3
        from tasks.glue.mnli import MNLIDataset as Dataset

        def name_from_datapath(datapath):
            return datapath.split('MNLI')[-1].strip(
                '.tsv').strip('/').replace('_', '-')

    elif args.task == 'QQP':

        num_classes = 2
        from tasks.glue.qqp import QQPDataset as Dataset

        def name_from_datapath(datapath):
            return datapath.split('QQP')[-1].strip(
                '.tsv').strip('/').replace('_', '-')

    else:
        raise NotImplementedError('GLUE task {} is not implemented.'.format(
            args.task))

    glue_classification(num_classes, Dataset, name_from_datapath)
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Setup for pip package for Megatron.""" 17 | 18 | import os 19 | import sys 20 | import setuptools 21 | 22 | if sys.version_info < (3,): 23 | raise Exception("Python 2 is not supported by Megatron.") 24 | 25 | from megatron.package_info import ( 26 | __description__, 27 | __contact_names__, 28 | __url__, 29 | __download_url__, 30 | __keywords__, 31 | __license__, 32 | __package_name__, 33 | __version__, 34 | ) 35 | 36 | with open("README.md", "r") as fh: 37 | long_description = fh.read() 38 | 39 | ############################################################################### 40 | # Dependency Loading # 41 | # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # 42 | 43 | 44 | def req_file(filename): 45 | with open(filename) as f: 46 | content = f.readlines() 47 | return [x.strip() for x in content] 48 | 49 | 50 | install_requires = req_file("requirements.txt") 51 | 52 | setuptools.setup( 53 | name=__package_name__, 54 | # Versions should comply with PEP440. For a discussion on single-sourcing 55 | # the version across setup.py and the project code, see 56 | # https://packaging.python.org/en/latest/single_source_version.html 57 | version=__version__, 58 | description=__description__, 59 | long_description=long_description, 60 | long_description_content_type="text/markdown", 61 | # The project's main homepage. 
62 | url=__url__, 63 | author=__contact_names__, 64 | maintainer=__contact_names__, 65 | # The licence under which the project is released 66 | license=__license__, 67 | classifiers=[ 68 | 'Intended Audience :: Developers', 69 | 'Intended Audience :: Science/Research', 70 | 'Intended Audience :: Information Technology', 71 | # Indicate what your project relates to 72 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 73 | 'Topic :: Software Development :: Libraries :: Python Modules', 74 | # Supported python versions 75 | 'Programming Language :: Python :: 3.6', 76 | 'Programming Language :: Python :: 3.7', 77 | 'Programming Language :: Python :: 3.8', 78 | # Additional Setting 79 | 'Environment :: Console', 80 | 'Natural Language :: English', 81 | 'Operating System :: OS Independent', 82 | ], 83 | python_requires='>=3.6', 84 | packages=setuptools.find_packages(), 85 | install_requires=install_requires, 86 | # Add in any packaged data. 87 | include_package_data=True, 88 | zip_safe=False, 89 | # PyPI package information. 90 | keywords=__keywords__ 91 | ) 92 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/tasks/glue/mnli.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """MNLI dataset.""" 17 | 18 | from megatron import print_rank_0 19 | from tasks.data_utils import clean_text 20 | from .data import GLUEAbstractDataset 21 | 22 | 23 | LABELS = {'contradiction': 0, 'entailment': 1, 'neutral': 2} 24 | 25 | 26 | class MNLIDataset(GLUEAbstractDataset): 27 | 28 | def __init__(self, name, datapaths, tokenizer, max_seq_length, 29 | test_label='contradiction'): 30 | self.test_label = test_label 31 | super().__init__('MNLI', name, datapaths, 32 | tokenizer, max_seq_length) 33 | 34 | def process_samples_from_single_path(self, filename): 35 | """"Implement abstract method.""" 36 | print_rank_0(' > Processing {} ...'.format(filename)) 37 | 38 | samples = [] 39 | total = 0 40 | first = True 41 | is_test = False 42 | with open(filename, 'r') as f: 43 | for line in f: 44 | row = line.strip().split('\t') 45 | if first: 46 | first = False 47 | if len(row) == 10: 48 | is_test = True 49 | print_rank_0( 50 | ' reading {}, {} and {} columns and setting ' 51 | 'labels to {}'.format( 52 | row[0].strip(), row[8].strip(), 53 | row[9].strip(), self.test_label)) 54 | else: 55 | print_rank_0(' reading {} , {}, {}, and {} columns ' 56 | '...'.format( 57 | row[0].strip(), row[8].strip(), 58 | row[9].strip(), row[-1].strip())) 59 | continue 60 | 61 | text_a = clean_text(row[8].strip()) 62 | text_b = clean_text(row[9].strip()) 63 | unique_id = int(row[0].strip()) 64 | label = row[-1].strip() 65 | if is_test: 66 | label = self.test_label 67 | 68 | assert len(text_a) > 0 69 | assert len(text_b) > 0 70 | assert label in LABELS 71 | assert unique_id >= 0 72 | 73 | sample = {'text_a': text_a, 74 | 'text_b': text_b, 75 | 'label': LABELS[label], 76 | 'uid': unique_id} 77 | total += 1 78 | samples.append(sample) 79 | 80 | if total % 50000 == 0: 81 | print_rank_0(' > processed {} so far ...'.format(total)) 82 | 83 | print_rank_0(' >> processed {} 
samples.'.format(len(samples))) 84 | return samples 85 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/utils/sampling_helpers.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/BenevolentAI/guacamol/blob/master/guacamol/utils/sampling_helper.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_GUACAMOL). 7 | # --------------------------------------------------------------- 8 | 9 | from typing import List, Set 10 | 11 | from guacamol.guacamol.distribution_matching_generator import DistributionMatchingGenerator 12 | from guacamol.guacamol.utils.chemistry import is_valid, canonicalize 13 | 14 | 15 | def sample_valid_molecules(model: DistributionMatchingGenerator, number_molecules: int, max_tries=10) -> List[str]: 16 | """ 17 | Sample from the given generator until the desired number of valid molecules 18 | has been sampled (i.e., ignore invalid molecules). 19 | 20 | Args: 21 | model: model to sample from 22 | number_molecules: number of valid molecules to generate 23 | max_tries: determines the maximum number N of samples to draw, N = number_molecules * max_tries 24 | 25 | Returns: 26 | A list of number_molecules valid molecules. If this was not possible with the given max_tries, the list may be shorter. 
27 | """ 28 | 29 | max_samples = max_tries * number_molecules 30 | number_already_sampled = 0 31 | 32 | valid_molecules: List[str] = [] 33 | 34 | while len(valid_molecules) < number_molecules and number_already_sampled < max_samples: 35 | remaining_to_sample = number_molecules - len(valid_molecules) 36 | 37 | samples = model.generate(remaining_to_sample) 38 | number_already_sampled += remaining_to_sample 39 | 40 | valid_molecules += [m for m in samples if is_valid(m)] 41 | 42 | return valid_molecules 43 | 44 | 45 | def sample_unique_molecules(model: DistributionMatchingGenerator, number_molecules: int, max_tries=10) -> List[str]: 46 | """ 47 | Sample from the given generator until the desired number of unique (distinct) molecules 48 | has been sampled (i.e., ignore duplicate molecules). 49 | 50 | Args: 51 | model: model to sample from 52 | number_molecules: number of unique (distinct) molecules to generate 53 | max_tries: determines the maximum number N of samples to draw, N = number_molecules * max_tries 54 | 55 | Returns: 56 | A list of number_molecules unique molecules, in canonalized form. 57 | If this was not possible with the given max_tries, the list may be shorter. 58 | The generation order is kept. 
59 | """ 60 | 61 | max_samples = max_tries * number_molecules 62 | number_already_sampled = 0 63 | 64 | unique_list: List[str] = [] 65 | unique_set: Set[str] = set() 66 | 67 | while len(unique_list) < number_molecules and number_already_sampled < max_samples: 68 | remaining_to_sample = number_molecules - len(unique_list) 69 | 70 | samples = model.generate(remaining_to_sample) 71 | number_already_sampled += remaining_to_sample 72 | 73 | for smiles in samples: 74 | canonical_smiles = canonicalize(smiles) 75 | if canonical_smiles is not None and canonical_smiles not in unique_set: 76 | unique_set.add(canonical_smiles) 77 | unique_list.append(canonical_smiles) 78 | 79 | # this should always be True 80 | assert len(unique_set) == len(unique_list) 81 | 82 | return unique_list 83 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/tools/openwebtext/group_duplicates_url.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import json 17 | import time 18 | import sys 19 | 20 | 21 | def is_similar(jaccard_similarity): 22 | return (js >= 0.9) 23 | 24 | 25 | if __name__ == '__main__': 26 | 27 | 28 | print('grouping duplicate urls ...') 29 | 30 | input = sys.argv[1] 31 | output = sys.argv[2] 32 | 33 | url_to_index = {} 34 | index_to_urls = [] 35 | counter = 0 36 | start_time = time.time() 37 | with open(input, 'r') as f: 38 | for line in f: 39 | counter += 1 40 | myjson = json.loads(line) 41 | urls = [] 42 | for main_url in myjson.keys(): 43 | urls.append(main_url) 44 | for value in myjson[main_url]: 45 | for other_url, js in value.items(): 46 | if is_similar(js): 47 | urls.append(other_url) 48 | current_index = -1 49 | other_indices = set() 50 | for url in urls: 51 | if url in url_to_index: 52 | if current_index == -1: 53 | current_index = url_to_index[url] 54 | elif current_index != url_to_index[url]: 55 | other_indices.add(url_to_index[url]) 56 | if current_index == -1: 57 | current_index = len(index_to_urls) 58 | index_to_urls.append(set()) 59 | for url in urls: 60 | url_to_index[url] = current_index 61 | index_to_urls[current_index].add(url) 62 | for index in other_indices: 63 | for url in index_to_urls[index]: 64 | index_to_urls[current_index].add(url) 65 | url_to_index[url] = current_index 66 | index_to_urls[index] = None 67 | 68 | if counter % 100000 == 0: 69 | print(' > processed {} lines in {} seconds ...'.format( 70 | counter, time.time() - start_time)) 71 | 72 | 73 | total_remove = 0 74 | total_remain = 0 75 | for urls in index_to_urls: 76 | if urls is not None: 77 | if len(urls) > 1: 78 | total_remove += (len(urls) - 1) 79 | total_remain += 1 80 | print('out of {} urls, only {} are unique and {} should be removed'.format( 81 | total_remove+total_remain, total_remain, total_remove)) 82 | 83 | with open(output, 'wb') as f: 84 | for i, urls in enumerate(index_to_urls): 85 | if urls is not None: 86 | if len(urls) > 1: 87 | myjson = json.dumps({str(i): list(urls)}, 88 | 
ensure_ascii=False) 89 | f.write(myjson.encode('utf-8')) 90 | f.write('\n'.encode('utf-8')) 91 | -------------------------------------------------------------------------------- /MolBART/eval_megatron_retrieval_controlled.sh: -------------------------------------------------------------------------------- 1 | GPUS_PER_NODE=1 # 4 2 | # Change for multinode config 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=6003 5 | NNODES=1 6 | NODE_RANK=0 7 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 8 | 9 | export DLWS_NUM_WORKER=${NNODES} 10 | export DLWS_NUM_GPU_PER_WORKER=${GPUS_PER_NODE} 11 | 12 | script_path=$(realpath $0) 13 | script_dir=$(dirname $script_path) 14 | # config_json="$script_dir/megatron_molbart/ds_config.json" 15 | config_json="megatron_molbart/ds_config.json" 16 | 17 | #ZeRO Configs 18 | stage=1 19 | reduce_scatter=true 20 | contigious_gradients=false 21 | rbs=50000000 22 | agbs=5000000000 23 | 24 | chkp_layers=1 25 | PA=true 26 | PA_CPU=false 27 | CC=true 28 | SYNCHRONIZE=true 29 | PROFILE=false 30 | 31 | # Megatron Model Parallelism 32 | mp_size=1 33 | # DeepSpeed Pipeline parallelism 34 | pp_size=0 35 | 36 | 37 | ####### 38 | ## JACKMOD: add two options: 1 for data, 1 for tensorboard 39 | megatron_options=" \ 40 | --model-parallel-size ${mp_size} \ 41 | --pipe-parallel-size ${pp_size} \ 42 | --num-layers 4 \ 43 | --hidden-size 256 \ 44 | --num-attention-heads 8 \ 45 | --seq-length 512 \ 46 | --max-position-embeddings 512 \ 47 | --batch-size 320 \ 48 | --gas 16 \ 49 | --train-iters 320000 \ 50 | --lr-decay-iters 320000 \ 51 | --data-impl mmap \ 52 | --distributed-backend nccl \ 53 | --lr 0.0001 \ 54 | --lr-decay-style cosine \ 55 | --min-lr 1.0e-5 \ 56 | --weight-decay 0 \ 57 | --clip-grad 1.0 \ 58 | --warmup 0.01 \ 59 | --checkpoint-activations \ 60 | --log-interval 1 \ 61 | --save-interval 1000 \ 62 | --eval-interval 100000 \ 63 | --eval-iters 10 \ 64 | --save megatron_molbart_100m_checkpoint 65 | --dataset_path ../data/zinc.tab 66 | --load 
/mol-gen/drug/models/megamolbart/checkpoints 67 | " 68 | 69 | deepspeed_options=" \ 70 | --deepspeed \ 71 | --deepspeed_config ${config_json} \ 72 | --zero-stage ${stage} \ 73 | --zero-reduce-bucket-size ${rbs} \ 74 | --zero-allgather-bucket-size ${agbs} 75 | " 76 | 77 | if [ "${contigious_gradients}" = "true" ]; then 78 | deepspeed_options="${deepspeed_options} \ 79 | --zero-contigious-gradients" 80 | fi 81 | 82 | if [ "${reduce_scatter}" = "true" ]; then 83 | deepspeed_options="${deepspeed_options} \ 84 | --zero-reduce-scatter" 85 | fi 86 | 87 | chkp_opt=" \ 88 | --checkpoint-activations \ 89 | --checkpoint-num-layers ${chkp_layers}" 90 | 91 | if [ "${PA}" = "true" ]; then 92 | chkp_opt="${chkp_opt} \ 93 | --partition-activations" 94 | fi 95 | 96 | if [ "${PA_CPU}" = "true" ]; then 97 | chkp_opt="${chkp_opt} \ 98 | --checkpoint-in-cpu" 99 | fi 100 | 101 | if [ "${SYNCHRONIZE}" = "true" ]; then 102 | chkp_opt="${chkp_opt} \ 103 | --synchronize-each-layer" 104 | fi 105 | 106 | if [ "${CC}" = "true" ]; then 107 | chkp_opt="${chkp_opt} \ 108 | --contigious-checkpointing" 109 | fi 110 | 111 | if [ "${PROFILE}" = "true" ]; then 112 | chkp_opt="${chkp_opt} \ 113 | --profile-backward" 114 | fi 115 | 116 | full_options="${megatron_options} ${deepspeed_options} ${chkp_opt}" 117 | 118 | run_cmd="deepspeed --include localhost:4 --master_port=${MASTER_PORT} megatron_molbart/eval_retrieval_controlled.py $@ ${full_options}" 119 | echo ${run_cmd} 120 | eval ${run_cmd} 121 | 122 | set +x -------------------------------------------------------------------------------- /inference/cheminformatics/common/cuchemcommon/utils/logger.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/NVIDIA/cheminformatics/blob/master/common/cuchemcommon/utils/logger.py 4 | # 5 | # The license for the original version of this file 
can be 6 | # found in this directory (LICENSE_CHEMINFORMATICS). 7 | # --------------------------------------------------------------- 8 | 9 | import logging 10 | import os 11 | from datetime import datetime 12 | 13 | from cuchemcommon.context import Context 14 | 15 | from .sysinfo import get_machine_config, print_machine_config 16 | 17 | BENCHMARK_FILE = '/data/benchmark.csv' 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | def initialize_logfile(benchmark_file=BENCHMARK_FILE): 23 | """Initialize benchmark file with header if needed""" 24 | 25 | config = get_machine_config() 26 | config_message = print_machine_config(config) 27 | 28 | if not os.path.exists(benchmark_file): 29 | with open(benchmark_file, 'w') as fh: 30 | fh.write(f'# {config_message}\n') 31 | fh.write('date,benchmark_type,step,time(hh:mm:ss.ms),n_molecules,n_workers,metric_name,metric_value\n') 32 | return benchmark_file 33 | 34 | 35 | class MetricsLogger(object): 36 | 37 | def __init__(self, 38 | task_name, 39 | n_molecules): 40 | 41 | self.task_name = task_name 42 | self.n_molecules = n_molecules 43 | self.start_time = None 44 | self.metric_name = None 45 | self.metric_value = None 46 | 47 | self.metric_func = None 48 | self.metric_func_args = None 49 | self.metric_func_kwargs = {} 50 | 51 | def __enter__(self): 52 | self.start_time = datetime.now() 53 | 54 | return self 55 | 56 | def __exit__(self, type, value, traceback): 57 | context = Context() 58 | 59 | runtime = datetime.now() - self.start_time 60 | logger.info('### Runtime {} time (hh:mm:ss.ms) {}'.format(self.task_name, runtime)) 61 | n_workers = len(context.dask_client.cluster.workers) 62 | 63 | if self.metric_func and context.is_benchmark: 64 | self.metric_value = self.metric_func(*self.metric_func_args, 65 | **self.metric_func_kwargs) 66 | 67 | if self.metric_value is None: 68 | self.metric_name = '' 69 | self.metric_value = '' 70 | else: 71 | logger.info('Calculated {} is {}'.format(self.metric_name, self.metric_value)) 72 
| 73 | log_results(self.start_time, context.compute_type, self.task_name, 74 | runtime, 75 | n_molecules=self.n_molecules, 76 | n_workers=n_workers, 77 | metric_name=self.metric_name, 78 | metric_value=self.metric_value, 79 | benchmark_file=context.benchmark_file) 80 | 81 | 82 | def log_results(date, 83 | benchmark_type, 84 | step, 85 | time, 86 | n_molecules, 87 | n_workers, 88 | metric_name='', 89 | metric_value='', 90 | benchmark_file=BENCHMARK_FILE): 91 | """Log benchmark results to a file""" 92 | 93 | out_list = [date, benchmark_type, step, time, n_molecules, n_workers, metric_name, metric_value] 94 | out_fmt = ','.join(['{}'] * len(out_list)) + '\n' 95 | 96 | with open(benchmark_file, 'a') as fh: 97 | out_string = out_fmt.format(*out_list) 98 | fh.write(out_string) 99 | -------------------------------------------------------------------------------- /inference/cheminformatics/common/cuchemcommon/fingerprint.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/NVIDIA/cheminformatics/blob/master/common/cuchemcommon/fingerprint.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_CHEMINFORMATICS). 
7 | # --------------------------------------------------------------- 8 | 9 | import logging 10 | import os 11 | from abc import ABC 12 | from enum import Enum 13 | 14 | import numpy as np 15 | import pandas as pd 16 | from cddd.inference import InferenceModel 17 | from cuchem.utils.data_peddler import download_cddd_models 18 | from rdkit import Chem 19 | from rdkit.Chem import AllChem 20 | 21 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | def calc_morgan_fingerprints(dataframe, smiles_col='canonical_smiles'): 26 | """Calculate Morgan fingerprints on SMILES strings 27 | 28 | Args: 29 | dataframe (pd.DataFrame): dataframe containing a SMILES column for calculation 30 | 31 | Returns: 32 | pd.DataFrame: new dataframe containing fingerprints 33 | """ 34 | mf = MorganFingerprint() 35 | fp = mf.transform(dataframe, col_name=smiles_col) 36 | fp = pd.DataFrame(fp) 37 | fp.index = dataframe.index 38 | return fp 39 | 40 | 41 | class TransformationDefaults(Enum): 42 | MorganFingerprint = {'radius': 2, 'nBits': 512} 43 | Embeddings = {} 44 | 45 | 46 | class BaseTransformation(ABC): 47 | def __init__(self, **kwargs): 48 | self.name = None 49 | self.kwargs = None 50 | self.func = None 51 | 52 | def transform(self, data): 53 | return NotImplemented 54 | 55 | def transform_many(self, data): 56 | return list(map(self.transform, data)) 57 | 58 | def __len__(self): 59 | return NotImplemented 60 | 61 | 62 | class MorganFingerprint(BaseTransformation): 63 | 64 | def __init__(self, **kwargs): 65 | self.name = __class__.__name__.split('.')[-1] 66 | self.kwargs = TransformationDefaults[self.name].value 67 | self.kwargs.update(kwargs) 68 | self.func = AllChem.GetMorganFingerprintAsBitVect 69 | 70 | def transform(self, data, col_name='transformed_smiles'): 71 | data = data[col_name] 72 | fp_array = [] 73 | for mol in data: 74 | m = Chem.MolFromSmiles(mol) 75 | fp = self.func(m, **self.kwargs) 76 | fp_array.append(list(fp.ToBitString())) 
77 | fp_array = np.asarray(fp_array) 78 | return fp_array 79 | 80 | def __len__(self): 81 | return self.kwargs['nBits'] 82 | 83 | 84 | class Embeddings(BaseTransformation): 85 | 86 | def __init__(self, use_gpu=True, cpu_threads=5, model_dir=None, **kwargs): 87 | self.name = __class__.__name__.split('.')[-1] 88 | self.kwargs = TransformationDefaults[self.name].value 89 | self.kwargs.update(kwargs) 90 | model_dir = download_cddd_models() 91 | self.func = InferenceModel(model_dir, use_gpu=use_gpu, cpu_threads=cpu_threads) 92 | 93 | def transform(self, data): 94 | data = data['transformed_smiles'] 95 | return self.func.seq_to_emb(data).squeeze() 96 | 97 | def inverse_transform(self, embeddings): 98 | "Embedding array -- individual compound embeddings are in rows" 99 | embeddings = np.asarray(embeddings) 100 | return self.func.emb_to_seq(embeddings) 101 | 102 | def __len__(self): 103 | return self.func.hparams.emb_size 104 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/assess_goal_directed_generation.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/BenevolentAI/guacamol/blob/master/guacamol/assess_goal_directed_generation.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_GUACAMOL). 
7 | # --------------------------------------------------------------- 8 | 9 | import datetime 10 | import json 11 | import logging 12 | from collections import OrderedDict 13 | from typing import List, Any, Dict 14 | 15 | import guacamol 16 | from guacamol.goal_directed_benchmark import GoalDirectedBenchmark, GoalDirectedBenchmarkResult 17 | from guacamol.goal_directed_generator import GoalDirectedGenerator 18 | from guacamol.benchmark_suites import goal_directed_benchmark_suite 19 | from guacamol.utils.data import get_time_string 20 | 21 | logger = logging.getLogger(__name__) 22 | logger.addHandler(logging.NullHandler()) 23 | 24 | from pdb import set_trace 25 | 26 | def assess_goal_directed_generation(goal_directed_molecule_generator: GoalDirectedGenerator, 27 | json_output_file='output_goal_directed.json', 28 | benchmark_version='v1') -> None: 29 | """ 30 | Assesses a distribution-matching model for de novo molecule design. 31 | 32 | Args: 33 | goal_directed_molecule_generator: Model to evaluate 34 | json_output_file: Name of the file where to save the results in JSON format 35 | benchmark_version: which benchmark suite to execute 36 | """ 37 | logger.info(f'Benchmarking goal-directed molecule generation, version {benchmark_version}') 38 | benchmarks = goal_directed_benchmark_suite(version_name=benchmark_version) 39 | set_trace() 40 | 41 | results = _evaluate_goal_directed_benchmarks( 42 | goal_directed_molecule_generator=goal_directed_molecule_generator, 43 | benchmarks=benchmarks) 44 | 45 | benchmark_results: Dict[str, Any] = OrderedDict() 46 | benchmark_results['guacamol_version'] = guacamol.__version__ 47 | benchmark_results['benchmark_suite_version'] = benchmark_version 48 | benchmark_results['timestamp'] = get_time_string() 49 | benchmark_results['results'] = [vars(result) for result in results] 50 | 51 | logger.info(f'Save results to file {json_output_file}') 52 | with open(json_output_file, 'wt') as f: 53 | f.write(json.dumps(benchmark_results, indent=4)) 
54 | 55 | 56 | def _evaluate_goal_directed_benchmarks(goal_directed_molecule_generator: GoalDirectedGenerator, 57 | benchmarks: List[GoalDirectedBenchmark] 58 | ) -> List[GoalDirectedBenchmarkResult]: 59 | """ 60 | Evaluate a model with the given benchmarks. 61 | Should not be called directly except for testing purposes. 62 | 63 | Args: 64 | goal_directed_molecule_generator: model to assess 65 | benchmarks: list of benchmarks to evaluate 66 | json_output_file: Name of the file where to save the results in JSON format 67 | """ 68 | 69 | logger.info(f'Number of benchmarks: {len(benchmarks)}') 70 | 71 | results = [] 72 | for i, benchmark in enumerate(benchmarks, 1): 73 | logger.info(f'Running benchmark {i}/{len(benchmarks)}: {benchmark.name}') 74 | result = benchmark.assess_model(goal_directed_molecule_generator) 75 | logger.info(f'Results for the benchmark "{result.benchmark_name}":') 76 | logger.info(f' Score: {result.score:.6f}') 77 | logger.info(f' Execution time: {str(datetime.timedelta(seconds=int(result.execution_time)))}') 78 | logger.info(f' Metadata: {result.metadata}') 79 | results.append(result) 80 | 81 | logger.info('Finished execution of the benchmarks') 82 | 83 | return results 84 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/mpu/tests/test_initialize.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from commons import print_separator 17 | from commons import initialize_distributed 18 | import mpu 19 | import torch 20 | import sys 21 | sys.path.append("../..") 22 | 23 | 24 | def test_initialize_model_parallel(model_parallel_size): 25 | 26 | if torch.distributed.get_rank() == 0: 27 | print('> testing initialize_model_parallel with size {} ...'.format( 28 | model_parallel_size)) 29 | model_parallel_size_ = min(model_parallel_size, 30 | torch.distributed.get_world_size()) 31 | assert not mpu.model_parallel_is_initialized() 32 | mpu.initialize_model_parallel(model_parallel_size_) 33 | assert mpu.model_parallel_is_initialized() 34 | 35 | # Checks. 36 | def check(group, world_size, rank): 37 | assert world_size == torch.distributed.get_world_size(group=group) 38 | assert rank == torch.distributed.get_rank(group=group) 39 | 40 | # Model parallel. 41 | world_size = model_parallel_size_ 42 | rank = torch.distributed.get_rank() % model_parallel_size_ 43 | assert world_size == mpu.get_model_parallel_world_size() 44 | assert rank == mpu.get_model_parallel_rank() 45 | check(mpu.get_model_parallel_group(), world_size, rank) 46 | 47 | # Data parallel. 
48 | world_size = torch.distributed.get_world_size() // model_parallel_size_ 49 | rank = torch.distributed.get_rank() // model_parallel_size 50 | assert world_size == mpu.get_data_parallel_world_size() 51 | assert rank == mpu.get_data_parallel_rank() 52 | check(mpu.get_data_parallel_group(), world_size, rank) 53 | 54 | # Reset groups 55 | mpu.destroy_model_parallel() 56 | 57 | torch.distributed.barrier() 58 | if torch.distributed.get_rank() == 0: 59 | print('>> passed the test :-)') 60 | 61 | 62 | def test_get_model_parallel_src_rank(model_parallel_size_): 63 | 64 | if torch.distributed.get_rank() == 0: 65 | print('> testing get_model_parallel_src_rank with size {} ...'.format( 66 | model_parallel_size_)) 67 | model_parallel_size = min(model_parallel_size_, 68 | torch.distributed.get_world_size()) 69 | assert not mpu.model_parallel_is_initialized() 70 | mpu.initialize_model_parallel(model_parallel_size) 71 | assert mpu.model_parallel_is_initialized() 72 | 73 | # Checks 74 | src_rank = torch.distributed.get_rank() - mpu.get_model_parallel_rank() 75 | assert mpu.get_model_parallel_src_rank() == src_rank 76 | 77 | # Reset groups 78 | mpu.destroy_model_parallel() 79 | 80 | torch.distributed.barrier() 81 | if torch.distributed.get_rank() == 0: 82 | print('>> passed the test :-)') 83 | 84 | 85 | if __name__ == '__main__': 86 | 87 | initialize_distributed() 88 | world_size = torch.distributed.get_world_size() 89 | model_parallel_size = 1 90 | while model_parallel_size <= world_size: 91 | print_separator('test initialize model parallel') 92 | test_initialize_model_parallel(model_parallel_size) 93 | print_separator('test model parallel source rank') 94 | test_get_model_parallel_src_rank(model_parallel_size) 95 | model_parallel_size *= 2 96 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/mpu/tests/test_initialize.py: 
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from commons import print_separator 17 | from commons import initialize_distributed 18 | import mpu 19 | import torch 20 | import sys 21 | sys.path.append("../..") 22 | 23 | 24 | def test_initialize_model_parallel(model_parallel_size): 25 | 26 | if torch.distributed.get_rank() == 0: 27 | print('> testing initialize_model_parallel with size {} ...'.format( 28 | model_parallel_size)) 29 | model_parallel_size_ = min(model_parallel_size, 30 | torch.distributed.get_world_size()) 31 | assert not mpu.model_parallel_is_initialized() 32 | mpu.initialize_model_parallel(model_parallel_size_) 33 | assert mpu.model_parallel_is_initialized() 34 | 35 | # Checks. 36 | def check(group, world_size, rank): 37 | assert world_size == torch.distributed.get_world_size(group=group) 38 | assert rank == torch.distributed.get_rank(group=group) 39 | 40 | # Model parallel. 41 | world_size = model_parallel_size_ 42 | rank = torch.distributed.get_rank() % model_parallel_size_ 43 | assert world_size == mpu.get_model_parallel_world_size() 44 | assert rank == mpu.get_model_parallel_rank() 45 | check(mpu.get_model_parallel_group(), world_size, rank) 46 | 47 | # Data parallel. 
48 | world_size = torch.distributed.get_world_size() // model_parallel_size_ 49 | rank = torch.distributed.get_rank() // model_parallel_size 50 | assert world_size == mpu.get_data_parallel_world_size() 51 | assert rank == mpu.get_data_parallel_rank() 52 | check(mpu.get_data_parallel_group(), world_size, rank) 53 | 54 | # Reset groups 55 | mpu.destroy_model_parallel() 56 | 57 | torch.distributed.barrier() 58 | if torch.distributed.get_rank() == 0: 59 | print('>> passed the test :-)') 60 | 61 | 62 | def test_get_model_parallel_src_rank(model_parallel_size_): 63 | 64 | if torch.distributed.get_rank() == 0: 65 | print('> testing get_model_parallel_src_rank with size {} ...'.format( 66 | model_parallel_size_)) 67 | model_parallel_size = min(model_parallel_size_, 68 | torch.distributed.get_world_size()) 69 | assert not mpu.model_parallel_is_initialized() 70 | mpu.initialize_model_parallel(model_parallel_size) 71 | assert mpu.model_parallel_is_initialized() 72 | 73 | # Checks 74 | src_rank = torch.distributed.get_rank() - mpu.get_model_parallel_rank() 75 | assert mpu.get_model_parallel_src_rank() == src_rank 76 | 77 | # Reset groups 78 | mpu.destroy_model_parallel() 79 | 80 | torch.distributed.barrier() 81 | if torch.distributed.get_rank() == 0: 82 | print('>> passed the test :-)') 83 | 84 | 85 | if __name__ == '__main__': 86 | 87 | initialize_distributed() 88 | world_size = torch.distributed.get_world_size() 89 | model_parallel_size = 1 90 | while model_parallel_size <= world_size: 91 | print_separator('test initialize model parallel') 92 | test_initialize_model_parallel(model_parallel_size) 93 | print_separator('test model parallel source rank') 94 | test_get_model_parallel_src_rank(model_parallel_size) 95 | model_parallel_size *= 2 96 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_pretrain_gpt2.sh: 
-------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | export DLWS_NUM_WORKER=${NNODES} 12 | export DLWS_NUM_GPU_PER_WORKER=${GPUS_PER_NODE} 13 | 14 | DATA_PATH=data/webtext/webtext_text_document 15 | VOCAB_PATH=data/gpt2-vocab.json 16 | MERGE_PATH=data/gpt2-merges.txt 17 | CHECKPOINT_PATH=checkpoints/gpt2_345m_ds 18 | 19 | script_path=$(realpath $0) 20 | script_dir=$(dirname $script_path) 21 | config_json="$script_dir/ds_zero_stage_2_config.json" 22 | 23 | # Megatron Model Parallelism 24 | mp_size=4 25 | 26 | NLAYERS=24 27 | NHIDDEN=1024 28 | BATCHSIZE=9 29 | LOGDIR="tensorboard_data/${NLAYERS}l_${NHIDDEN}h_${NNODES}n_${GPUS_PER_NODE}g_${mp_size}mp_${BATCHSIZE}b_ds4" 30 | 31 | #ZeRO Configs 32 | stage=0 33 | reduce_scatter=true 34 | contigious_gradients=true 35 | rbs=50000000 36 | agbs=5000000000 37 | 38 | #Actication Checkpointing and Contigious Memory 39 | chkp_layers=1 40 | PA=true 41 | PA_CPU=false 42 | CC=true 43 | SYNCHRONIZE=true 44 | PROFILE=false 45 | 46 | 47 | gpt_options=" \ 48 | --model-parallel-size ${mp_size} \ 49 | --num-layers $NLAYERS \ 50 | --hidden-size $NHIDDEN \ 51 | --num-attention-heads 16 \ 52 | --seq-length 1024 \ 53 | --max-position-embeddings 1024 \ 54 | --batch-size $BATCHSIZE \ 55 | --train-iters 320000 \ 56 | --lr-decay-iters 320000 \ 57 | --save $CHECKPOINT_PATH \ 58 | --load $CHECKPOINT_PATH \ 59 | --data-path $DATA_PATH \ 60 | --vocab-file $VOCAB_PATH \ 61 | --merge-file $MERGE_PATH \ 62 | --data-impl mmap \ 63 | --split 949,50,1 \ 64 | --distributed-backend nccl \ 65 | --lr 1.5e-4 \ 66 | --lr-decay-style cosine \ 67 | --min-lr 1.0e-5 \ 68 | --weight-decay 1e-2 \ 69 | --clip-grad 1.0 \ 70 | --warmup 0.01 \ 71 | --checkpoint-activations \ 72 | --log-interval 100 \ 73 | --save-interval 10000 \ 74 | 
--eval-interval 1000 \ 75 | --eval-iters 10 \ 76 | --fp16 \ 77 | --tensorboard-dir ${LOGDIR} 78 | " 79 | 80 | deepspeed_options=" \ 81 | --deepspeed \ 82 | --deepspeed_config ${config_json} \ 83 | --zero-stage ${stage} \ 84 | --zero-reduce-bucket-size ${rbs} \ 85 | --zero-allgather-bucket-size ${agbs} 86 | " 87 | 88 | if [ "${contigious_gradients}" = "true" ]; then 89 | deepspeed_options="${deepspeed_options} \ 90 | --zero-contigious-gradients" 91 | fi 92 | 93 | if [ "${reduce_scatter}" = "true" ]; then 94 | deepspeed_options="${deepspeed_options} \ 95 | --zero-reduce-scatter" 96 | fi 97 | 98 | chkp_opt=" \ 99 | --checkpoint-activations \ 100 | --checkpoint-num-layers ${chkp_layers}" 101 | 102 | if [ "${PA}" = "true" ]; then 103 | chkp_opt="${chkp_opt} \ 104 | --partition-activations" 105 | fi 106 | 107 | if [ "${PA_CPU}" = "true" ]; then 108 | chkp_opt="${chkp_opt} \ 109 | --checkpoint-in-cpu" 110 | fi 111 | 112 | if [ "${SYNCHRONIZE}" = "true" ]; then 113 | chkp_opt="${chkp_opt} \ 114 | --synchronize-each-layer" 115 | fi 116 | 117 | if [ "${CC}" = "true" ]; then 118 | chkp_opt="${chkp_opt} \ 119 | --contigious-checkpointing" 120 | fi 121 | 122 | if [ "${PROFILE}" = "true" ]; then 123 | chkp_opt="${chkp_opt} \ 124 | --profile-backward" 125 | fi 126 | 127 | full_options="${gpt_options} ${deepspeed_options} ${chkp_opt}" 128 | 129 | run_cmd="deepspeed --num_nodes ${DLWS_NUM_WORKER} --num_gpus ${DLWS_NUM_GPU_PER_WORKER} pretrain_gpt2.py $@ ${full_options}" 130 | echo ${run_cmd} 131 | eval ${run_cmd} 132 | 133 | set +x 134 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/fused_kernels/scaled_masked_softmax_cuda.cu: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "THC/THC.h" 23 | #include 24 | #include 25 | #include "scaled_masked_softmax.h" 26 | 27 | namespace multihead_attn { 28 | namespace fused_softmax { 29 | namespace scaled_masked_softmax { 30 | 31 | torch::Tensor fwd_cuda( 32 | torch::Tensor const& input, 33 | torch::Tensor const& mask, 34 | float scale_factor) 35 | { 36 | // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 37 | const int batches = input.size(0); 38 | const int pad_batches = mask.size(0); 39 | const int attn_heads = input.size(1); 40 | const int seq_len = input.size(2); 41 | TORCH_INTERNAL_ASSERT(seq_len <= 2048); 42 | TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches); 43 | TORCH_INTERNAL_ASSERT(mask.size(1) == 1); 44 | TORCH_INTERNAL_ASSERT(mask.size(2) == seq_len); 45 | TORCH_INTERNAL_ASSERT(mask.size(3) == seq_len); 46 | 47 | // Output 48 | auto act_options = input.options().requires_grad(false); 49 | torch::Tensor softmax_results = 50 | torch::empty({batches, attn_heads, seq_len, seq_len}, act_options); 51 | 52 | // Softmax Intermediate Result Ptr 53 | void* input_ptr = static_cast(input.data_ptr()); 54 | void* mask_ptr = static_cast(mask.data_ptr()); 55 | void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); 56 | 57 | dispatch_scaled_masked_softmax_forward( 
58 | reinterpret_cast(softmax_results_ptr), 59 | reinterpret_cast(input_ptr), 60 | reinterpret_cast(mask_ptr), 61 | scale_factor, 62 | seq_len, 63 | seq_len, 64 | batches, 65 | attn_heads, 66 | pad_batches); 67 | return softmax_results; 68 | } 69 | 70 | torch::Tensor bwd_cuda( 71 | torch::Tensor const& output_grads_, 72 | torch::Tensor const& softmax_results_, 73 | float scale_factor) { 74 | 75 | auto output_grads = output_grads_.contiguous(); 76 | auto softmax_results = softmax_results_.contiguous(); 77 | 78 | //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 79 | const int batches = output_grads.size(0); 80 | const int attn_heads = output_grads.size(1); 81 | const int seq_len = output_grads.size(2); 82 | TORCH_INTERNAL_ASSERT(output_grads.size(2) == output_grads.size(3)); 83 | 84 | void* output_grads_ptr = static_cast(output_grads.data_ptr()); 85 | 86 | //Softmax Grad 87 | dispatch_scaled_masked_softmax_backward( 88 | reinterpret_cast(output_grads_ptr), 89 | reinterpret_cast(output_grads_ptr), 90 | reinterpret_cast(softmax_results.data_ptr()), 91 | scale_factor, 92 | seq_len, 93 | seq_len, 94 | batches, 95 | attn_heads); 96 | 97 | //backward pass is completely in-place 98 | return output_grads; 99 | } 100 | } 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /MolBART/eval_megatron_retrieval.sh: -------------------------------------------------------------------------------- 1 | GPUS_PER_NODE=1 # 4 2 | # Change for multinode config 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=6004 5 | NNODES=1 6 | NODE_RANK=0 7 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 8 | 9 | export DLWS_NUM_WORKER=${NNODES} 10 | export DLWS_NUM_GPU_PER_WORKER=${GPUS_PER_NODE} 11 | 12 | script_path=$(realpath $0) 13 | script_dir=$(dirname $script_path) 14 | # config_json="$script_dir/megatron_molbart/ds_config.json" 15 | config_json="megatron_molbart/ds_config.json" 16 | 17 | #ZeRO Configs 18 | stage=1 19 | 
reduce_scatter=true 20 | contigious_gradients=false 21 | rbs=50000000 22 | agbs=5000000000 23 | 24 | chkp_layers=1 25 | PA=true 26 | PA_CPU=false 27 | CC=true 28 | SYNCHRONIZE=true 29 | PROFILE=false 30 | 31 | # Megatron Model Parallelism 32 | mp_size=1 33 | # DeepSpeed Pipeline parallelism 34 | pp_size=0 35 | 36 | 37 | ####### 38 | ## JACKMOD: add two options: 1 for data, 1 for tensorboard 39 | megatron_options=" \ 40 | --model-parallel-size ${mp_size} \ 41 | --pipe-parallel-size ${pp_size} \ 42 | --num-layers 4 \ 43 | --hidden-size 256 \ 44 | --num-attention-heads 8 \ 45 | --seq-length 512 \ 46 | --max-position-embeddings 512 \ 47 | --batch-size 320 \ 48 | --gas 16 \ 49 | --train-iters 320000 \ 50 | --lr-decay-iters 320000 \ 51 | --data-impl mmap \ 52 | --distributed-backend nccl \ 53 | --lr 0.0001 \ 54 | --lr-decay-style cosine \ 55 | --min-lr 1.0e-5 \ 56 | --weight-decay 0 \ 57 | --clip-grad 1.0 \ 58 | --warmup 0.01 \ 59 | --checkpoint-activations \ 60 | --log-interval 1 \ 61 | --save-interval 1000 \ 62 | --eval-interval 100000 \ 63 | --eval-iters 10 \ 64 | --save megatron_molbart_100m_checkpoint 65 | --dataset_path ../data/zinc.tab 66 | --load /mol-gen/drug/models/megamolbart/checkpoints 67 | " 68 | 69 | deepspeed_options=" \ 70 | --deepspeed \ 71 | --deepspeed_config ${config_json} \ 72 | --zero-stage ${stage} \ 73 | --zero-reduce-bucket-size ${rbs} \ 74 | --zero-allgather-bucket-size ${agbs} 75 | " 76 | 77 | if [ "${contigious_gradients}" = "true" ]; then 78 | deepspeed_options="${deepspeed_options} \ 79 | --zero-contigious-gradients" 80 | fi 81 | 82 | if [ "${reduce_scatter}" = "true" ]; then 83 | deepspeed_options="${deepspeed_options} \ 84 | --zero-reduce-scatter" 85 | fi 86 | 87 | chkp_opt=" \ 88 | --checkpoint-activations \ 89 | --checkpoint-num-layers ${chkp_layers}" 90 | 91 | if [ "${PA}" = "true" ]; then 92 | chkp_opt="${chkp_opt} \ 93 | --partition-activations" 94 | fi 95 | 96 | if [ "${PA_CPU}" = "true" ]; then 97 | chkp_opt="${chkp_opt} \ 98 | 
--checkpoint-in-cpu" 99 | fi 100 | 101 | if [ "${SYNCHRONIZE}" = "true" ]; then 102 | chkp_opt="${chkp_opt} \ 103 | --synchronize-each-layer" 104 | fi 105 | 106 | if [ "${CC}" = "true" ]; then 107 | chkp_opt="${chkp_opt} \ 108 | --contigious-checkpointing" 109 | fi 110 | 111 | if [ "${PROFILE}" = "true" ]; then 112 | chkp_opt="${chkp_opt} \ 113 | --profile-backward" 114 | fi 115 | 116 | full_options="${megatron_options} ${deepspeed_options} ${chkp_opt}" 117 | 118 | 119 | custom_train_options=" \ 120 | --stage 1 \ 121 | --train_from pretrain \ 122 | --model_ckpt_itr 134000 \ 123 | --attr logp-sa \ 124 | --attr_offset 0 \ 125 | --data_source jtnn \ 126 | --enumeration_input false \ 127 | --retriever_rule random \ 128 | --pred_target nearestn \ 129 | --n_retrievals 10 \ 130 | --n_neighbors 100 131 | " 132 | 133 | 134 | run_cmd="deepspeed --include localhost:0 --master_port=${MASTER_PORT} megatron_molbart/eval_retrieval.py $@ ${full_options} ${custom_train_options}" 135 | echo ${run_cmd} 136 | eval ${run_cmd} 137 | 138 | set +x -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_pretrain_gpt2_pipe.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | GPUS_PER_NODE=16 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | export DLWS_NUM_WORKER=${NNODES} 12 | export DLWS_NUM_GPU_PER_WORKER=${GPUS_PER_NODE} 13 | 14 | DATA_PATH=data/webtext/webtext_text_document 15 | VOCAB_PATH=data/gpt2-vocab.json 16 | MERGE_PATH=data/gpt2-merges.txt 17 | CHECKPOINT_PATH=checkpoints/gpt2_345m_ds 18 | 19 | script_path=$(realpath $0) 20 | script_dir=$(dirname $script_path) 21 | #config_json="$script_dir/ds_zero_stage_2_config.json" 22 | config_json="$script_dir/ds_config.json" 23 | 24 | # Megatron Model Parallelism 25 | mp_size=2 26 | # DeepSpeed Pipeline parallelism 27 | pp_size=2 28 | 29 | NLAYERS=24 30 | NHIDDEN=1024 31 | BATCHSIZE=4 32 | LOGDIR="tensorboard_data/${NLAYERS}l_${NHIDDEN}h_${NNODES}n_${GPUS_PER_NODE}g_${pp_size}pp_${mp_size}mp_${BATCHSIZE}b_ds4" 33 | 34 | GAS=16 35 | 36 | #ZeRO Configs 37 | stage=0 38 | reduce_scatter=true 39 | contigious_gradients=true 40 | rbs=50000000 41 | agbs=5000000000 42 | 43 | #Actication Checkpointing and Contigious Memory 44 | chkp_layers=1 45 | PA=true 46 | PA_CPU=false 47 | CC=true 48 | SYNCHRONIZE=true 49 | PROFILE=false 50 | 51 | 52 | gpt_options=" \ 53 | --model-parallel-size ${mp_size} \ 54 | --pipe-parallel-size ${pp_size} \ 55 | --num-layers $NLAYERS \ 56 | --hidden-size $NHIDDEN \ 57 | --num-attention-heads 16 \ 58 | --seq-length 1024 \ 59 | --max-position-embeddings 1024 \ 60 | --batch-size $BATCHSIZE \ 61 | --gas $GAS \ 62 | --train-iters 320000 \ 63 | --lr-decay-iters 320000 \ 64 | --save $CHECKPOINT_PATH \ 65 | --load $CHECKPOINT_PATH \ 66 | --data-path $DATA_PATH \ 67 | --vocab-file $VOCAB_PATH \ 68 | --merge-file $MERGE_PATH \ 69 | --data-impl mmap \ 70 | --split 949,50,1 \ 71 | --distributed-backend nccl \ 72 | --lr 1.5e-4 \ 73 | --lr-decay-style cosine \ 74 | --min-lr 1.0e-5 \ 75 | --weight-decay 1e-2 \ 76 | --clip-grad 1.0 \ 77 | --warmup 0.01 \ 
78 | --checkpoint-activations \ 79 | --log-interval 1 \ 80 | --save-interval 500 \ 81 | --eval-interval 100 \ 82 | --eval-iters 10 \ 83 | --fp16 \ 84 | --tensorboard-dir ${LOGDIR} 85 | " 86 | 87 | deepspeed_options=" \ 88 | --deepspeed \ 89 | --deepspeed_config ${config_json} \ 90 | --zero-stage ${stage} \ 91 | --zero-reduce-bucket-size ${rbs} \ 92 | --zero-allgather-bucket-size ${agbs} 93 | " 94 | 95 | if [ "${contigious_gradients}" = "true" ]; then 96 | deepspeed_options="${deepspeed_options} \ 97 | --zero-contigious-gradients" 98 | fi 99 | 100 | if [ "${reduce_scatter}" = "true" ]; then 101 | deepspeed_options="${deepspeed_options} \ 102 | --zero-reduce-scatter" 103 | fi 104 | 105 | chkp_opt=" \ 106 | --checkpoint-activations \ 107 | --checkpoint-num-layers ${chkp_layers}" 108 | 109 | if [ "${PA}" = "true" ]; then 110 | chkp_opt="${chkp_opt} \ 111 | --partition-activations" 112 | fi 113 | 114 | if [ "${PA_CPU}" = "true" ]; then 115 | chkp_opt="${chkp_opt} \ 116 | --checkpoint-in-cpu" 117 | fi 118 | 119 | if [ "${SYNCHRONIZE}" = "true" ]; then 120 | chkp_opt="${chkp_opt} \ 121 | --synchronize-each-layer" 122 | fi 123 | 124 | if [ "${CC}" = "true" ]; then 125 | chkp_opt="${chkp_opt} \ 126 | --contigious-checkpointing" 127 | fi 128 | 129 | if [ "${PROFILE}" = "true" ]; then 130 | chkp_opt="${chkp_opt} \ 131 | --profile-backward" 132 | fi 133 | 134 | full_options="${gpt_options} ${deepspeed_options} ${chkp_opt}" 135 | 136 | run_cmd="deepspeed --num_nodes ${DLWS_NUM_WORKER} --num_gpus ${DLWS_NUM_GPU_PER_WORKER} pretrain_gpt2.py $@ ${full_options}" 137 | echo ${run_cmd} 138 | eval ${run_cmd} 139 | 140 | set +x 141 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/tasks/data_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """ Tasks data utility.""" 17 | 18 | import re 19 | import numpy as np 20 | 21 | 22 | def clean_text(text): 23 | """Remove new lines and multiple spaces and adjust end of sentence dot.""" 24 | 25 | text = text.replace("\n", " ") 26 | text = re.sub(r'\s+', ' ', text) 27 | for _ in range(3): 28 | text = text.replace(' . ', '. ') 29 | 30 | return text 31 | 32 | 33 | def build_sample(ids, types, paddings, label, unique_id): 34 | """Convert to numpy and return a sample consumed by the batch producer.""" 35 | 36 | ids_np = np.array(ids, dtype=np.int64) 37 | types_np = np.array(types, dtype=np.int64) 38 | paddings_np = np.array(paddings, dtype=np.int64) 39 | sample = ({'text': ids_np, 40 | 'types': types_np, 41 | 'padding_mask': paddings_np, 42 | 'label': int(label), 43 | 'uid': int(unique_id)}) 44 | 45 | return sample 46 | 47 | 48 | def build_tokens_types_paddings_from_text(text_a, text_b, 49 | tokenizer, max_seq_length): 50 | """Build token types and paddings, trim if needed, and pad if needed.""" 51 | 52 | text_a_ids = tokenizer.tokenize(text_a) 53 | text_b_ids = None 54 | if text_b is not None: 55 | text_b_ids = tokenizer.tokenize(text_b) 56 | 57 | return build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, 58 | max_seq_length, tokenizer.cls, 59 | tokenizer.sep, tokenizer.pad) 60 | 61 | 62 | def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, 
max_seq_length, 63 | cls_id, sep_id, pad_id): 64 | """Build token types and paddings, trim if needed, and pad if needed.""" 65 | 66 | ids = [] 67 | types = [] 68 | paddings = [] 69 | 70 | # [CLS]. 71 | ids.append(cls_id) 72 | types.append(0) 73 | paddings.append(1) 74 | 75 | # A. 76 | len_text_a = len(text_a_ids) 77 | ids.extend(text_a_ids) 78 | types.extend([0] * len_text_a) 79 | paddings.extend([1] * len_text_a) 80 | 81 | # [SEP]. 82 | ids.append(sep_id) 83 | types.append(0) 84 | paddings.append(1) 85 | 86 | # B. 87 | if text_b_ids is not None: 88 | len_text_b = len(text_b_ids) 89 | ids.extend(text_b_ids) 90 | types.extend([1] * len_text_b) 91 | paddings.extend([1] * len_text_b) 92 | 93 | # Cap the size. 94 | trimmed = False 95 | if len(ids) >= max_seq_length: 96 | max_seq_length_m1 = max_seq_length - 1 97 | ids = ids[0:max_seq_length_m1] 98 | types = types[0:max_seq_length_m1] 99 | paddings = paddings[0:max_seq_length_m1] 100 | trimmed = True 101 | 102 | # [SEP]. 103 | if (text_b_ids is not None) or trimmed: 104 | ids.append(sep_id) 105 | if text_b_ids is None: 106 | types.append(0) 107 | else: 108 | types.append(1) 109 | paddings.append(1) 110 | 111 | # Padding. 112 | padding_length = max_seq_length - len(ids) 113 | if padding_length > 0: 114 | ids.extend([pad_id] * padding_length) 115 | types.extend([pad_id] * padding_length) 116 | paddings.extend([0] * padding_length) 117 | 118 | return ids, types, paddings 119 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | NVIDIA Source Code License for RetMol 2 | 3 | 1. Definitions 4 | 5 | “Licensor” means any person or entity that distributes its Work. 6 | 7 | “Software” means the original work of authorship made available under this License. 8 | 9 | “Work” means the Software and any additions to or derivative works of the Software that are made available under 10 | this License. 
11 | 12 | The terms “reproduce,” “reproduction,” “derivative works,” and “distribution” have the meaning as provided under 13 | U.S. copyright law; provided, however, that for the purposes of this License, derivative works shall not include 14 | works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work. 15 | 16 | Works, including the Software, are “made available” under this License by including in or with the Work either 17 | (a) a copyright notice referencing the applicability of this License to the Work, or (b) a copy of this License. 18 | 19 | 2. License Grant 20 | 21 | 2.1 Copyright Grant. Subject to the terms and conditions of this License, each Licensor grants to you a perpetual, 22 | worldwide, non-exclusive, royalty-free, copyright license to reproduce, prepare derivative works of, publicly 23 | display, publicly perform, sublicense and distribute its Work and any resulting derivative works in any form. 24 | 25 | 3. Limitations 26 | 27 | 3.1 Redistribution. You may reproduce or distribute the Work only if (a) you do so under this License, (b) you 28 | include a complete copy of this License with your distribution, and (c) you retain without modification any 29 | copyright, patent, trademark, or attribution notices that are present in the Work. 30 | 31 | 3.2 Derivative Works. You may specify that additional or different terms apply to the use, reproduction, and 32 | distribution of your derivative works of the Work (“Your Terms”) only if (a) Your Terms provide that the use 33 | limitation in Section 3.3 applies to your derivative works, and (b) you identify the specific derivative works 34 | that are subject to Your Terms. Notwithstanding Your Terms, this License (including the redistribution 35 | requirements in Section 3.1) will continue to apply to the Work itself. 36 | 37 | 3.3 Use Limitation. The Work and any derivative works thereof only may be used or intended for use 38 | non-commercially. 
Notwithstanding the foregoing, NVIDIA and its affiliates may use the Work and any derivative 39 | works commercially. As used herein, “non-commercially” means for research or evaluation purposes only. 40 | 41 | 3.4 Patent Claims. If you bring or threaten to bring a patent claim against any Licensor (including any claim, 42 | cross-claim or counterclaim in a lawsuit) to enforce any patents that you allege are infringed by any Work, then 43 | your rights under this License from such Licensor (including the grant in Section 2.1) will terminate immediately. 44 | 45 | 3.5 Trademarks. This License does not grant any rights to use any Licensor’s or its affiliates’ names, logos, 46 | or trademarks, except as necessary to reproduce the notices described in this License. 47 | 48 | 3.6 Termination. If you violate any term of this License, then your rights under this License (including the 49 | grant in Section 2.1) will terminate immediately. 50 | 51 | 4. Disclaimer of Warranty. 52 | 53 | THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING 54 | WARRANTIES OR CONDITIONS OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT. YOU 55 | BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE. 56 | 57 | 5. Limitation of Liability. 58 | 59 | EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING 60 | NEGLIGENCE), CONTRACT, OR OTHERWISE SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, 61 | INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR RELATED TO THIS LICENSE, THE USE OR 62 | INABILITY TO USE THE WORK (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS OR 63 | DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN 64 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
-------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/tools/openwebtext/find_duplicates.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import itertools 18 | import json 19 | from lsh import cache, minhash 20 | import time 21 | import sys 22 | 23 | 24 | # This function is adapted from: 25 | # https://github.com/mattilyra/LSH/blob/master/examples/Introduction.ipynb 26 | def shingles(text, char_ngram=5): 27 | return set(text[head:head + char_ngram] 28 | for head in range(0, len(text) - char_ngram)) 29 | 30 | 31 | # This function is adapted from: 32 | # https://github.com/mattilyra/LSH/blob/master/examples/Introduction.ipynb 33 | def jaccard(set_a, set_b): 34 | intersection = set_a & set_b 35 | union = set_a | set_b 36 | return len(intersection) / len(union) 37 | 38 | 39 | if __name__ == '__main__': 40 | 41 | print('finding possible duplicate content ...') 42 | 43 | input = sys.argv[1] 44 | output = sys.argv[2] 45 | 46 | hasher = minhash.MinHasher(seeds=100, char_ngram=5, hashbytes=4) 47 | lshcache = cache.Cache(bands=10, hasher=hasher) 48 | 49 | counter = 0 50 | url_doc = {} 51 | start_time = time.time() 52 | with open(input, 'r') as f: 53 | for line in f: 54 | 
try: 55 | myjson = json.loads(line) 56 | url = myjson['url'] 57 | text = myjson['text'] 58 | counter += 1 59 | url_doc[url] = text 60 | lshcache.add_fingerprint(hasher.fingerprint(text), url) 61 | except Exception as e: 62 | print('Error:', e) 63 | if counter % 10000 == 0: 64 | print(' [read]> processed {} documents in {:.2f} seconds ...'. 65 | format(counter, time.time() - start_time), flush=True) 66 | 67 | counter = 0 68 | start_time = time.time() 69 | deduped = 0 70 | with open(output, 'wb') as f: 71 | for b in lshcache.bins: 72 | for bucket_id in b: 73 | if len(b[bucket_id]) > 1: 74 | items = list(b[bucket_id]) 75 | main_url = items[0] 76 | main_dhingles = shingles(url_doc[main_url]) 77 | remove_urls = [] 78 | for i in range(1, len(items)): 79 | counter += 1 80 | other_url= items[i] 81 | other_shingles = shingles(url_doc[other_url]) 82 | try: 83 | jaccard_sim = jaccard(main_dhingles, other_shingles) 84 | except Exception as e: 85 | print('Error:', e) 86 | if jaccard_sim > 0.5: 87 | remove_urls.append({other_url: jaccard_sim}) 88 | deduped += 1 89 | if counter % 10000 == 0: 90 | print(' [write]> processed {} documents in {:.2f} ' 91 | 'seoncds and deduped {} documents ...'. 92 | format(counter, time.time() - start_time, 93 | deduped), flush=True) 94 | if len(remove_urls) > 0: 95 | myjson = json.dumps({main_url: remove_urls}, 96 | ensure_ascii=False) 97 | f.write(myjson.encode('utf-8')) 98 | f.write('\n'.encode('utf-8')) 99 | 100 | print('done :-)') 101 | --------------------------------------------------------------------------------