├── assets ├── vis.jpg └── model.jpg ├── MolBART ├── megatron_molbart │ ├── Megatron-LM-v1.1.5-3D_parallelism │ │ ├── megatron_lm.egg-info │ │ │ ├── not-zip-safe │ │ │ ├── dependency_links.txt │ │ │ ├── top_level.txt │ │ │ ├── requires.txt │ │ │ └── SOURCES.txt │ │ ├── build │ │ │ └── lib │ │ │ │ └── megatron │ │ │ │ ├── mpu │ │ │ │ ├── tests │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── commons.py │ │ │ │ │ ├── test_data.py │ │ │ │ │ └── test_initialize.py │ │ │ │ ├── __init__.py │ │ │ │ └── utils.py │ │ │ │ ├── data │ │ │ │ ├── __init__.py │ │ │ │ └── Makefile │ │ │ │ ├── tokenizer │ │ │ │ └── __init__.py │ │ │ │ ├── model │ │ │ │ ├── __init__.py │ │ │ │ ├── fused_bias_gelu.py │ │ │ │ └── utils.py │ │ │ │ ├── fp16 │ │ │ │ └── __init__.py │ │ │ │ ├── module.py │ │ │ │ ├── package_info.py │ │ │ │ ├── __init__.py │ │ │ │ └── deprecated_data_utils │ │ │ │ └── corpora.py │ │ ├── changes.md │ │ ├── requirements.txt │ │ ├── MANIFEST.in │ │ ├── images │ │ │ ├── cases.png │ │ │ ├── scaling-dp.png │ │ │ ├── scaling-mp.png │ │ │ ├── Makefile │ │ │ └── tables.tex │ │ ├── dist │ │ │ ├── megatron_lm-1.1.5-py3.6.egg │ │ │ └── megatron_lm-1.1.5-py3.7.egg │ │ ├── megatron │ │ │ ├── data │ │ │ │ ├── test │ │ │ │ │ └── test_preprocess_data.sh │ │ │ │ ├── Makefile │ │ │ │ └── __init__.py │ │ │ ├── mpu │ │ │ │ ├── tests │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── commons.py │ │ │ │ │ ├── test_data.py │ │ │ │ │ └── test_initialize.py │ │ │ │ ├── __init__.py │ │ │ │ └── utils.py │ │ │ ├── tokenizer │ │ │ │ └── __init__.py │ │ │ ├── model │ │ │ │ ├── __init__.py │ │ │ │ ├── fused_bias_gelu.py │ │ │ │ └── utils.py │ │ │ ├── fp16 │ │ │ │ └── __init__.py │ │ │ ├── module.py │ │ │ ├── package_info.py │ │ │ ├── __init__.py │ │ │ ├── deprecated_data_utils │ │ │ │ ├── scripts │ │ │ │ │ └── presplit_sentences_json.py │ │ │ │ └── corpora.py │ │ │ └── fused_kernels │ │ │ │ ├── scaled_upper_triang_masked_softmax.cpp │ │ │ │ ├── scaled_masked_softmax.cpp │ │ │ │ ├── scaled_upper_triang_masked_softmax_cuda.cu 
│ │ │ │ └── scaled_masked_softmax_cuda.cu │ │ ├── examples │ │ │ ├── ds_config.json │ │ │ ├── generate_text.sh │ │ │ ├── ds_zero_stage_2_config.json │ │ │ ├── merge_mp_bert.sh │ │ │ ├── pretrain_bert.sh │ │ │ ├── pretrain_gpt2.sh │ │ │ ├── evaluate_zeroshot_gpt2.sh │ │ │ ├── pretrain_bert_distributed.sh │ │ │ ├── pretrain_gpt2_distributed.sh │ │ │ ├── finetune_mnli_distributed.sh │ │ │ ├── finetune_race_distributed.sh │ │ │ ├── ds_pretrain_gpt2.sh │ │ │ └── ds_pretrain_gpt2_pipe.sh │ │ ├── tools │ │ │ ├── create_doc_index.py │ │ │ ├── linter.py │ │ │ └── openwebtext │ │ │ │ ├── merge_jsons.py │ │ │ │ ├── README.md │ │ │ │ ├── remove_group_duplicates.py │ │ │ │ ├── group_duplicates_url.py │ │ │ │ └── find_duplicates.py │ │ ├── tasks │ │ │ ├── race │ │ │ │ └── finetune.py │ │ │ ├── glue │ │ │ │ ├── data.py │ │ │ │ ├── finetune.py │ │ │ │ └── mnli.py │ │ │ ├── zeroshot_gpt2 │ │ │ │ └── detokenizer.py │ │ │ ├── main.py │ │ │ └── data_utils.py │ │ └── setup.py │ ├── __init__.py │ ├── ds_config.json │ └── util.py ├── __init__.py ├── utils │ └── __init__.py ├── eval_megatron_retrieval_controlled.sh └── eval_megatron_retrieval.sh ├── inference ├── cheminformatics │ ├── utils │ │ └── fpscores.pkl.gz │ ├── __init__.py │ └── common │ │ ├── cuchemcommon │ │ ├── __init__.py │ │ ├── data │ │ │ ├── helper │ │ │ │ └── __init__.py │ │ │ ├── generative_wf.py │ │ │ ├── __init__.py │ │ │ └── cluster_wf.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── singleton.py │ │ │ ├── sysinfo.py │ │ │ └── logger.py │ │ ├── context.py │ │ └── fingerprint.py │ │ └── grpc │ │ └── generativesampler.proto └── utils_inference │ ├── __init__.py │ └── LICENSE_RATIONALE ├── guacamol └── guacamol │ ├── __init__.py │ ├── guacamol │ ├── utils │ │ ├── __init__.py │ │ ├── math.py │ │ ├── helpers.py │ │ ├── fingerprints.py │ │ ├── descriptors.py │ │ └── sampling_helpers.py │ ├── __init__.py │ ├── distribution_matching_generator.py │ ├── LICENSE_GUACAMOL │ ├── goal_directed_generator.py │ ├── 
goal_directed_score_contributions.py │ └── assess_goal_directed_generation.py │ └── guacamol_baseline │ └── LICENSE_GUACAMOL_BASELINE ├── download_scripts ├── download_models.sh ├── download_results_reproduce.sh ├── download_data_exp.sh ├── download_data_ret_precompute.sh ├── download_data_cov.sh └── download_data_guacamol.sh └── LICENSE /assets/vis.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/RetMol/HEAD/assets/vis.jpg -------------------------------------------------------------------------------- /assets/model.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/RetMol/HEAD/assets/model.jpg -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron_lm.egg-info/not-zip-safe: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/changes.md: -------------------------------------------------------------------------------- 1 | PRETEND THESE ARE CODE CHANGES 2 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron_lm.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron_lm.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | megatron 2 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import indexed_dataset 2 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/requirements.txt: -------------------------------------------------------------------------------- 1 | pybind11 2 | torch 3 | six 4 | regex 5 | numpy 6 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include megatron/data/Makefile 2 | include megatron/data/helpers.cpp 3 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron_lm.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | pybind11 2 | torch 3 | six 4 | regex 5 | numpy 6 | -------------------------------------------------------------------------------- /inference/cheminformatics/utils/fpscores.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/RetMol/HEAD/inference/cheminformatics/utils/fpscores.pkl.gz -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/images/cases.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVlabs/RetMol/HEAD/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/images/cases.png -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/images/scaling-dp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/RetMol/HEAD/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/images/scaling-dp.png -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/images/scaling-mp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/RetMol/HEAD/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/images/scaling-mp.png -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/dist/megatron_lm-1.1.5-py3.6.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/RetMol/HEAD/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/dist/megatron_lm-1.1.5-py3.6.egg -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/dist/megatron_lm-1.1.5-py3.7.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/RetMol/HEAD/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/dist/megatron_lm-1.1.5-py3.7.egg -------------------------------------------------------------------------------- /MolBART/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Copyright (c) 2023, NVIDIA CORPORATION. 
All rights reserved. 3 | # 4 | # This work is licensed under the NSCL license 5 | # for RetMol. To view a copy of this license, see the LICENSE file. 6 | # --------------------------------------------------------------- -------------------------------------------------------------------------------- /MolBART/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # This work is licensed under the NSCL license 5 | # for RetMol. To view a copy of this license, see the LICENSE file. 6 | # --------------------------------------------------------------- -------------------------------------------------------------------------------- /guacamol/guacamol/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # This work is licensed under the NSCL license 5 | # for RetMol. To view a copy of this license, see the LICENSE file. 6 | # --------------------------------------------------------------- -------------------------------------------------------------------------------- /MolBART/megatron_molbart/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # This work is licensed under the NSCL license 5 | # for RetMol. To view a copy of this license, see the LICENSE file. 
6 | # --------------------------------------------------------------- -------------------------------------------------------------------------------- /inference/cheminformatics/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # This work is licensed under the NSCL license 5 | # for RetMol. To view a copy of this license, see the LICENSE file. 6 | # --------------------------------------------------------------- -------------------------------------------------------------------------------- /inference/utils_inference/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # This work is licensed under the NSCL license 5 | # for RetMol. To view a copy of this license, see the LICENSE file. 
6 | # --------------------------------------------------------------- -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | 
# Taken from the following link as is from: 3 | # https://github.com/BenevolentAI/guacamol/blob/master/guacamol/utils/__init__.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_GUACAMOL). 7 | # --------------------------------------------------------------- -------------------------------------------------------------------------------- /inference/cheminformatics/common/cuchemcommon/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/NVIDIA/cheminformatics/blob/master/common/cuchemcommon/__init__.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_CHEMINFORMATICS). 7 | # --------------------------------------------------------------- -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/BenevolentAI/guacamol/blob/master/guacamol/__init__.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_GUACAMOL). 
7 | # --------------------------------------------------------------- 8 | 9 | __version__ = "0.5.5" 10 | -------------------------------------------------------------------------------- /inference/cheminformatics/common/cuchemcommon/data/helper/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/NVIDIA/cheminformatics/blob/master/common/cuchemcommon/data/helper/__init__.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_CHEMINFORMATICS). 7 | # --------------------------------------------------------------- -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 256, 3 | "train_micro_batch_size_per_gpu": 4, 4 | "steps_per_print": 10, 5 | "gradient_clipping": 1.0, 6 | "fp16": { 7 | "enabled": true, 8 | 9 | "loss_scale": 0, 10 | "loss_scale_window": 1000, 11 | "hysteresis": 2, 12 | "min_loss_scale": 1 13 | }, 14 | "wall_clock_breakdown": true, 15 | "zero_allow_untested_optimizer": false 16 | } 17 | -------------------------------------------------------------------------------- /download_scripts/download_models.sh: -------------------------------------------------------------------------------- 1 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1O5u8b_n93HOrsjN1aezq6NhZojh-6dEe' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1O5u8b_n93HOrsjN1aezq6NhZojh-6dEe" -O models.zip && rm -rf /tmp/cookies.txt 2 | 3 | unzip 
models.zip -d ../ 4 | 5 | rm -r ../__MACOSX 6 | rm models.zip -------------------------------------------------------------------------------- /inference/cheminformatics/common/cuchemcommon/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/NVIDIA/cheminformatics/blob/master/common/cuchemcommon/utils/__init__.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_CHEMINFORMATICS). 7 | # --------------------------------------------------------------- 8 | 9 | from cuchemcommon.utils.singleton import Singleton -------------------------------------------------------------------------------- /download_scripts/download_results_reproduce.sh: -------------------------------------------------------------------------------- 1 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=16USnJttlMES1uPtRjJ7WNJ3UcXoTUhV1' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=16USnJttlMES1uPtRjJ7WNJ3UcXoTUhV1" -O results_reproduce.zip && rm -rf /tmp/cookies.txt 2 | 3 | unzip results_reproduce.zip -d ../results_reproduce 4 | 5 | rm results_reproduce.zip -------------------------------------------------------------------------------- /download_scripts/download_data_exp.sh: -------------------------------------------------------------------------------- 1 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Si5_yHdCGZNHQov99hPp8rOZx4DC6BY_' -O- | sed -rn 
's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1Si5_yHdCGZNHQov99hPp8rOZx4DC6BY_" -O data_exp.zip && rm -rf /tmp/cookies.txt 2 | 3 | mkdir -p ../data 4 | 5 | unzip data_exp.zip -d ../data/ 6 | 7 | rm -r ../data/__MACOSX 8 | rm data_exp.zip -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/images/Makefile: -------------------------------------------------------------------------------- 1 | default: cases.png scaling-mp.png scaling-dp.png 2 | 3 | # for some reason the size option to convert in scaling.tex doesn't work, manually do it after 4 | cases.png scaling-mp.png scaling-dp.png: tables.tex 5 | latex --shell-escape $< 6 | convert tables-1.png -resize 650 cases.png 7 | convert tables-2.png -resize 600 scaling-mp.png 8 | convert tables-3.png -resize 350 scaling-dp.png 9 | 10 | clean: 11 | rm -rf *.aux *.log *.dvi *.ps 12 | rm -rf tables-*.png 13 | -------------------------------------------------------------------------------- /download_scripts/download_data_ret_precompute.sh: -------------------------------------------------------------------------------- 1 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1nN1KEDPNuANJqZJwAOPMKwdHxMa6J7F3' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1nN1KEDPNuANJqZJwAOPMKwdHxMa6J7F3" -O data-retrieval-precompute.zip && rm -rf /tmp/cookies.txt 2 | 3 | unzip data-retrieval-precompute.zip -d ../data/ 4 | 5 | rm data-retrieval-precompute.zip -------------------------------------------------------------------------------- /download_scripts/download_data_cov.sh: -------------------------------------------------------------------------------- 1 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet 
--save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1J5KVdJc7SMPCSF0Y8feoYH1DyHI-YF7B' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1J5KVdJc7SMPCSF0Y8feoYH1DyHI-YF7B" -O data_cov.zip && rm -rf /tmp/cookies.txt 2 | 3 | mkdir -p ../data 4 | 5 | unzip data_cov.zip -d ../data/cov 6 | 7 | mv ../data/cov/data_cov/* ../data/cov/ 8 | rm -r ../data/cov/data_cov 9 | rm data_cov.zip 10 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
-------------------------------------------------------------------------------- /download_scripts/download_data_guacamol.sh: -------------------------------------------------------------------------------- 1 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1xQqKU0jMqiPCTUl_6yB-mxKqMFPco5zT' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1xQqKU0jMqiPCTUl_6yB-mxKqMFPco5zT" -O data_guacamol.zip && rm -rf /tmp/cookies.txt 2 | 3 | mkdir -p ../data/guacamol 4 | 5 | unzip data_guacamol.zip -d ../data/guacamol/ 6 | 7 | mv ../data/guacamol/data_guacamol/* ../data/guacamol/ 8 | mv ../data/guacamol/retrieval_database_guacamol ../data/guacamol/retrieval_database 9 | 10 | rm -r ../data/guacamol/data_guacamol 11 | rm -r ../data/guacamol/__MACOSX 12 | rm data_guacamol.zip -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from . 
import indexed_dataset 17 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | from .tokenizer import build_tokenizer 18 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | from .tokenizer import build_tokenizer 18 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/generate_text.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHECKPOINT_PATH=checkpoints/gpt2_345m 4 | VOCAB_FILE=gpt2-vocab.json 5 | MERGE_FILE=gpt2-merges.txt 6 | 7 | python tools/generate_samples_gpt2.py \ 8 | --model-parallel-size 1 \ 9 | --num-layers 24 \ 10 | --hidden-size 1024 \ 11 | --load $CHECKPOINT_PATH \ 12 | --num-attention-heads 16 \ 13 | --max-position-embeddings 1024 \ 14 | --tokenizer-type GPT2BPETokenizer \ 15 | --fp16 \ 16 | --batch-size 2 \ 17 | --seq-length 1024 \ 18 | --out-seq-length 1024 \ 19 | --temperature 1.0 \ 20 | --vocab-file $VOCAB_FILE \ 21 | --merge-file $MERGE_FILE \ 22 | --genfile unconditional_samples.json \ 23 | --num-samples 2 \ 24 | --top_p 0.9 \ 25 | --recompute 26 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/utils/math.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/BenevolentAI/guacamol/blob/master/guacamol/utils/math.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_GUACAMOL). 7 | # --------------------------------------------------------------- 8 | 9 | from typing import List 10 | 11 | import numpy as np 12 | 13 | 14 | def arithmetic_mean(values: List[float]) -> float: 15 | """ 16 | Computes the arithmetic mean of a list of values. 17 | """ 18 | return sum(values) / len(values) 19 | 20 | 21 | def geometric_mean(values: List[float]) -> float: 22 | """ 23 | Computes the geometric mean of a list of values. 
24 | """ 25 | a = np.array(values) 26 | return a.prod() ** (1.0 / len(a)) 27 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_zero_stage_2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 2048, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 1, 5 | "zero_optimization": { 6 | "stage": 2, 7 | "allgather_partitions": true, 8 | "reduce_scatter": true, 9 | "allgather_bucket_size": 50000000, 10 | "reduce_bucket_size": 50000000, 11 | "overlap_comm": true 12 | }, 13 | "optimizer": { 14 | "type": "Adam", 15 | "params": { 16 | "lr": 0.00015, 17 | "max_grad_norm": 1.0, 18 | "betas": [0.9, 0.95] 19 | } 20 | }, 21 | "gradient_clipping": 1.0, 22 | "fp16": { 23 | "enabled": true, 24 | 25 | "loss_scale": 0, 26 | "loss_scale_window": 1000, 27 | "hysteresis": 2, 28 | "min_loss_scale": 1 29 | }, 30 | "wall_clock_breakdown": true, 31 | "zero_allow_untested_optimizer": false 32 | } 33 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/utils/helpers.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/BenevolentAI/guacamol/blob/master/guacamol/utils/helpers.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_GUACAMOL). 7 | # --------------------------------------------------------------- 8 | 9 | import logging 10 | 11 | 12 | def setup_default_logger(): 13 | """ 14 | Call this function in your main function to initialize a basic logger. 15 | 16 | To have more control on the format or level, call `logging.basicConfig()` directly instead. 
17 | 18 | If you don't initialize any logger, log entries from the guacamol package will not appear anywhere. 19 | """ 20 | logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO) 21 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/merge_mp_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL_PARALLEL_SIZE=2 4 | 5 | VOCAB_FILE=bert-vocab.txt 6 | CHECKPOINT_PATH=checkpoints/bert_345m 7 | 8 | WORLD_SIZE=$MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \ 9 | --model-type BERT \ 10 | --model-parallel-size $MODEL_PARALLEL_SIZE \ 11 | --tokenizer-type BertWordPieceLowerCase \ 12 | --vocab-file $VOCAB_FILE \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --seq-length 512 \ 17 | --max-position-embeddings 512 \ 18 | --load $CHECKPOINT_PATH 19 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/distribution_matching_generator.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/BenevolentAI/guacamol/blob/master/guacamol/distribution_matching_generator.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_GUACAMOL). 7 | # --------------------------------------------------------------- 8 | 9 | from abc import ABCMeta, abstractmethod 10 | from typing import List 11 | 12 | 13 | class DistributionMatchingGenerator(metaclass=ABCMeta): 14 | """ 15 | Interface for molecule generators. 16 | """ 17 | 18 | @abstractmethod 19 | def generate(self, number_samples: int) -> List[str]: 20 | """ 21 | Samples SMILES strings from a molecule generator. 
22 | 23 | Args: 24 | number_samples: number of molecules to generate 25 | 26 | Returns: 27 | A list of SMILES strings. 28 | """ 29 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .distributed import * 17 | from .bert_model import BertModel 18 | from .realm_model import ICTBertModel 19 | from .gpt2_model import GPT2Model, GPT2ModelPipe 20 | from .utils import get_params_for_weight_decay_optimization 21 | from .language_model import get_language_model 22 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
class Singleton(type):
    """Metaclass that guarantees at most one instance per class.

    Example Usage:
        class MySingleton(metaclass=Singleton):
            pass
    """

    # Maps each class object to its unique, lazily created instance.
    _instances = {}

    def __call__(cls, *args, **kwargs):
        # Construct on first use only; afterwards always hand back the
        # cached object so every call site shares the same instance.
        instance = cls._instances.get(cls)
        if instance is None:
            instance = super().__call__(*args, **kwargs)
            cls._instances[cls] = instance
        return instance
7 | # --------------------------------------------------------------- 8 | 9 | import logging 10 | from typing import List 11 | 12 | from cuchemcommon.data.helper.chembldata import ChEmblData 13 | from cuchemcommon.utils.singleton import Singleton 14 | 15 | from . import GenerativeWfDao 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class ChemblGenerativeWfDao(GenerativeWfDao, metaclass=Singleton): 21 | 22 | def __init__(self, fp_type): 23 | self.chem_data = ChEmblData(fp_type) 24 | 25 | def fetch_id_from_chembl(self, id: List): 26 | logger.debug('Fetch ChEMBL ID using molregno...') 27 | return self.chem_data.fetch_id_from_chembl(id) 28 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/tools/create_doc_index.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('../') 3 | 4 | from megatron.indexer import IndexBuilder 5 | from megatron.initialize import initialize_megatron 6 | 7 | 8 | def main(): 9 | """Create a BlockData data structure by running an IndexBuilder over an ICT Dataset 10 | - Include all args needed for initial model specification 11 | 12 | Other key args: 13 | --block-data-path: path to write to 14 | --ict-load or --realm-load: path to checkpoint with which to embed 15 | --data-path and --titles-data-path: paths for dataset 16 | --indexer-log-interval: reporting interval 17 | --indexer-batch-size: size specific for indexer jobs 18 | 19 | Check README.md for example script 20 | """ 21 | 22 | initialize_megatron(extra_args_provider=None, 23 | args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) 24 | index_builder = IndexBuilder() 25 | index_builder.build_and_save_index() 26 | 27 | 28 | if __name__ == "__main__": 29 | main() 30 | 31 | -------------------------------------------------------------------------------- 
/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/pretrain_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANK=0 4 | WORLD_SIZE=1 5 | DATA_PATH=_text_sentence 6 | CHECKPOINT_PATH= 7 | 8 | python pretrain_bert.py \ 9 | --num-layers 24 \ 10 | --hidden-size 1024 \ 11 | --num-attention-heads 16 \ 12 | --batch-size 4 \ 13 | --seq-length 512 \ 14 | --max-position-embeddings 512 \ 15 | --train-iters 2000000 \ 16 | --save $CHECKPOINT_PATH \ 17 | --load $CHECKPOINT_PATH \ 18 | --data-path $DATA_PATH \ 19 | --vocab-file bert-vocab.txt \ 20 | --data-impl mmap \ 21 | --split 949,50,1 \ 22 | --distributed-backend nccl \ 23 | --lr 0.0001 \ 24 | --min-lr 0.00001 \ 25 | --lr-decay-style linear \ 26 | --lr-decay-iters 990000 \ 27 | --weight-decay 1e-2 \ 28 | --clip-grad 1.0 \ 29 | --warmup .01 \ 30 | --log-interval 100 \ 31 | --save-interval 10000 \ 32 | --eval-interval 1000 \ 33 | --eval-iters 10 \ 34 | --fp16 35 | 36 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/fp16/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from .fp16util import ( 16 | BN_convert_float, 17 | network_to_half, 18 | prep_param_lists, 19 | model_grads_to_master_grads, 20 | master_params_to_model_params, 21 | tofp16, 22 | to_python_float, 23 | clip_grad_norm, 24 | convert_module, 25 | convert_network, 26 | FP16Model, 27 | ) 28 | 29 | from .fp16 import * 30 | from .loss_scaler import * 31 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/fp16/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from .fp16util import ( 16 | BN_convert_float, 17 | network_to_half, 18 | prep_param_lists, 19 | model_grads_to_master_grads, 20 | master_params_to_model_params, 21 | tofp16, 22 | to_python_float, 23 | clip_grad_norm, 24 | convert_module, 25 | convert_network, 26 | FP16Model, 27 | ) 28 | 29 | from .fp16 import * 30 | from .loss_scaler import * 31 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/LICENSE_GUACAMOL: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 BenevolentAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol_baseline/LICENSE_GUACAMOL_BASELINE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 BenevolentAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /inference/utils_inference/LICENSE_RATIONALE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Wengong Jin, Regina Barzilay, Tommi Jaakkola 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/pretrain_gpt2.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | DATA_PATH=_text_document 9 | CHECKPOINT_PATH= 10 | 11 | 12 | python pretrain_gpt2.py \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --batch-size 8 \ 17 | --seq-length 1024 \ 18 | --max-position-embeddings 1024 \ 19 | --train-iters 500000 \ 20 | --lr-decay-iters 320000 \ 21 | --save $CHECKPOINT_PATH \ 22 | --load $CHECKPOINT_PATH \ 23 | --data-path $DATA_PATH \ 24 | --vocab-file gpt2-vocab.json \ 25 | --merge-file gpt2-merges.txt \ 26 | --data-impl mmap \ 27 | --split 949,50,1 \ 28 | --distributed-backend nccl \ 29 | --lr 0.00015 \ 30 | --min-lr 1.0e-5 \ 31 | --lr-decay-style cosine \ 32 | --weight-decay 1e-2 \ 33 | --clip-grad 1.0 \ 34 | --warmup .01 \ 35 | --checkpoint-activations \ 36 | --log-interval 100 \ 37 | --save-interval 10000 \ 38 | --eval-interval 1000 \ 39 | --eval-iters 10 \ 40 | --fp16 41 | 42 | 43 | set +x 44 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/module.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Megatron Module""" 17 | 18 | import torch 19 | 20 | 21 | class MegatronModule(torch.nn.Module): 22 | """Megatron specific extentions of torch Module.""" 23 | 24 | def __init__(self): 25 | super(MegatronModule, self).__init__() 26 | 27 | def state_dict_for_save_checkpoint(self, destination=None, prefix='', 28 | keep_vars=False): 29 | """Use this function to override the state dict for 30 | saving checkpoints.""" 31 | return self.state_dict(destination, prefix, keep_vars) 32 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/module.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Megatron Module""" 17 | 18 | import torch 19 | 20 | 21 | class MegatronModule(torch.nn.Module): 22 | """Megatron specific extentions of torch Module.""" 23 | 24 | def __init__(self): 25 | super(MegatronModule, self).__init__() 26 | 27 | def state_dict_for_save_checkpoint(self, destination=None, prefix='', 28 | keep_vars=False): 29 | """Use this function to override the state dict for 30 | saving checkpoints.""" 31 | return self.state_dict(destination, prefix, keep_vars) 32 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/evaluate_zeroshot_gpt2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TASK="LAMBADA" 12 | 13 | VALID_DATA= 14 | VOCAB_FILE=gpt2-vocab.json 15 | MERGE_FILE=gpt2-merges.txt 16 | CHECKPOINT=checkpoints/gpt2_345m 17 | 18 | 19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 20 | --task $TASK \ 21 | --valid-data $VALID_DATA \ 22 | --tokenizer-type GPT2BPETokenizer \ 23 | --strict-lambada \ 24 | --vocab-file $VOCAB_FILE \ 25 | --merge-file $MERGE_FILE \ 26 | --load $CHECKPOINT \ 27 | --model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --batch-size 8 \ 32 | --checkpoint-activations \ 33 | --seq-length 1024 \ 34 | --max-position-embeddings 1024 \ 35 | --log-interval 10 \ 36 | --fp16 \ 37 | --no-load-optim \ 38 | --no-load-rng 39 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/package_info.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA 
CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | MAJOR = 1 17 | MINOR = 1.5 18 | 19 | # Use the following formatting: (major, minor) 20 | VERSION = (MAJOR, MINOR) 21 | 22 | __version__ = '.'.join(map(str, VERSION)) 23 | __package_name__ = 'megatron-lm' 24 | __contact_names__ = 'NVIDIA INC' 25 | __url__ = 'https://github.com/NVIDIA/Megatron-LM' 26 | __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases' 27 | __description__ = 'Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism.' 28 | __license__ = 'See https://github.com/NVIDIA/Megatron-LM/blob/master/LICENSE' 29 | __keywords__ = 'deep learning, Megatron, gpu, NLP, nvidia, pytorch, torch, language' 30 | 31 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/package_info.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
def recursively_lint_files():
    """Recursively lint all python files in chosen subdirectories of megatron-lm.

    Runs ``autopep8 --aggressive --in-place`` over every ``*.py`` file found
    directly in the repository root and under a fixed list of subdirectories.
    Prints an install hint and returns early if autopep8 is not installed.
    """

    try:
        import autopep8  # noqa: F401 -- imported only to verify availability
    except ModuleNotFoundError:
        print("Please first install autopep8 via `pip install autopep8`")
        return

    # get all python file paths from top level directory
    # (the repository root is the parent of the directory holding this script)
    file_dir = str(pathlib.Path(__file__).parent.absolute())
    working_dir = osp.join(file_dir, os.pardir)
    # Use endswith('.py'): the previous `".py" in fname` test also matched
    # .pyc/.pyx files and names like "foo.py.orig".
    all_py_paths = set(os.path.join(working_dir, fname)
                       for fname in os.listdir(working_dir)
                       if fname.endswith('.py'))

    # get all python file paths from chosen subdirectories
    check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks']
    for sub_dir in check_dirs:
        for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)):
            all_py_paths.update(osp.join(path, fname)
                                for fname in fnames if fname.endswith('.py'))

    print("Linting the following: ")
    for py_path in all_py_paths:
        print(py_path)
        # Pass an argv list: the previous single command string without
        # shell=True raised FileNotFoundError (the whole string was treated
        # as the executable name). A list also avoids shell quoting issues
        # in file paths.
        command = ['autopep8', '--max-line-length', '100',
                   '--aggressive', '--in-place', py_path]
        subprocess.check_call(command)


if __name__ == "__main__":
    recursively_lint_files()
# ---------------------------------------------------------------
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# This file has been modified from MolecularAI/MolBART
#
# Source:
# https://github.com/MolecularAI/MolBART/blob/master/molbart/util.py
#
# The license for the original version of this file can be
# found in this directory (LICENSE_MOLBART).
# The modifications to this file are subject to the same license.
# ---------------------------------------------------------------

# coding=utf-8

import os

# Project root; all model asset paths below are resolved relative to it.
# Raises KeyError at import time if PROJECT_HOME is unset -- intentional
# fail-fast rather than producing dangling paths later.
project_home = os.environ['PROJECT_HOME']

root = project_home
DEFAULT_VOCAB_PATH = os.path.join(root, 'models/megamolbart/bart_vocab.txt')
CHECKPOINTS_DIR = os.path.join(root, 'models/megamolbart/checkpoints')

# Tokenization and vocabulary
DEFAULT_MAX_SEQ_LEN = 512
DEFAULT_CHEM_TOKEN_START = 272
DEFAULT_BEGIN_TOKEN = "^"
DEFAULT_END_TOKEN = "&"
# NOTE(review): the following three tokens are empty strings here, while
# upstream MolBART uses distinct sentinel tokens (angle-bracket style);
# the brackets may have been stripped in transit -- confirm intended values.
DEFAULT_PAD_TOKEN = ""
DEFAULT_UNK_TOKEN = "?"
DEFAULT_MASK_TOKEN = ""
DEFAULT_SEP_TOKEN = ""
DEFAULT_MASK_PROB = 0.15
DEFAULT_SHOW_MASK_TOKEN_PROB = 1.0
DEFAULT_MASK_SCHEME = "span"
DEFAULT_SPAN_LAMBDA = 3.0
# SMILES tokenization pattern (brackets, two-letter elements, bonds, ring
# closures, digits). Raw string: the previous non-raw literal relied on
# Python preserving invalid escape sequences such as \[ and \( -- a
# DeprecationWarning since 3.6 and a SyntaxWarning in 3.12. The pattern
# value is unchanged ("\\\\" in the old literal is "\\" here).
REGEX = r"\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9]"

# Model parameters
DEFAULT_D_MODEL = 256
DEFAULT_NUM_LAYERS = 4
DEFAULT_NUM_HEADS = 8
/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import torch 16 | 17 | from .package_info import ( 18 | __description__, 19 | __contact_names__, 20 | __url__, 21 | __download_url__, 22 | __keywords__, 23 | __license__, 24 | __package_name__, 25 | __version__, 26 | ) 27 | 28 | from .global_vars import get_args 29 | from .global_vars import get_tokenizer 30 | from .global_vars import get_tensorboard_writer 31 | from .global_vars import get_adlr_autoresume 32 | from .global_vars import get_timers 33 | from .initialize import initialize_megatron 34 | 35 | def print_rank_0(message): 36 | """If distributed is initialized print only on rank 0.""" 37 | if torch.distributed.is_initialized(): 38 | if torch.distributed.get_rank() == 0: 39 | print(message, flush=True) 40 | else: 41 | print(message, flush=True) 42 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import torch 16 | 17 | from .package_info import ( 18 | __description__, 19 | __contact_names__, 20 | __url__, 21 | __download_url__, 22 | __keywords__, 23 | __license__, 24 | __package_name__, 25 | __version__, 26 | ) 27 | 28 | from .global_vars import get_args 29 | from .global_vars import get_tokenizer 30 | from .global_vars import get_tensorboard_writer 31 | from .global_vars import get_adlr_autoresume 32 | from .global_vars import get_timers 33 | from .initialize import initialize_megatron 34 | 35 | def print_rank_0(message): 36 | """If distributed is initialized print only on rank 0.""" 37 | if torch.distributed.is_initialized(): 38 | if torch.distributed.get_rank() == 0: 39 | print(message, flush=True) 40 | else: 41 | print(message, flush=True) 42 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/scripts/presplit_sentences_json.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """ 17 | Usage: 18 | python scripts/presplit_sentences_json.py 19 | """ 20 | 21 | import sys 22 | import json 23 | 24 | import nltk 25 | 26 | nltk.download('punkt') 27 | 28 | input_file = sys.argv[1] 29 | output_file = sys.argv[2] 30 | 31 | line_seperator = "\n" 32 | 33 | with open(input_file, 'r') as ifile: 34 | with open(output_file, "w") as ofile: 35 | for doc in ifile.readlines(): 36 | parsed = json.loads(doc) 37 | sent_list = [] 38 | for line in parsed['text'].split('\n'): 39 | if line != '\n': 40 | sent_list.extend(nltk.tokenize.sent_tokenize(line)) 41 | parsed['text'] = line_seperator.join(sent_list) 42 | ofile.write(json.dumps(parsed) + '\n') 43 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/pretrain_gpt2_distributed.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | DATA_PATH=_text_document 14 | CHECKPOINT_PATH= 15 | 16 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 19 | pretrain_gpt2.py \ 20 | --model-parallel-size 1 \ 21 | --num-layers 24 \ 22 | --hidden-size 1024 \ 23 | --num-attention-heads 16 \ 24 | --batch-size 8 \ 25 | --seq-length 1024 \ 26 | --max-position-embeddings 1024 \ 27 | --train-iters 500000 \ 28 | --lr-decay-iters 320000 \ 29 | --save $CHECKPOINT_PATH \ 30 | --load $CHECKPOINT_PATH \ 31 | --data-path $DATA_PATH \ 32 | --vocab-file gpt2-vocab.json \ 33 | --merge-file gpt2-merges.txt \ 34 | --data-impl mmap \ 35 | --split 949,50,1 \ 36 | --distributed-backend nccl \ 37 | --lr 0.00015 \ 38 | --lr-decay-style cosine \ 39 | --min-lr 1.0e-5 \ 40 | --weight-decay 1e-2 \ 41 | --clip-grad 1.0 \ 42 | --warmup .01 \ 43 | --checkpoint-activations \ 44 | --log-interval 100 \ 45 | --save-interval 10000 \ 46 | --eval-interval 1000 \ 47 | --eval-iters 10 \ 48 | --fp16 49 | 50 | 51 | 52 | set +x 53 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/images/tables.tex: -------------------------------------------------------------------------------- 1 | \documentclass[multi,convert]{standalone} 2 | \usepackage{multirow} 3 | \standaloneenv{tabular} 4 | 5 | \begin{document} 6 | 7 | \begin{tabular}{cccccc} 8 | Case & Hidden Size & Attention Heads & Layers & Parameters (billions) & Model Parallel Partitions \\ 9 | \hline 10 | 1B & 1920 & 15 & 24 & 1.16 & 1 \\ 11 | 2B & 2304 & 18 & 30 & 2.03 & 2 \\ 12 | 4B & 3072 & 24 & 36 & 4.24 & 4 \\ 13 | 8B 
& 4096 & 32 & 42 & 8.67 & 8 \\ 14 | \end{tabular} 15 | 16 | \begin{tabular}{cc|ccc|ccc} 17 | & & \multicolumn{3}{c|}{\textbf{DGX-2 (V100) batch size 8}} & \multicolumn{3}{c}{\textbf{DGX-A100 batch size 16}} \\ 18 | \hline 19 | \multirow{2}{*}{Case} & Number of & Iteration & \multirow{2}{*}{Scaling} & TeraFLOPs & Iteration & \multirow{2}{*}{Scaling} & TeraFLOPs \\ 20 | & GPUs & Time (ms) & & per GPU & Time (ms) & & per GPU \\ 21 | \hline 22 | 1B & 1 & 1121 & 100.0\% & 71.9 & 1076 & 100\% & 149.8 \\ 23 | 2B & 2 & 1093 & 89.6\% & 64.2 & 1026 & 91.7\% & 136.8 \\ 24 | 4B & 4 & 1238 & 82.5\% & 58.5 & 1162 & 84.5\% & 124.7 \\ 25 | 8B & 8 & 1407 & 74.3\% & 52.2 & 1343 & 74.7\% & 109.3 \\ 26 | \end{tabular} 27 | 28 | \begin{tabular}{cc|ccc} 29 | & & \multicolumn{3}{c}{\textbf{DGX-A100 batch size 2048}} \\ 30 | \hline 31 | \multirow{2}{*}{Case} & Number of & Iteration & \multirow{2}{*}{Scaling} & TeraFLOPs \\ 32 | & GPUs & Time (ms) & & per GPU \\ 33 | \hline 34 | 1B & 128 & 1153 & 93.3\% & 139.8 \\ 35 | 2B & 256 & 1101 & 85.5\% & 127.5 \\ 36 | 4B & 512 & 1242 & 79.0\% & 116.7 \\ 37 | 8B & 1024 & 1380 & 72.7\% & 106.5 \\ 38 | \end{tabular} 39 | 40 | \end{document} 41 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/finetune_mnli_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/glue_data/MNLI/train.tsv" 12 | VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \ 13 | data/glue_data/MNLI/dev_mismatched.tsv" 14 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 15 | VOCAB_FILE=bert-vocab.txt 16 | CHECKPOINT_PATH=checkpoints/bert_345m_mnli 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task 
MNLI \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA \ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 5 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --batch-size 8 \ 32 | --checkpoint-activations \ 33 | --lr 5.0e-5 \ 34 | --lr-decay-style linear \ 35 | --warmup 0.065 \ 36 | --seq-length 512 \ 37 | --max-position-embeddings 512 \ 38 | --save-interval 500000 \ 39 | --save $CHECKPOINT_PATH \ 40 | --log-interval 10 \ 41 | --eval-interval 100 \ 42 | --eval-iters 50 \ 43 | --weight-decay 1.0e-1 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/finetune_race_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/RACE/train/middle" 12 | VALID_DATA="data/RACE/dev/middle \ 13 | data/RACE/dev/high" 14 | VOCAB_FILE=bert-vocab.txt 15 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 16 | CHECKPOINT_PATH=checkpoints/bert_345m_race 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task RACE \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA \ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 3 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --batch-size 4 \ 32 | --checkpoint-activations \ 33 | --lr 1.0e-5 \ 34 | --lr-decay-style linear \ 35 | --warmup 0.06 \ 36 | --seq-length 512 
\ 37 | --max-position-embeddings 512 \ 38 | --save-interval 100000 \ 39 | --save $CHECKPOINT_PATH \ 40 | --log-interval 10 \ 41 | --eval-interval 100 \ 42 | --eval-iters 50 \ 43 | --weight-decay 1.0e-1 \ 44 | --clip-grad 1.0 \ 45 | --hidden-dropout 0.1 \ 46 | --attention-dropout 0.1 \ 47 | --fp16 48 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/tools/openwebtext/merge_jsons.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | import glob 18 | import sys 19 | import json 20 | import argparse 21 | 22 | if __name__ == '__main__': 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--json_path", type=str, default=".", 26 | help="path where all the json files are located") 27 | 28 | parser.add_argument("--output_file", type=str, default="merged_output.json", 29 | help="filename where the merged json should go") 30 | 31 | args = parser.parse_args() 32 | 33 | json_path = args.json_path 34 | out_file = args.output_file 35 | 36 | json_files = glob.glob(json_path + '/*.json') 37 | 38 | counter = 0 39 | 40 | with open(out_file, 'w') as outfile: 41 | for fname in json_files: 42 | counter += 1 43 | 44 | if counter % 1024 == 0: 45 | print("Merging at ", counter, flush=True) 46 | 47 | with open(fname, 'r') as infile: 48 | for row in infile: 49 | each_row = json.loads(row) 50 | outfile.write(row) 51 | 52 | 53 | print("Merged file", out_file, flush=True) 54 | 55 | 56 | -------------------------------------------------------------------------------- /inference/cheminformatics/common/cuchemcommon/data/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/NVIDIA/cheminformatics/blob/master/common/cuchemcommon/data/__init__.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_CHEMINFORMATICS). 7 | # --------------------------------------------------------------- 8 | 9 | 10 | from typing import List 11 | 12 | 13 | class ClusterWfDAO(object): 14 | """ 15 | Base class for all DAO for fetching data for Clustering Workflows 16 | """ 17 | 18 | def meta_df(self): 19 | """ 20 | Returns df with dtype set for structure without any column filter. 
21 | """ 22 | return NotImplemented 23 | 24 | def fetch_molecular_embedding(self, n_molecules: int, cache_directory: str = None): 25 | """ 26 | Fetch molecular properties from database/cache into a dask array. 27 | """ 28 | return NotImplemented 29 | 30 | def fetch_molecular_embedding_by_id(self, molecule_id: List): 31 | """ 32 | Fetch molecular properties from database for the given id. Id depends on 33 | the backend databse. For chemble DB it should be molregid. 34 | """ 35 | return NotImplemented 36 | 37 | def fetch_id_from_smile(self, new_molecules: List): 38 | """ 39 | Fetch molecular details for a list of molecules. The values in the list 40 | of molecules depends on database/service used. For e.g. it could be 41 | ChemblId or molreg_id for Chemble database. 42 | """ 43 | return NotImplemented 44 | 45 | 46 | class GenerativeWfDao(object): 47 | 48 | def fetch_id_from_chembl(self, id: List): 49 | """ 50 | Fetch molecular details for a list of molecules. The values in the list 51 | of molecules depends on database/service used. For e.g. it could be 52 | ChemblId or molreg_id for Chemble database. 53 | """ 54 | return NotImplemented 55 | -------------------------------------------------------------------------------- /inference/cheminformatics/common/cuchemcommon/context.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/NVIDIA/cheminformatics/blob/master/common/cuchemcommon/context.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_CHEMINFORMATICS). 
7 | # --------------------------------------------------------------- 8 | 9 | import logging 10 | import os 11 | from configparser import RawConfigParser 12 | from io import StringIO 13 | 14 | from cuchemcommon.utils.singleton import Singleton 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | CONFIG_FILE = '.env' 19 | 20 | 21 | class Context(metaclass=Singleton): 22 | 23 | def __init__(self): 24 | 25 | self.dask_client = None 26 | self.compute_type = 'gpu' 27 | self.is_benchmark = False 28 | self.benchmark_file = None 29 | self.cache_directory = None 30 | self.n_molecule = None 31 | self.batch_size = 10000 32 | 33 | self.config = {} 34 | if os.path.exists(CONFIG_FILE): 35 | logger.info('Reading properties from %s...', CONFIG_FILE) 36 | self.config = self._load_properties_file(CONFIG_FILE) 37 | else: 38 | logger.warn('Could not locate %s', CONFIG_FILE) 39 | 40 | def _load_properties_file(self, properties_file): 41 | """ 42 | Reads a properties file using ConfigParser. 43 | 44 | :param propertiesFile/configFile: 45 | """ 46 | config_file = open(properties_file, 'r') 47 | config_content = StringIO('[root]\n' + config_file.read()) 48 | config = RawConfigParser() 49 | config.read_file(config_content) 50 | 51 | return config._sections['root'] 52 | 53 | def get_config(self, config_name, default=None): 54 | """ 55 | Returns values from local configuration. 56 | """ 57 | try: 58 | return self.config[config_name] 59 | except KeyError: 60 | logger.warn('%s not found, returing default.', config_name) 61 | return default 62 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/goal_directed_generator.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # This file has been modified from guacamol benchmark. 
5 | # 6 | # Source: 7 | # https://github.com/BenevolentAI/guacamol/blob/master/guacamol/goal_directed_generator.py 8 | # 9 | # The license for the original version of this file can be 10 | # found in this directory (LICENSE_GUACAMOL). 11 | # The modifications to this file are subject to the same license. 12 | # --------------------------------------------------------------- 13 | import os 14 | import sys 15 | from abc import ABCMeta, abstractmethod 16 | from typing import List, Optional 17 | import pytorch_lightning as pl 18 | from guacamol.guacamol.scoring_function import ScoringFunction 19 | 20 | project_home = os.environ['PROJECT_HOME'] 21 | sys.path.insert(1, project_home + '/inference') 22 | from inference import MegaMolBART 23 | 24 | 25 | class GoalDirectedGenerator(metaclass=ABCMeta): 26 | """ 27 | Interface for goal-directed molecule generators. 28 | """ 29 | 30 | def __init__(self, model_path, ret_data_path, 31 | model_ckpt_itr=50000, max_mol_len=200): 32 | ''' 33 | my defined initialization 34 | ''' 35 | self.wf = MegaMolBART(model_path=model_path, model_ckpt_itr=model_ckpt_itr, decoder_max_seq_len=max_mol_len) 36 | self.ret_dataset = None 37 | 38 | 39 | @abstractmethod 40 | def generate_optimized_molecules(self, scoring_function: ScoringFunction, number_molecules: int, 41 | starting_population: Optional[List[str]], benchmark_name: str): 42 | """ 43 | Given an objective function, generate molecules that score as high as possible. 44 | 45 | Args: 46 | scoring_function: scoring function 47 | number_molecules: number of molecules to generate 48 | starting_population: molecules to start the optimization from (optional) 49 | benchmark_name: benchmark name 50 | 51 | Returns: 52 | A list of SMILES strings for the generated molecules. 
53 | """ 54 | pl.utilities.seed.seed_everything(1234) 55 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/utils/fingerprints.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/BenevolentAI/guacamol/blob/master/guacamol/utils/fingerprints.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_GUACAMOL). 7 | # --------------------------------------------------------------- 8 | 9 | from rdkit.Chem import AllChem, Mol 10 | from rdkit.Chem.AtomPairs.Sheridan import GetBPFingerprint, GetBTFingerprint 11 | from rdkit.Chem.Pharm2D import Generate, Gobbi_Pharm2D 12 | 13 | 14 | class _FingerprintCalculator: 15 | """ 16 | Calculate the fingerprint while avoiding a series of if-else. 17 | See recipe 8.21 of the book "Python Cookbook". 18 | 19 | To support a new type of fingerprint, just add a function "get_fpname(self, mol)". 
20 | """ 21 | 22 | def get_fingerprint(self, mol: Mol, fp_type: str): 23 | method_name = 'get_' + fp_type 24 | method = getattr(self, method_name) 25 | if method is None: 26 | raise Exception(f'{fp_type} is not a supported fingerprint type.') 27 | return method(mol) 28 | 29 | def get_AP(self, mol: Mol): 30 | return AllChem.GetAtomPairFingerprint(mol, maxLength=10) 31 | 32 | def get_PHCO(self, mol: Mol): 33 | return Generate.Gen2DFingerprint(mol, Gobbi_Pharm2D.factory) 34 | 35 | def get_BPF(self, mol: Mol): 36 | return GetBPFingerprint(mol) 37 | 38 | def get_BTF(self, mol: Mol): 39 | return GetBTFingerprint(mol) 40 | 41 | def get_PATH(self, mol: Mol): 42 | return AllChem.RDKFingerprint(mol) 43 | 44 | def get_ECFP4(self, mol: Mol): 45 | return AllChem.GetMorganFingerprint(mol, 2) 46 | 47 | def get_ECFP6(self, mol: Mol): 48 | return AllChem.GetMorganFingerprint(mol, 3) 49 | 50 | def get_FCFP4(self, mol: Mol): 51 | return AllChem.GetMorganFingerprint(mol, 2, useFeatures=True) 52 | 53 | def get_FCFP6(self, mol: Mol): 54 | return AllChem.GetMorganFingerprint(mol, 3, useFeatures=True) 55 | 56 | 57 | def get_fingerprint(mol: Mol, fp_type: str): 58 | return _FingerprintCalculator().get_fingerprint(mol=mol, fp_type=fp_type) 59 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/tools/openwebtext/README.md: -------------------------------------------------------------------------------- 1 | The following steps show how to prepare training dataset to train the mode. 2 | 3 | # Libraries to install 4 | 5 | ``` 6 | pip install ftfy langdetect numpy torch pandas nltk sentencepiece boto3 tqdm regex bs4 newspaper3k htmlmin tldextract 7 | git clone https://github.com/mattilyra/LSH 8 | cd LSH 9 | python setup.py install 10 | ``` 11 | 12 | # Download the dataset 13 | 14 | 1. Download the deduplicated URLs from [jcpeterson](https://mega.nz/#F!EZZD0YwJ!9_PlEQzdMVLaNdKv_ICNVQ!cc4RgQQZ) 15 | 2. 
Remove blacklisted URLs. 16 | ``` 17 | python blacklist_urls.py 18 | ``` 19 | 3. Download the content from the clean urls with [openwebtext's utilities](https://github.com/eukaryote31/openwebtext/blob/master/download.py). 20 | 21 | 4. Merge the contents into one loose json file with 1 json per newline of the format `{'text': text, 'url': unique_url}`. It is important for the url to be unique. 22 | 23 | # Prepare the data for GPT-2 training: 24 | 25 | 1. Perform ftfy, English detection and remove documents with less than 128 tokens. This step can be sharded and run on shards. 26 | ``` 27 | python cleanup_dataset.py 28 | ``` 29 | 2. Using LSH, find possible duplicates and store them in a file for later processing. This step can NOT be sharded and usually takes 12 to 24 hours for OpenWebText dataset. 30 | ``` 31 | python find_duplicates.py 32 | ``` 33 | 3. Based on the similarity measure defined inside function `is_similar` (default: 0.9), group urls that are similar. Basically, for each group, we should keep only one url and remove the rest. 34 | ``` 35 | python group_duplicate_urls.py 36 | ``` 37 | 4. Remove similar documents that were detected in the last step. 38 | ``` 39 | python remove_group_duplicates.py 40 | ``` 41 | 42 | 5. Shuffle the dataset. 43 | ``` 44 | shuf -o train_data.json 45 | ``` 46 | 47 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/corpora.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """several datasets with preset arguments""" 16 | from .datasets import json_dataset, csv_dataset 17 | import os 18 | 19 | 20 | class wikipedia(json_dataset): 21 | """ 22 | dataset for wikipedia with arguments configured for convenience 23 | 24 | command line usage: `--train-data wikipedia` 25 | """ 26 | PATH = 'data/wikipedia/wikidump_lines.json' 27 | assert_str = "make sure to set PATH for wikipedia data_utils/corpora.py" 28 | 29 | def __init__(self, **kwargs): 30 | assert os.path.exists(wikipedia.PATH), \ 31 | wikipedia.assert_str 32 | if not kwargs: 33 | kwargs = {} 34 | kwargs['text_key'] = 'text' 35 | kwargs['loose_json'] = True 36 | super(wikipedia, self).__init__(wikipedia.PATH, **kwargs) 37 | 38 | 39 | class webtext(json_dataset): 40 | """ 41 | dataset for webtext with arguments configured for convenience 42 | 43 | command line usage: `--train-data webtext` 44 | """ 45 | PATH = 'data/webtext/data.json' 46 | assert_str = "make sure to set PATH for webtext data_utils/corpora.py" 47 | 48 | def __init__(self, **kwargs): 49 | assert os.path.exists(webtext.PATH), \ 50 | webtext.assert_str 51 | if not kwargs: 52 | kwargs = {} 53 | kwargs['text_key'] = 'text' 54 | kwargs['loose_json'] = True 55 | super(webtext, self).__init__(webtext.PATH, **kwargs) 56 | 57 | 58 | NAMED_CORPORA = { 59 | 'wikipedia': wikipedia, 60 | 'webtext': webtext, 61 | } 62 | -------------------------------------------------------------------------------- 
/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/deprecated_data_utils/corpora.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """several datasets with preset arguments""" 16 | from .datasets import json_dataset, csv_dataset 17 | import os 18 | 19 | 20 | class wikipedia(json_dataset): 21 | """ 22 | dataset for wikipedia with arguments configured for convenience 23 | 24 | command line usage: `--train-data wikipedia` 25 | """ 26 | PATH = 'data/wikipedia/wikidump_lines.json' 27 | assert_str = "make sure to set PATH for wikipedia data_utils/corpora.py" 28 | 29 | def __init__(self, **kwargs): 30 | assert os.path.exists(wikipedia.PATH), \ 31 | wikipedia.assert_str 32 | if not kwargs: 33 | kwargs = {} 34 | kwargs['text_key'] = 'text' 35 | kwargs['loose_json'] = True 36 | super(wikipedia, self).__init__(wikipedia.PATH, **kwargs) 37 | 38 | 39 | class webtext(json_dataset): 40 | """ 41 | dataset for webtext with arguments configured for convenience 42 | 43 | command line usage: `--train-data webtext` 44 | """ 45 | PATH = 'data/webtext/data.json' 46 | assert_str = "make sure to set PATH for webtext data_utils/corpora.py" 47 | 48 | def __init__(self, **kwargs): 49 | assert os.path.exists(webtext.PATH), \ 50 | webtext.assert_str 51 | if not kwargs: 52 | 
kwargs = {} 53 | kwargs['text_key'] = 'text' 54 | kwargs['loose_json'] = True 55 | super(webtext, self).__init__(webtext.PATH, **kwargs) 56 | 57 | 58 | NAMED_CORPORA = { 59 | 'wikipedia': wikipedia, 60 | 'webtext': webtext, 61 | } 62 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/tasks/race/finetune.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Race.""" 17 | 18 | from megatron import get_args 19 | from megatron import print_rank_0 20 | from megatron import get_tokenizer 21 | from megatron.model.multiple_choice import MultipleChoice 22 | from tasks.eval_utils import accuracy_func_provider 23 | from tasks.finetune_utils import finetune 24 | from tasks.race.data import RaceDataset 25 | 26 | 27 | def train_valid_datasets_provider(): 28 | """Provide train and validation datasets.""" 29 | args = get_args() 30 | tokenizer = get_tokenizer() 31 | 32 | train_dataset = RaceDataset('training', args.train_data, 33 | tokenizer, args.seq_length) 34 | valid_dataset = RaceDataset('validation', args.valid_data, 35 | tokenizer, args.seq_length) 36 | 37 | return train_dataset, valid_dataset 38 | 39 | 40 | def model_provider(): 41 | """Build the model.""" 42 | 43 | print_rank_0('building multichoice model for RACE ...') 44 | 45 | return MultipleChoice(num_tokentypes=2) 46 | 47 | 48 | def metrics_func_provider(): 49 | """Privde metrics callback function.""" 50 | args = get_args() 51 | tokenizer = get_tokenizer() 52 | 53 | def single_dataset_provider(datapath): 54 | name = datapath.split('RACE')[-1].strip('/').replace('/', '-') 55 | return RaceDataset(name, [datapath], tokenizer, args.seq_length) 56 | 57 | return accuracy_func_provider(single_dataset_provider) 58 | 59 | 60 | def main(): 61 | 62 | finetune(train_valid_datasets_provider, model_provider, 63 | end_of_epoch_callback_provider=metrics_func_provider) 64 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/utils/descriptors.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/BenevolentAI/guacamol/blob/master/guacamol/utils/descriptors.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this 
directory (LICENSE_GUACAMOL). 7 | # --------------------------------------------------------------- 8 | 9 | from rdkit import Chem 10 | from rdkit.Chem import Descriptors, Mol, rdMolDescriptors 11 | 12 | 13 | def logP(mol: Mol) -> float: 14 | return Descriptors.MolLogP(mol) 15 | 16 | 17 | def qed(mol: Mol) -> float: 18 | return Descriptors.qed(mol) 19 | 20 | 21 | def tpsa(mol: Mol) -> float: 22 | return Descriptors.TPSA(mol) 23 | 24 | 25 | def bertz(mol: Mol) -> float: 26 | return Descriptors.BertzCT(mol) 27 | 28 | 29 | def mol_weight(mol: Mol) -> float: 30 | return Descriptors.MolWt(mol) 31 | 32 | 33 | def num_H_donors(mol: Mol) -> int: 34 | return Descriptors.NumHDonors(mol) 35 | 36 | 37 | def num_H_acceptors(mol: Mol) -> int: 38 | return Descriptors.NumHAcceptors(mol) 39 | 40 | 41 | def num_rotatable_bonds(mol: Mol) -> int: 42 | return Descriptors.NumRotatableBonds(mol) 43 | 44 | 45 | def num_rings(mol: Mol) -> int: 46 | return rdMolDescriptors.CalcNumRings(mol) 47 | 48 | 49 | def num_aromatic_rings(mol: Mol) -> int: 50 | return rdMolDescriptors.CalcNumAromaticRings(mol) 51 | 52 | 53 | def num_atoms(mol: Mol) -> int: 54 | """ 55 | Returns the total number of atoms, H included 56 | """ 57 | mol = Chem.AddHs(mol) 58 | return mol.GetNumAtoms() 59 | 60 | 61 | class AtomCounter: 62 | 63 | def __init__(self, element: str) -> None: 64 | """ 65 | Args: 66 | element: element to count within a molecule 67 | """ 68 | self.element = element 69 | 70 | def __call__(self, mol: Mol) -> int: 71 | """ 72 | Count the number of atoms of a given type. 73 | 74 | Args: 75 | mol: molecule 76 | 77 | Returns: 78 | The number of atoms of the given type. 
79 | """ 80 | # if the molecule contains H atoms, they may be implicit, so add them 81 | if self.element == 'H': 82 | mol = Chem.AddHs(mol) 83 | 84 | return sum(1 for a in mol.GetAtoms() if a.GetSymbol() == self.element) 85 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | torch._C._jit_set_profiling_mode(False) 19 | torch._C._jit_set_profiling_executor(False) 20 | torch._C._jit_override_can_fuse_on_cpu(True) 21 | torch._C._jit_override_can_fuse_on_gpu(True) 22 | 23 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 24 | # 1/sqrt(2*pi)-> 0.3989423 25 | # 1/sqrt(2) -> 0.70710678 26 | # sqrt(2/pi) -> 0.79788456 27 | # this function is tanh approximation of gelu 28 | # actual gelu is: 29 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 30 | 31 | @torch.jit.script 32 | def bias_gelu(bias, y): 33 | x = bias + y 34 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 35 | 36 | # gradient of tanh approximation of gelu 37 | # gradient of actual gelu is: 38 | # 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 39 | @torch.jit.script 40 | def bias_gelu_back(g, bias, y): 41 | x = bias + y 42 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 43 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 44 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 45 | return ff*g 46 | 47 | class GeLUFunction(torch.autograd.Function): 48 | @staticmethod 49 | # bias is an optional argument 50 | def forward(ctx, input, bias): 51 | ctx.save_for_backward(input, bias) 52 | return bias_gelu(bias, input) 53 | 54 | @staticmethod 55 | def backward(ctx, grad_output): 56 | input, bias = ctx.saved_tensors 57 | tmp = bias_gelu_back(grad_output, bias, input) 58 | return tmp, tmp 59 | 60 | bias_gelu_impl = GeLUFunction.apply 61 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
import torch

# Turn off the JIT profiling executor and allow fusion so TorchScript can
# fuse the elementwise math below into a single kernel.
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)

# Constants appearing below:
#   1/sqrt(2*pi) -> 0.3989423
#   1/sqrt(2)    -> 0.70710678
#   sqrt(2/pi)   -> 0.79788456
# The exact GELU would be x * 0.5 * (1.0 + torch.erf(x * 0.70710678));
# these scripted functions use the cheaper tanh approximation instead.


@torch.jit.script
def bias_gelu(bias, y):
    """Fused bias-add followed by tanh-approximated GELU (no autograd)."""
    x = bias + y
    inner = 0.79788456 * x * (1 + 0.044715 * x * x)
    return x * 0.5 * (1.0 + torch.tanh(inner))


@torch.jit.script
def bias_gelu_back(g, bias, y):
    """Gradient of bias_gelu w.r.t. its summed input, times incoming grad g."""
    x = bias + y
    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
    # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
    sech_term = (1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)
    ff = 0.5 * x * sech_term + 0.5 * (1 + tanh_out)
    return ff * g


class GeLUFunction(torch.autograd.Function):
    """Autograd-visible entry point for the fused bias+GELU kernels."""

    @staticmethod
    def forward(ctx, input, bias):
        # Stash both operands; backward needs them to recompute the tanh.
        ctx.save_for_backward(input, bias)
        return bias_gelu(bias, input)

    @staticmethod
    def backward(ctx, grad_output):
        saved_input, saved_bias = ctx.saved_tensors
        grad = bias_gelu_back(grad_output, saved_bias, saved_input)
        # x = input + bias, so the gradients w.r.t. input and bias coincide.
        return grad, grad


bias_gelu_impl = GeLUFunction.apply
def get_machine_config():
    """Return a dict describing this machine's CPU and GPU(s).

    Returns:
        dict with a 'cpu' section (core counts, MHz frequencies, GB memory)
        and a 'gpu' section (device names and per-device memory in GB).
    """
    gib = 1024.0 ** 3

    # CPU topology, clocks, and RAM.
    n_physical = psutil.cpu_count(logical=False)
    n_logical = psutil.cpu_count(logical=True)
    freq = psutil.cpu_freq()
    vmem = psutil.virtual_memory()

    # GPU enumeration via NVML.
    nv.nvmlInit()
    driver_version = nv.nvmlSystemGetDriverVersion()  # NOTE(review): queried but never returned; kept for parity
    gpu_devices = []
    gpu_mems = []
    for idx in range(nv.nvmlDeviceGetCount()):
        handle = nv.nvmlDeviceGetHandleByIndex(idx)
        gpu_devices.append(nv.nvmlDeviceGetName(handle).decode("utf-8"))
        gpu_mems.append(nv.nvmlDeviceGetMemoryInfo(handle).total / gib)

    return {'cpu': {'physical_cores': n_physical, 'logical_cores': n_logical,
                    'min_freq_MHz': freq.min, 'max_freq_MHz': freq.max, 'cur_freq_MHz': freq.current,
                    'total_mem_GB': vmem.total / gib, 'avail_mem_GB': vmem.available / gib},
            'gpu': {'devices': gpu_devices, 'mem_GB': gpu_mems}}


def print_machine_config(config):
    """Format a config dict (from get_machine_config) as a one-line summary."""
    cpu = config['cpu']
    cpu_summary = (f"{int(round(cpu['max_freq_MHz'], 0))} MHz CPU with "
                   f"{cpu['physical_cores']} cores, "
                   f"{int(round(cpu['total_mem_GB'], 0))} GB RAM")

    # Group identical (device name, rounded GB) pairs so repeated GPUs
    # collapse into a single "N x <name>" entry.
    device_counts = Counter(
        (name, int(round(mem, 0)))
        for name, mem in zip(config['gpu']['devices'], config['gpu']['mem_GB'])
    )
    # NOTE(review): the memory value is part of the grouping key but is not
    # displayed, and multiple distinct GPU models concatenate with no
    # separator — preserved as-is to keep the original output format.
    gpu_summary = ''
    for (name, _mem), count in device_counts.items():
        gpu_summary += f'{count} x {name} GPU(s)'

    return ', '.join([cpu_summary, gpu_summary])
15 | 16 | """Model parallel utility interface.""" 17 | 18 | from .cross_entropy import vocab_parallel_cross_entropy 19 | 20 | from .data import broadcast_data 21 | 22 | from .grads import clip_grad_norm 23 | 24 | from .initialize import is_unitialized 25 | from .initialize import destroy_model_parallel 26 | from .initialize import get_data_parallel_group 27 | from .initialize import get_data_parallel_rank 28 | from .initialize import get_data_parallel_world_size 29 | from .initialize import get_model_parallel_group 30 | from .initialize import get_model_parallel_rank, set_model_parallel_rank 31 | from .initialize import get_model_parallel_src_rank 32 | from .initialize import get_model_parallel_world_size, set_model_parallel_world_size 33 | from .initialize import get_topology 34 | from .initialize import get_pipe_parallel_group 35 | from .initialize import get_pipe_parallel_rank 36 | from .initialize import get_pipe_parallel_world_size 37 | from .initialize import get_io_parallel_group 38 | from .initialize import initialize_model_parallel 39 | from .initialize import model_parallel_is_initialized 40 | 41 | from .layers import LayerNorm 42 | from .layers import ColumnParallelLinear 43 | from .layers import RowParallelLinear 44 | from .layers import VocabParallelEmbedding 45 | 46 | from .mappings import copy_to_model_parallel_region 47 | from .mappings import gather_from_model_parallel_region 48 | from .mappings import reduce_from_model_parallel_region 49 | from .mappings import scatter_to_model_parallel_region 50 | 51 | from .random import checkpoint 52 | from .random import get_cuda_rng_tracker 53 | from .random import init_checkpointed_activations_memory_buffer 54 | from .random import model_parallel_cuda_manual_seed 55 | from .random import reset_checkpointed_activations_memory_buffer 56 | 57 | from .utils import divide 58 | from .utils import split_tensor_along_last_dim 59 | -------------------------------------------------------------------------------- 
/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Model parallel utility interface.""" 17 | 18 | from .cross_entropy import vocab_parallel_cross_entropy 19 | 20 | from .data import broadcast_data 21 | 22 | from .grads import clip_grad_norm 23 | 24 | from .initialize import is_unitialized 25 | from .initialize import destroy_model_parallel 26 | from .initialize import get_data_parallel_group 27 | from .initialize import get_data_parallel_rank 28 | from .initialize import get_data_parallel_world_size 29 | from .initialize import get_model_parallel_group 30 | from .initialize import get_model_parallel_rank, set_model_parallel_rank 31 | from .initialize import get_model_parallel_src_rank 32 | from .initialize import get_model_parallel_world_size, set_model_parallel_world_size 33 | from .initialize import get_topology 34 | from .initialize import get_pipe_parallel_group 35 | from .initialize import get_pipe_parallel_rank 36 | from .initialize import get_pipe_parallel_world_size 37 | from .initialize import get_io_parallel_group 38 | from .initialize import initialize_model_parallel 39 | from .initialize import model_parallel_is_initialized 40 | 41 | from .layers 
import LayerNorm 42 | from .layers import ColumnParallelLinear 43 | from .layers import RowParallelLinear 44 | from .layers import VocabParallelEmbedding 45 | 46 | from .mappings import copy_to_model_parallel_region 47 | from .mappings import gather_from_model_parallel_region 48 | from .mappings import reduce_from_model_parallel_region 49 | from .mappings import scatter_to_model_parallel_region 50 | 51 | from .random import checkpoint 52 | from .random import get_cuda_rng_tracker 53 | from .random import init_checkpointed_activations_memory_buffer 54 | from .random import model_parallel_cuda_manual_seed 55 | from .random import reset_checkpointed_activations_memory_buffer 56 | 57 | from .utils import divide 58 | from .utils import split_tensor_along_last_dim 59 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | namespace multihead_attn { 22 | namespace fused_softmax { 23 | namespace scaled_upper_triang_masked_softmax { 24 | 25 | torch::Tensor fwd_cuda( 26 | torch::Tensor const& input, 27 | float scale_factor); 28 | 29 | torch::Tensor bwd_cuda( 30 | torch::Tensor const& output_grads, 31 | torch::Tensor const& softmax_results, 32 | float scale_factor); 33 | 34 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor) { 35 | AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); 36 | AT_ASSERTM(input.scalar_type() == at::ScalarType::Half, 37 | "Only HALF is supported"); 38 | 39 | return fwd_cuda(input, scale_factor); 40 | } 41 | 42 | torch::Tensor bwd( 43 | torch::Tensor const& output_grads, 44 | torch::Tensor const& softmax_results, 45 | float scale_factor) { 46 | 47 | AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); 48 | AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); 49 | 50 | AT_ASSERTM(output_grads.scalar_type() == at::ScalarType::Half, 51 | "Only HALF is supported"); 52 | AT_ASSERTM(softmax_results.scalar_type() == at::ScalarType::Half, 53 | "Only HALF is supported"); 54 | 55 | return bwd_cuda(output_grads, softmax_results, scale_factor); 56 | } 57 | 58 | } // end namespace scaled_upper_triang_masked_softmax 59 | } // end namespace fused_softmax 60 | } // end namespace multihead_attn 61 | 62 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 63 | m.def("forward", 64 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd, 65 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 66 | m.def("backward", 67 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd, 68 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 69 | } 70 | -------------------------------------------------------------------------------- 
/MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/fused_kernels/scaled_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | namespace multihead_attn { 22 | namespace fused_softmax { 23 | namespace scaled_masked_softmax { 24 | 25 | torch::Tensor fwd_cuda( 26 | torch::Tensor const& input, 27 | torch::Tensor const& mask, 28 | float scale_factor); 29 | 30 | torch::Tensor bwd_cuda( 31 | torch::Tensor const& output_grads, 32 | torch::Tensor const& softmax_results, 33 | float scale_factor); 34 | 35 | torch::Tensor fwd( 36 | torch::Tensor const& input, 37 | torch::Tensor const& mask, 38 | float scale_factor) { 39 | AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); 40 | AT_ASSERTM(input.scalar_type() == at::ScalarType::Half, 41 | "Only HALF is supported"); 42 | AT_ASSERTM(mask.dim() == 4, "expected 4D tensor"); 43 | 44 | return fwd_cuda(input, mask, scale_factor); 45 | } 46 | 47 | torch::Tensor bwd( 48 | torch::Tensor const& output_grads, 49 | torch::Tensor const& softmax_results, 50 | float scale_factor) { 51 | 52 | AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor"); 53 | AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor"); 54 | 55 | 
AT_ASSERTM(output_grads.scalar_type() == at::ScalarType::Half, 56 | "Only HALF is supported"); 57 | AT_ASSERTM(softmax_results.scalar_type() == at::ScalarType::Half, 58 | "Only HALF is supported"); 59 | 60 | return bwd_cuda(output_grads, softmax_results, scale_factor); 61 | } 62 | 63 | } // end namespace scaled_masked_softmax 64 | } // end namespace fused_softmax 65 | } // end namespace multihead_attn 66 | 67 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 68 | m.def("forward", 69 | &multihead_attn::fused_softmax::scaled_masked_softmax::fwd, 70 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 71 | m.def("backward", 72 | &multihead_attn::fused_softmax::scaled_masked_softmax::bwd, 73 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 74 | } 75 | -------------------------------------------------------------------------------- /inference/cheminformatics/common/cuchemcommon/data/cluster_wf.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/NVIDIA/cheminformatics/blob/master/common/cuchemcommon/data/cluster_wf.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_CHEMINFORMATICS). 7 | # --------------------------------------------------------------- 8 | 9 | import logging 10 | import math 11 | import os 12 | from typing import List 13 | 14 | import cudf 15 | import dask 16 | import dask_cudf 17 | from cuchemcommon.context import Context 18 | from cuchemcommon.data.helper.chembldata import BATCH_SIZE, ChEmblData 19 | from cuchemcommon.utils.singleton import Singleton 20 | 21 | from . 
logger = logging.getLogger(__name__)

FINGER_PRINT_FILES = 'filter_*.h5'


class ChemblClusterWfDao(ClusterWfDAO, metaclass=Singleton):
    """Cluster-workflow DAO backed by ChEMBL fingerprint data."""

    def __init__(self, fp_type):
        self.chem_data = ChEmblData(fp_type)

    def meta_df(self):
        """Return the meta (schema) dataframe for molecular embeddings."""
        # BUG FIX: previously constructed a throwaway ChEmblData() with no
        # fp_type argument, bypassing the configured fingerprint type (and
        # failing if the argument is required); reuse the __init__ instance.
        return self.chem_data._meta_df()

    def fetch_molecular_embedding(self,
                                  n_molecules: int,
                                  cache_directory: str = None):
        """Load up to n_molecules embeddings, preferring the HDF5 cache.

        Args:
            n_molecules: number of molecules to read; <= 0 means all.
            cache_directory: directory with filter_*.h5 files; when falsy,
                fall back to querying the database.
        """
        context = Context()
        if cache_directory:
            hdf_path = os.path.join(cache_directory, FINGER_PRINT_FILES)
            logger.info('Reading %d rows from %s...', n_molecules, hdf_path)
            mol_df = dask.dataframe.read_hdf(hdf_path, 'fingerprints')

            if n_molecules > 0:
                # Only pull as many partitions as the requested rows need.
                npartitions = math.ceil(n_molecules / BATCH_SIZE)
                mol_df = mol_df.head(n_molecules, compute=False, npartitions=npartitions)
        else:
            logger.info('Reading molecules from database...')
            mol_df = self.chem_data.fetch_mol_embedding(num_recs=n_molecules,
                                                        batch_size=context.batch_size)

        return mol_df

    def fetch_molecular_embedding_by_id(self, molecule_id: List):
        """Fetch embeddings for specific molregnos as a single-partition dask_cudf frame."""
        context = Context()
        meta = self.chem_data._meta_df()
        fp_df = self.chem_data._fetch_mol_embedding(molregnos=molecule_id,
                                                    batch_size=context.batch_size) \
            .astype(meta.dtypes)

        fp_df = cudf.from_pandas(fp_df)
        fp_df = dask_cudf.from_cudf(fp_df, npartitions=1).reset_index()
        return fp_df

    def fetch_id_from_chembl(self, new_molecules: List):
        """Map molregnos of new molecules to their ChEMBL IDs."""
        logger.debug('Fetch ChEMBL ID using molregno...')
        return self.chem_data.fetch_id_from_chembl(new_molecules)
megatron/arguments.py 6 | megatron/checkpointing.py 7 | megatron/global_vars.py 8 | megatron/indexer.py 9 | megatron/initialize.py 10 | megatron/learning_rates.py 11 | megatron/memory.py 12 | megatron/module.py 13 | megatron/package_info.py 14 | megatron/text_generation_utils.py 15 | megatron/training.py 16 | megatron/utils.py 17 | megatron/data/Makefile 18 | megatron/data/__init__.py 19 | megatron/data/bert_dataset.py 20 | megatron/data/dataset_utils.py 21 | megatron/data/gpt2_dataset.py 22 | megatron/data/helpers.cpp 23 | megatron/data/ict_dataset.py 24 | megatron/data/indexed_dataset.py 25 | megatron/data/realm_dataset_utils.py 26 | megatron/data/realm_index.py 27 | megatron/data/samplers.py 28 | megatron/deprecated_data_utils/__init__.py 29 | megatron/deprecated_data_utils/configure_data.py 30 | megatron/deprecated_data_utils/corpora.py 31 | megatron/deprecated_data_utils/datasets.py 32 | megatron/deprecated_data_utils/file_utils.py 33 | megatron/deprecated_data_utils/lazy_loader.py 34 | megatron/deprecated_data_utils/samplers.py 35 | megatron/deprecated_data_utils/tf_dl.py 36 | megatron/deprecated_data_utils/tokenization.py 37 | megatron/deprecated_data_utils/tokenization_gpt2.py 38 | megatron/deprecated_data_utils/wordpiece.py 39 | megatron/fp16/__init__.py 40 | megatron/fp16/fp16.py 41 | megatron/fp16/fp16util.py 42 | megatron/fp16/loss_scaler.py 43 | megatron/fused_kernels/__init__.py 44 | megatron/model/__init__.py 45 | megatron/model/bert_model.py 46 | megatron/model/classification.py 47 | megatron/model/distributed.py 48 | megatron/model/fused_bias_gelu.py 49 | megatron/model/fused_softmax.py 50 | megatron/model/gpt2_model.py 51 | megatron/model/language_model.py 52 | megatron/model/multiple_choice.py 53 | megatron/model/realm_model.py 54 | megatron/model/transformer.py 55 | megatron/model/utils.py 56 | megatron/mpu/__init__.py 57 | megatron/mpu/cross_entropy.py 58 | megatron/mpu/data.py 59 | megatron/mpu/grads.py 60 | megatron/mpu/initialize.py 61 | 
class ScoreContributionSpecification:
    """Specifies how to calculate the score of a goal-directed benchmark.

    The global score is a weighted average of several top-x scores; this
    class records which top-x values to consider and their weights.
    """

    def __init__(self, contributions: List[Tuple[int, float]]) -> None:
        """
        Args:
            contributions: List of tuples (top_count, weight) for the score contributions
        """
        self.contributions = contributions

    @property
    def top_counts(self) -> List[int]:
        return [count for count, _ in self.contributions]

    @property
    def weights(self) -> List[float]:
        return [weight for _, weight in self.contributions]


def uniform_specification(*top_counts: int) -> ScoreContributionSpecification:
    """Create a specification in which every top-x contribution has equal weight.

    Args:
        top_counts: each value x adds a top-x contribution with weight 1.0
    """
    return ScoreContributionSpecification(
        contributions=[(count, 1.0) for count in top_counts])


def compute_global_score(contribution_specification: ScoreContributionSpecification,
                         scores: List[float]) -> Tuple[float, Dict[str, float]]:
    """Compute the weighted global score and the individual top-x averages.

    Args:
        contribution_specification: which top-x terms to average, with weights
        scores: all scores; must contain at least max(top_counts) entries

    Returns:
        Tuple of (global score, dict mapping 'top_x' to the mean of the best x scores)
    """
    ranked = sorted(scores, reverse=True)

    weighted_sum = 0.0
    top_x_dict: Dict[str, float] = {}
    for top_count, weight in contribution_specification.contributions:
        mean_of_top = sum(ranked[:top_count]) / top_count
        top_x_dict[f'top_{top_count}'] = mean_of_top
        weighted_sum += mean_of_top * weight

    return weighted_sum / sum(contribution_specification.weights), top_x_dict
15 | 16 | """GLUE dataset.""" 17 | 18 | from abc import ABC 19 | from abc import abstractmethod 20 | 21 | from torch.utils.data import Dataset 22 | 23 | from megatron import print_rank_0 24 | from tasks.data_utils import build_sample 25 | from tasks.data_utils import build_tokens_types_paddings_from_text 26 | 27 | 28 | class GLUEAbstractDataset(ABC, Dataset): 29 | """GLUE base dataset class.""" 30 | 31 | def __init__(self, task_name, dataset_name, datapaths, 32 | tokenizer, max_seq_length): 33 | # Store inputs. 34 | self.task_name = task_name 35 | self.dataset_name = dataset_name 36 | self.tokenizer = tokenizer 37 | self.max_seq_length = max_seq_length 38 | print_rank_0(' > building {} dataset for {}:'.format(self.task_name, 39 | self.dataset_name)) 40 | # Process the files. 41 | string = ' > paths:' 42 | for path in datapaths: 43 | string += ' ' + path 44 | print_rank_0(string) 45 | self.samples = [] 46 | for datapath in datapaths: 47 | self.samples.extend(self.process_samples_from_single_path(datapath)) 48 | print_rank_0(' >> total number of samples: {}'.format( 49 | len(self.samples))) 50 | 51 | def __len__(self): 52 | return len(self.samples) 53 | 54 | def __getitem__(self, idx): 55 | raw_sample = self.samples[idx] 56 | ids, types, paddings = build_tokens_types_paddings_from_text( 57 | raw_sample['text_a'], raw_sample['text_b'], 58 | self.tokenizer, self.max_seq_length) 59 | sample = build_sample(ids, types, paddings, 60 | raw_sample['label'], raw_sample['uid']) 61 | return sample 62 | 63 | @abstractmethod 64 | def process_samples_from_single_path(self, datapath): 65 | """Abstract method that takes a single path / filename and 66 | returns a list of dataset samples, each sample being a dict of 67 | {'text_a': string, 'text_b': string, 'label': int, 'uid': int} 68 | """ 69 | pass 70 | -------------------------------------------------------------------------------- 
import json
import time
import sys


if __name__ == '__main__':

    # argv: <url groups file> <input jsonl> <output jsonl>
    url_filename = sys.argv[1]
    data_filename = sys.argv[2]
    output_filename = sys.argv[3]

    # Every URL after the first entry of each group is a duplicate to drop.
    urls = set()
    with open(url_filename, 'r') as f:
        for line in f:
            for this_urls in json.loads(line).values():
                urls.update(this_urls[1:])
    print('will be removing {} urls'.format(len(urls)), flush=True)

    written_docs = 0
    removed_docs = 0
    removed_chars = 0
    start_time = time.time()
    with open(output_filename, 'wb') as fout, open(data_filename, 'r') as fin:
        for line in fin:
            try:
                myjson = json.loads(line)
                if myjson['url'] in urls:
                    # Duplicate group member: skip it, tracking stats.
                    print('removing', myjson)
                    removed_docs += 1
                    removed_chars += len(myjson['text'])
                    continue
                fout.write(json.dumps(myjson, ensure_ascii=False).encode('utf-8'))
                fout.write('\n'.encode('utf-8'))
                written_docs += 1
                if written_docs % 10000 == 0:
                    print(' [PROCESSED] time (s): {:.2f} | written: {} '
                          '| removed: {} (char: {})'.format(
                              time.time() - start_time,
                              written_docs, removed_docs, removed_chars))
            except Exception as e:
                # Malformed line or missing key: report and keep going.
                print('[SKIPPING]', line, e)

    print(' [PROCESSED] time (s): {:.2f} | written: {} '
          '| removed: {} (char: {})'.format(
              time.time() - start_time,
              written_docs, removed_docs, removed_chars))
    print('done :-)')
15 | 16 | """Detokenization.""" 17 | 18 | import re 19 | 20 | 21 | def ptb_detokenizer(string): 22 | string = string.replace(" '", "'") 23 | string = string.replace(" \n", "\n") 24 | string = string.replace("\n ", "\n") 25 | string = string.replace(" n't", "n't") 26 | string = string.replace(" N ", "1 ") 27 | string = string.replace("$ 1", "$1") 28 | string = string.replace("# 1", "#1") 29 | return string 30 | 31 | 32 | def wikitext_detokenizer(string): 33 | # contractions 34 | string = string.replace("s '", "s'") 35 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 36 | # number separators 37 | string = string.replace(" @-@ ", "-") 38 | string = string.replace(" @,@ ", ",") 39 | string = string.replace(" @.@ ", ".") 40 | # punctuation 41 | string = string.replace(" : ", ": ") 42 | string = string.replace(" ; ", "; ") 43 | string = string.replace(" . ", ". ") 44 | string = string.replace(" ! ", "! ") 45 | string = string.replace(" ? ", "? ") 46 | string = string.replace(" , ", ", ") 47 | # double brackets 48 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 49 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 50 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 51 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 52 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 53 | # miscellaneous 54 | string = string.replace("= = = =", "====") 55 | string = string.replace("= = =", "===") 56 | string = string.replace("= =", "==") 57 | string = string.replace(" " + chr(176) + " ", chr(176)) 58 | string = string.replace(" \n", "\n") 59 | string = string.replace("\n ", "\n") 60 | string = string.replace(" N ", " 1 ") 61 | string = string.replace(" 's", "'s") 62 | 63 | return string 64 | 65 | 66 | def lambada_detokenizer(string): 67 | return string 68 | 69 | 70 | _DETOKENIZERS = { 71 | 'ptb': ptb_detokenizer, 72 | 'wiki': wikitext_detokenizer, 73 | 'lambada': lambada_detokenizer, 74 | } 75 | 76 | 77 | def get_detokenizer(path): 78 | for 
import argparse
import os
import random

import numpy
import torch

import mpu


class IdentityLayer(torch.nn.Module):
    """Trivial module whose forward() simply returns its weight parameter."""

    def __init__(self, size, scale=1.0):
        super(IdentityLayer, self).__init__()
        self.weight = torch.nn.Parameter(scale * torch.randn(size))

    def forward(self):
        # No input: the "output" is the parameter itself.
        return self.weight


def set_random_seed(seed):
    """Seed python, numpy, torch and the model-parallel cuda RNG
    for reproducibility."""
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    mpu.model_parallel_cuda_manual_seed(seed)


def initialize_distributed(backend='nccl'):
    """Initialize torch.distributed from the usual launcher environment."""
    # --local_rank is supplied by torch.distributed.launch; may be absent.
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', type=int, default=None,
                        help='local rank passed from distributed launcher')
    local_rank = parser.parse_args().local_rank

    # Rank and world size come from the environment.
    rank = int(os.getenv('RANK', '0'))
    world_size = int(os.getenv("WORLD_SIZE", '1'))

    print('> initializing torch.distributed with local rank: {}, '
          'rank: {}, world size: {}'.format(local_rank, rank, world_size))

    # Default: round-robin over visible GPUs; prefer the launcher-provided
    # local rank when present.
    device = rank % torch.cuda.device_count()
    if local_rank is not None:
        device = local_rank
    torch.cuda.set_device(device)

    # TCP rendezvous via MASTER_ADDR / MASTER_PORT.
    init_method = 'tcp://{}:{}'.format(os.getenv('MASTER_ADDR', 'localhost'),
                                       os.getenv('MASTER_PORT', '6000'))
    torch.distributed.init_process_group(
        backend=backend,
        world_size=world_size,
        rank=rank,
        init_method=init_method)


def print_separator(message):
    """Rank 0 prints *message* centered inside a dashed rule; all ranks sync."""
    torch.distributed.barrier()
    pad = '-' * ((78 - len(message)) // 2)
    banner = '\n' + pad + ' {} '.format(message) + pad
    if torch.distributed.get_rank() == 0:
        print(banner, flush=True)
    torch.distributed.barrier()
"""Main tasks functionality."""

import os
import sys
# Make the parent directory importable so `megatron` resolves when this
# script is run directly from the tasks/ directory.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
                                             os.path.pardir)))

from megatron import get_args
from megatron.initialize import initialize_megatron


def get_tasks_args(parser):
    """Provide extra arguments required for tasks.

    Adds the task-specific command-line options to *parser* and returns it,
    as expected by initialize_megatron(extra_args_provider=...).
    """
    group = parser.add_argument_group(title='tasks')

    group.add_argument('--task', type=str, required=True,
                       help='Task name.')
    # Fixed typo "finetunning" -> "finetuning" in the two help strings below.
    group.add_argument('--epochs', type=int, default=None,
                       help='Number of finetuning epochs. Zero results in '
                       'evaluation only.')
    group.add_argument('--pretrained-checkpoint', type=str, default=None,
                       help='Pretrained checkpoint used for finetuning.')
    # Fixed missing space ("...incomplete) inthe data loader").
    group.add_argument('--keep-last', action='store_true',
                       help='Keep the last batch (maybe incomplete) in '
                       'the data loader')
    group.add_argument('--train-data', nargs='+', default=None,
                       help='Whitespace separated paths or corpora names '
                       'for training.')
    group.add_argument('--valid-data', nargs='*', default=None,
                       help='path(s) to the validation data.')
    group.add_argument('--overlapping-eval', type=int, default=32,
                       help='Sliding window for overlapping evaluation.')
    group.add_argument('--strict-lambada', action='store_true',
                       help='Use more difficult formulation of lambada.')

    return parser


if __name__ == '__main__':

    initialize_megatron(extra_args_provider=get_tasks_args)

    args = get_args()
    # Import lazily so only the selected task's dependencies are loaded.
    if args.task == 'RACE':
        from race.finetune import main
    elif args.task in ['MNLI', 'QQP']:
        from glue.finetune import main
    elif args.task in ['LAMBADA', 'WIKITEXT103']:
        from zeroshot_gpt2.evaluate import main
    else:
        raise NotImplementedError('Task {} is not implemented.'.format(
            args.task))

    main()
import argparse
import os
import random

import numpy
import torch

import mpu


class IdentityLayer(torch.nn.Module):
    """Module exposing a single learnable weight; forward() returns it."""

    def __init__(self, size, scale=1.0):
        super(IdentityLayer, self).__init__()
        self.weight = torch.nn.Parameter(scale * torch.randn(size))

    def forward(self):
        return self.weight


def set_random_seed(seed):
    """Seed every RNG (python, numpy, torch, model-parallel cuda)."""
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    mpu.model_parallel_cuda_manual_seed(seed)


def initialize_distributed(backend='nccl'):
    """Initialize torch.distributed using launcher-provided environment."""
    # Pick up --local_rank if the distributed launcher passed it.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--local_rank', type=int, default=None,
                            help='local rank passed from distributed launcher')
    local_rank = arg_parser.parse_args().local_rank

    rank = int(os.getenv('RANK', '0'))
    world_size = int(os.getenv("WORLD_SIZE", '1'))

    print(f'> initializing torch.distributed with local rank: {local_rank}, '
          f'rank: {rank}, world size: {world_size}')

    # Device id: round-robin unless the launcher told us which GPU to use.
    device = rank % torch.cuda.device_count()
    if local_rank is not None:
        device = local_rank
    torch.cuda.set_device(device)

    # TCP rendezvous address from MASTER_ADDR / MASTER_PORT.
    host = os.getenv('MASTER_ADDR', 'localhost')
    port = os.getenv('MASTER_PORT', '6000')
    torch.distributed.init_process_group(
        backend=backend,
        world_size=world_size,
        rank=rank,
        init_method=f'tcp://{host}:{port}')


def print_separator(message):
    """Print *message* framed by dashes on rank 0; barrier on all ranks."""
    torch.distributed.barrier()
    side = '-' * ((78 - len(message)) // 2)
    if torch.distributed.get_rank() == 0:
        print('\n' + side + f' {message} ' + side, flush=True)
    torch.distributed.barrier()
import torch


def ensure_divisibility(numerator, denominator):
    """Assert that *numerator* is an exact multiple of *denominator*."""
    assert numerator % denominator == 0, '{} is not divisible by {}'.format(
        numerator, denominator)


def divide(numerator, denominator):
    """Integer-divide *numerator* by *denominator*, asserting divisibility."""
    ensure_divisibility(numerator, denominator)
    return numerator // denominator


def split_tensor_along_last_dim(tensor, num_partitions,
                                contiguous_split_chunks=False):
    """Split *tensor* into *num_partitions* equal chunks along its last dim.

    Arguments:
        tensor: input tensor.
        num_partitions: number of partitions to split the tensor into.
        contiguous_split_chunks: if True, make each chunk contiguous
            in memory.
    """
    dim = tensor.dim() - 1
    chunk_size = divide(tensor.size()[dim], num_partitions)
    chunks = torch.split(tensor, chunk_size, dim=dim)
    # torch.split returns views; copy them only when the caller asks.
    if contiguous_split_chunks:
        return tuple(chunk.contiguous() for chunk in chunks)

    return chunks


class VocabUtility:
    """Split the vocabulary into `world_size` chunks and return the first
    and last index of the chunk owned by `rank`. Ranges are half-open:
    [first, last)."""

    @staticmethod
    def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
                                                  rank, world_size):
        start = rank * per_partition_vocab_size
        return start, start + per_partition_vocab_size

    @staticmethod
    def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
        per_partition_vocab_size = divide(global_vocab_size, world_size)
        return VocabUtility.vocab_range_from_per_partition_vocab_size(
            per_partition_vocab_size, rank, world_size)
import torch


def ensure_divisibility(numerator, denominator):
    """Raise AssertionError unless *denominator* divides *numerator* exactly."""
    assert numerator % denominator == 0, '{} is not divisible by {}'.format(
        numerator, denominator)


def divide(numerator, denominator):
    """Return numerator // denominator after checking exact divisibility."""
    ensure_divisibility(numerator, denominator)
    return numerator // denominator


def split_tensor_along_last_dim(tensor, num_partitions,
                                contiguous_split_chunks=False):
    """Split *tensor* along its final dimension into equal partitions.

    Arguments:
        tensor: input tensor.
        num_partitions: number of partitions to split the tensor into.
        contiguous_split_chunks: if True, each returned chunk is made
            contiguous in memory.
    """
    last_dim = tensor.dim() - 1
    partition_size = divide(tensor.size()[last_dim], num_partitions)
    pieces = torch.split(tensor, partition_size, dim=last_dim)
    if contiguous_split_chunks:
        # torch.split yields views; copy them into contiguous storage.
        pieces = tuple(piece.contiguous() for piece in pieces)
    return pieces


class VocabUtility:
    """Partition the vocabulary across `world_size` ranks.

    Each rank owns the half-open index range [first, last) returned by
    the helpers below.
    """

    @staticmethod
    def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
                                                  rank, world_size):
        first = rank * per_partition_vocab_size
        last = first + per_partition_vocab_size
        return first, last

    @staticmethod
    def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
        per_partition_vocab_size = divide(global_vocab_size, world_size)
        return VocabUtility.vocab_range_from_per_partition_vocab_size(
            per_partition_vocab_size, rank, world_size)
"""Utilities for models."""

import math

import torch

from .transformer import LayerNorm


def init_method_normal(sigma):
    """Return an initializer that draws from N(0, sigma)."""
    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)

    return init_


def scaled_init_method_normal(sigma, num_layers):
    """Return an initializer drawing from N(0, sigma / sqrt(2 * num_layers))."""
    std = sigma / math.sqrt(2.0 * num_layers)

    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=std)

    return init_


def get_linear_layer(rows, columns, init_method):
    """Linear layer with custom weight initialization and zeroed bias."""
    layer = torch.nn.Linear(rows, columns)
    init_method(layer.weight)
    with torch.no_grad():
        layer.bias.zero_()
    return layer


@torch.jit.script
def gelu_impl(x):
    """OpenAI's gelu implementation (tanh approximation)."""
    return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
                                       (1.0 + 0.044715 * x * x)))


def openai_gelu(x):
    return gelu_impl(x)


@torch.jit.script
def erf_gelu(x):
    # Python equivalent of torch.nn.functional.gelu(), typed so the
    # ONNX exporter can handle it.
    return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)
                      + torch.ones_like(x).to(dtype=x.dtype))


def get_params_for_weight_decay_optimization(module):
    """Divide params into with-weight-decay and without-weight-decay groups.

    LayerNorm parameters and all biases get no weight decay; every other
    parameter does.
    """
    decay = {'params': []}
    no_decay = {'params': [], 'weight_decay': 0.0}
    for submodule in module.modules():
        params = submodule._parameters
        if isinstance(submodule, LayerNorm):
            no_decay['params'].extend(
                p for p in params.values() if p is not None)
        else:
            for name, p in params.items():
                if p is None:
                    continue
                target = no_decay if name == 'bias' else decay
                target['params'].append(p)

    return decay, no_decay
"""Utilities for models."""

import math

import torch

from .transformer import LayerNorm


def init_method_normal(sigma):
    """Build an in-place initializer sampling from N(0, sigma)."""
    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)

    return init_


def scaled_init_method_normal(sigma, num_layers):
    """Build an initializer sampling from N(0, sigma / sqrt(2 * num_layers))."""
    std = sigma / math.sqrt(2.0 * num_layers)

    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=std)

    return init_


def get_linear_layer(rows, columns, init_method):
    """Create a torch.nn.Linear, apply *init_method* to its weight and
    zero its bias."""
    layer = torch.nn.Linear(rows, columns)
    init_method(layer.weight)
    with torch.no_grad():
        layer.bias.zero_()
    return layer


@torch.jit.script
def gelu_impl(x):
    """OpenAI's gelu implementation."""
    return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
                                       (1.0 + 0.044715 * x * x)))


def openai_gelu(x):
    return gelu_impl(x)


@torch.jit.script
def erf_gelu(x):
    # Exact (erf-based) gelu, equivalent to torch.nn.functional.gelu();
    # written out so the ONNX exporter can trace it.
    return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)
                      + torch.ones_like(x).to(dtype=x.dtype))


def get_params_for_weight_decay_optimization(module):
    """Split parameters into weight-decay and no-weight-decay groups.

    LayerNorm parameters and all biases are exempt from weight decay.
    """
    with_decay = {'params': []}
    without_decay = {'params': [], 'weight_decay': 0.0}
    for sub in module.modules():
        if isinstance(sub, LayerNorm):
            without_decay['params'].extend(
                [p for p in sub._parameters.values() if p is not None])
            continue
        for name, param in sub._parameters.items():
            if param is None:
                continue
            if name == 'bias':
                without_decay['params'].append(param)
            else:
                with_decay['params'].append(param)

    return with_decay, without_decay
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "THC/THC.h" 23 | #include 24 | #include 25 | #include "scaled_upper_triang_masked_softmax.h" 26 | 27 | namespace multihead_attn { 28 | namespace fused_softmax { 29 | namespace scaled_upper_triang_masked_softmax { 30 | 31 | torch::Tensor fwd_cuda( 32 | torch::Tensor const& input, 33 | float scale_factor) 34 | { 35 | // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] 36 | const int attn_batches = input.size(0); 37 | const int seq_len = input.size(1); 38 | TORCH_INTERNAL_ASSERT(seq_len <= 2048); 39 | 40 | // Output 41 | auto act_options = input.options().requires_grad(false); 42 | torch::Tensor softmax_results = 43 | torch::empty({attn_batches, seq_len, seq_len}, act_options); 44 | 45 | // Softmax Intermediate Result Ptr 46 | void* input_ptr = static_cast(input.data_ptr()); 47 | void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); 48 | 49 | dispatch_scaled_upper_triang_masked_softmax_forward( 50 | reinterpret_cast(softmax_results_ptr), 51 | reinterpret_cast(input_ptr), 52 | scale_factor, 53 | seq_len, 54 | seq_len, 55 | attn_batches); 56 | return softmax_results; 57 | } 58 | 59 | torch::Tensor bwd_cuda( 60 | torch::Tensor const& output_grads_, 61 | torch::Tensor const& softmax_results_, 62 | float scale_factor) { 63 | 64 | auto output_grads = output_grads_.contiguous(); 65 | auto softmax_results = softmax_results_.contiguous(); 66 | 67 | //output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] 68 | const int attn_batches = output_grads.size(0); 69 | const int seq_len = output_grads.size(1); 70 | TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2)); 71 | 72 | void* output_grads_ptr = static_cast(output_grads.data_ptr()); 73 | 74 | //Softmax Grad 75 | dispatch_scaled_upper_triang_masked_softmax_backward( 76 | reinterpret_cast(output_grads_ptr), 77 | reinterpret_cast(output_grads_ptr), 78 | 
reinterpret_cast(softmax_results.data_ptr()), 79 | scale_factor, 80 | seq_len, 81 | seq_len, 82 | attn_batches); 83 | 84 | //backward pass is completely in-place 85 | return output_grads; 86 | } 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/mpu/tests/test_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from commons import print_separator 17 | from commons import initialize_distributed 18 | from mpu import data as data_utils 19 | import mpu 20 | import torch 21 | import functools 22 | import operator 23 | import sys 24 | sys.path.append("../..") 25 | 26 | 27 | def test_boradcast_data(model_parallel_size): 28 | 29 | if torch.distributed.get_rank() == 0: 30 | print('> testing boradcast_data with model parallel size {} ...'. 
def test_boradcast_data(model_parallel_size):
    """Exercise mpu.data broadcast helpers at the given model-parallel size."""

    if torch.distributed.get_rank() == 0:
        print('> testing boradcast_data with model parallel size {} ...'.
              format(model_parallel_size))

    mpu.initialize_model_parallel(model_parallel_size)
    torch.manual_seed(1234 + mpu.get_data_parallel_rank())
    model_parallel_size = mpu.get_model_parallel_world_size()

    # Expected int64 tensor shapes, keyed by name.
    expected_shapes = {'key1': [7, 11],
                       'key2': [8, 2, 1],
                       'key3': [13],
                       'key4': [5, 1, 2],
                       'key5': [5, 12]}
    keys = list(expected_shapes.keys())

    data = {}
    data_t = {}
    for key, shape in expected_shapes.items():
        data[key] = torch.LongTensor(size=shape).random_(0, 1000)
        data_t[key] = data[key].clone()
    # Extra float entry deliberately excluded from the broadcast key list.
    data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
    data_t['keyX'] = data['keyX'].clone()
    # Only model-parallel rank 0 holds the data to be broadcast.
    if mpu.get_model_parallel_rank() != 0:
        data = None

    data_utils._check_data_types(keys, data_t, torch.int64)
    key_size, key_numel, total_numel = \
        data_utils._build_key_size_numel_dictionaries(keys, data)

    expected_total = 0
    for key in keys:
        assert key_size[key] == expected_shapes[key]
        numel = functools.reduce(operator.mul, expected_shapes[key], 1)
        assert key_numel[key] == numel
        expected_total += numel
    assert total_numel == expected_total

    data_b = data_utils.broadcast_data(keys, data, torch.int64)
    for key in keys:
        reference = data_t[key].cuda()
        assert data_b[key].sub(reference).abs().max() == 0

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


if __name__ == '__main__':

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    # Double the model-parallel size each round, up to the world size.
    model_parallel_size = 1
    while model_parallel_size <= world_size:
        print_separator('test test boradcast data')
        test_boradcast_data(model_parallel_size)
        model_parallel_size *= 2
import sys
# Extend the path BEFORE the project imports below: the original appended
# "../.." after `from mpu import ...`, which defeats the purpose of the hack.
sys.path.append("../..")

import functools
import operator

import torch

from commons import print_separator
from commons import initialize_distributed
from mpu import data as data_utils
import mpu


def test_boradcast_data(model_parallel_size):
    """Check mpu.data size/numel bookkeeping and tensor broadcast for the
    given model-parallel size."""

    if torch.distributed.get_rank() == 0:
        print('> testing boradcast_data with model parallel size {} ...'.
              format(model_parallel_size))

    mpu.initialize_model_parallel(model_parallel_size)
    torch.manual_seed(1234 + mpu.get_data_parallel_rank())
    model_parallel_size = mpu.get_model_parallel_world_size()

    # Expected int64 tensor shapes, keyed by name.
    key_size_t = {'key1': [7, 11],
                  'key2': [8, 2, 1],
                  'key3': [13],
                  'key4': [5, 1, 2],
                  'key5': [5, 12]}
    keys = list(key_size_t.keys())

    data = {}
    data_t = {}
    for key in key_size_t:
        data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
        data_t[key] = data[key].clone()
    # Extra float entry deliberately excluded from the broadcast key list.
    data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
    data_t['keyX'] = data['keyX'].clone()
    # Only model-parallel rank 0 holds the data to be broadcast.
    if mpu.get_model_parallel_rank() != 0:
        data = None

    data_utils._check_data_types(keys, data_t, torch.int64)
    key_size, key_numel, \
        total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
    for key in keys:
        assert key_size[key] == key_size_t[key]
    total_numel_t = 0
    for key in keys:
        target_size = functools.reduce(operator.mul, key_size_t[key], 1)
        assert key_numel[key] == target_size
        total_numel_t += target_size
    assert total_numel == total_numel_t

    data_b = data_utils.broadcast_data(keys, data, torch.int64)
    for key in keys:
        tensor = data_t[key].cuda()
        assert data_b[key].sub(tensor).abs().max() == 0

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


if __name__ == '__main__':

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    # Double the model-parallel size each round, up to the world size.
    model_parallel_size = 1
    while model_parallel_size <= world_size:
        print_separator('test test boradcast data')
        test_boradcast_data(model_parallel_size)
        model_parallel_size *= 2
"""GLUE finetuning/evaluation."""

from megatron import get_args
from megatron import print_rank_0
from megatron import get_tokenizer
from megatron.model.classification import Classification
from tasks.eval_utils import accuracy_func_provider
from tasks.finetune_utils import finetune


def glue_classification(num_classes, Dataset,
                        name_from_datapath_func):
    """Finetune/evaluate a GLUE classification task for a concrete Dataset."""

    def train_valid_datasets_provider():
        """Build train and validation datasets."""
        args = get_args()
        tokenizer = get_tokenizer()

        return (Dataset('training', args.train_data,
                        tokenizer, args.seq_length),
                Dataset('validation', args.valid_data,
                        tokenizer, args.seq_length))

    def model_provider():
        """Build the classification model."""
        args = get_args()

        print_rank_0('building classification model for {} ...'.format(
            args.task))

        return Classification(num_classes=num_classes, num_tokentypes=2)

    def metrics_func_provider():
        """Provide the accuracy metrics callback."""
        def single_dataset_provider(datapath):
            args = get_args()
            tokenizer = get_tokenizer()

            name = name_from_datapath_func(datapath)
            return Dataset(name, [datapath], tokenizer, args.seq_length)
        return accuracy_func_provider(single_dataset_provider)

    # Finetune/evaluate.
    finetune(train_valid_datasets_provider, model_provider,
             end_of_epoch_callback_provider=metrics_func_provider)


def main():
    args = get_args()

    # Each task binds its class count, Dataset class, and a helper that
    # derives a display name from a data file path.
    if args.task == 'MNLI':

        num_classes = 3
        from tasks.glue.mnli import MNLIDataset as Dataset

        def name_from_datapath(datapath):
            return datapath.split('MNLI')[-1].strip(
                '.tsv').strip('/').replace('_', '-')

    elif args.task == 'QQP':

        num_classes = 2
        from tasks.glue.qqp import QQPDataset as Dataset

        def name_from_datapath(datapath):
            return datapath.split('QQP')[-1].strip(
                '.tsv').strip('/').replace('_', '-')

    else:
        raise NotImplementedError('GLUE task {} is not implemented.'.format(
            args.task))

    glue_classification(num_classes, Dataset, name_from_datapath)
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Setup for pip package for Megatron.""" 17 | 18 | import os 19 | import sys 20 | import setuptools 21 | 22 | if sys.version_info < (3,): 23 | raise Exception("Python 2 is not supported by Megatron.") 24 | 25 | from megatron.package_info import ( 26 | __description__, 27 | __contact_names__, 28 | __url__, 29 | __download_url__, 30 | __keywords__, 31 | __license__, 32 | __package_name__, 33 | __version__, 34 | ) 35 | 36 | with open("README.md", "r") as fh: 37 | long_description = fh.read() 38 | 39 | ############################################################################### 40 | # Dependency Loading # 41 | # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # 42 | 43 | 44 | def req_file(filename): 45 | with open(filename) as f: 46 | content = f.readlines() 47 | return [x.strip() for x in content] 48 | 49 | 50 | install_requires = req_file("requirements.txt") 51 | 52 | setuptools.setup( 53 | name=__package_name__, 54 | # Versions should comply with PEP440. For a discussion on single-sourcing 55 | # the version across setup.py and the project code, see 56 | # https://packaging.python.org/en/latest/single_source_version.html 57 | version=__version__, 58 | description=__description__, 59 | long_description=long_description, 60 | long_description_content_type="text/markdown", 61 | # The project's main homepage. 
62 | url=__url__, 63 | author=__contact_names__, 64 | maintainer=__contact_names__, 65 | # The licence under which the project is released 66 | license=__license__, 67 | classifiers=[ 68 | 'Intended Audience :: Developers', 69 | 'Intended Audience :: Science/Research', 70 | 'Intended Audience :: Information Technology', 71 | # Indicate what your project relates to 72 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 73 | 'Topic :: Software Development :: Libraries :: Python Modules', 74 | # Supported python versions 75 | 'Programming Language :: Python :: 3.6', 76 | 'Programming Language :: Python :: 3.7', 77 | 'Programming Language :: Python :: 3.8', 78 | # Additional Setting 79 | 'Environment :: Console', 80 | 'Natural Language :: English', 81 | 'Operating System :: OS Independent', 82 | ], 83 | python_requires='>=3.6', 84 | packages=setuptools.find_packages(), 85 | install_requires=install_requires, 86 | # Add in any packaged data. 87 | include_package_data=True, 88 | zip_safe=False, 89 | # PyPI package information. 90 | keywords=__keywords__ 91 | ) 92 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/tasks/glue/mnli.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """MNLI dataset.""" 17 | 18 | from megatron import print_rank_0 19 | from tasks.data_utils import clean_text 20 | from .data import GLUEAbstractDataset 21 | 22 | 23 | LABELS = {'contradiction': 0, 'entailment': 1, 'neutral': 2} 24 | 25 | 26 | class MNLIDataset(GLUEAbstractDataset): 27 | 28 | def __init__(self, name, datapaths, tokenizer, max_seq_length, 29 | test_label='contradiction'): 30 | self.test_label = test_label 31 | super().__init__('MNLI', name, datapaths, 32 | tokenizer, max_seq_length) 33 | 34 | def process_samples_from_single_path(self, filename): 35 | """"Implement abstract method.""" 36 | print_rank_0(' > Processing {} ...'.format(filename)) 37 | 38 | samples = [] 39 | total = 0 40 | first = True 41 | is_test = False 42 | with open(filename, 'r') as f: 43 | for line in f: 44 | row = line.strip().split('\t') 45 | if first: 46 | first = False 47 | if len(row) == 10: 48 | is_test = True 49 | print_rank_0( 50 | ' reading {}, {} and {} columns and setting ' 51 | 'labels to {}'.format( 52 | row[0].strip(), row[8].strip(), 53 | row[9].strip(), self.test_label)) 54 | else: 55 | print_rank_0(' reading {} , {}, {}, and {} columns ' 56 | '...'.format( 57 | row[0].strip(), row[8].strip(), 58 | row[9].strip(), row[-1].strip())) 59 | continue 60 | 61 | text_a = clean_text(row[8].strip()) 62 | text_b = clean_text(row[9].strip()) 63 | unique_id = int(row[0].strip()) 64 | label = row[-1].strip() 65 | if is_test: 66 | label = self.test_label 67 | 68 | assert len(text_a) > 0 69 | assert len(text_b) > 0 70 | assert label in LABELS 71 | assert unique_id >= 0 72 | 73 | sample = {'text_a': text_a, 74 | 'text_b': text_b, 75 | 'label': LABELS[label], 76 | 'uid': unique_id} 77 | total += 1 78 | samples.append(sample) 79 | 80 | if total % 50000 == 0: 81 | print_rank_0(' > processed {} so far ...'.format(total)) 82 | 83 | print_rank_0(' >> processed {} 
samples.'.format(len(samples))) 84 | return samples 85 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/utils/sampling_helpers.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/BenevolentAI/guacamol/blob/master/guacamol/utils/sampling_helper.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_GUACAMOL). 7 | # --------------------------------------------------------------- 8 | 9 | from typing import List, Set 10 | 11 | from guacamol.guacamol.distribution_matching_generator import DistributionMatchingGenerator 12 | from guacamol.guacamol.utils.chemistry import is_valid, canonicalize 13 | 14 | 15 | def sample_valid_molecules(model: DistributionMatchingGenerator, number_molecules: int, max_tries=10) -> List[str]: 16 | """ 17 | Sample from the given generator until the desired number of valid molecules 18 | has been sampled (i.e., ignore invalid molecules). 19 | 20 | Args: 21 | model: model to sample from 22 | number_molecules: number of valid molecules to generate 23 | max_tries: determines the maximum number N of samples to draw, N = number_molecules * max_tries 24 | 25 | Returns: 26 | A list of number_molecules valid molecules. If this was not possible with the given max_tries, the list may be shorter. 
27 | """ 28 | 29 | max_samples = max_tries * number_molecules 30 | number_already_sampled = 0 31 | 32 | valid_molecules: List[str] = [] 33 | 34 | while len(valid_molecules) < number_molecules and number_already_sampled < max_samples: 35 | remaining_to_sample = number_molecules - len(valid_molecules) 36 | 37 | samples = model.generate(remaining_to_sample) 38 | number_already_sampled += remaining_to_sample 39 | 40 | valid_molecules += [m for m in samples if is_valid(m)] 41 | 42 | return valid_molecules 43 | 44 | 45 | def sample_unique_molecules(model: DistributionMatchingGenerator, number_molecules: int, max_tries=10) -> List[str]: 46 | """ 47 | Sample from the given generator until the desired number of unique (distinct) molecules 48 | has been sampled (i.e., ignore duplicate molecules). 49 | 50 | Args: 51 | model: model to sample from 52 | number_molecules: number of unique (distinct) molecules to generate 53 | max_tries: determines the maximum number N of samples to draw, N = number_molecules * max_tries 54 | 55 | Returns: 56 | A list of number_molecules unique molecules, in canonalized form. 57 | If this was not possible with the given max_tries, the list may be shorter. 58 | The generation order is kept. 
59 | """ 60 | 61 | max_samples = max_tries * number_molecules 62 | number_already_sampled = 0 63 | 64 | unique_list: List[str] = [] 65 | unique_set: Set[str] = set() 66 | 67 | while len(unique_list) < number_molecules and number_already_sampled < max_samples: 68 | remaining_to_sample = number_molecules - len(unique_list) 69 | 70 | samples = model.generate(remaining_to_sample) 71 | number_already_sampled += remaining_to_sample 72 | 73 | for smiles in samples: 74 | canonical_smiles = canonicalize(smiles) 75 | if canonical_smiles is not None and canonical_smiles not in unique_set: 76 | unique_set.add(canonical_smiles) 77 | unique_list.append(canonical_smiles) 78 | 79 | # this should always be True 80 | assert len(unique_set) == len(unique_list) 81 | 82 | return unique_list 83 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/tools/openwebtext/group_duplicates_url.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import json 17 | import time 18 | import sys 19 | 20 | 21 | def is_similar(jaccard_similarity): 22 | return (js >= 0.9) 23 | 24 | 25 | if __name__ == '__main__': 26 | 27 | 28 | print('grouping duplicate urls ...') 29 | 30 | input = sys.argv[1] 31 | output = sys.argv[2] 32 | 33 | url_to_index = {} 34 | index_to_urls = [] 35 | counter = 0 36 | start_time = time.time() 37 | with open(input, 'r') as f: 38 | for line in f: 39 | counter += 1 40 | myjson = json.loads(line) 41 | urls = [] 42 | for main_url in myjson.keys(): 43 | urls.append(main_url) 44 | for value in myjson[main_url]: 45 | for other_url, js in value.items(): 46 | if is_similar(js): 47 | urls.append(other_url) 48 | current_index = -1 49 | other_indices = set() 50 | for url in urls: 51 | if url in url_to_index: 52 | if current_index == -1: 53 | current_index = url_to_index[url] 54 | elif current_index != url_to_index[url]: 55 | other_indices.add(url_to_index[url]) 56 | if current_index == -1: 57 | current_index = len(index_to_urls) 58 | index_to_urls.append(set()) 59 | for url in urls: 60 | url_to_index[url] = current_index 61 | index_to_urls[current_index].add(url) 62 | for index in other_indices: 63 | for url in index_to_urls[index]: 64 | index_to_urls[current_index].add(url) 65 | url_to_index[url] = current_index 66 | index_to_urls[index] = None 67 | 68 | if counter % 100000 == 0: 69 | print(' > processed {} lines in {} seconds ...'.format( 70 | counter, time.time() - start_time)) 71 | 72 | 73 | total_remove = 0 74 | total_remain = 0 75 | for urls in index_to_urls: 76 | if urls is not None: 77 | if len(urls) > 1: 78 | total_remove += (len(urls) - 1) 79 | total_remain += 1 80 | print('out of {} urls, only {} are unique and {} should be removed'.format( 81 | total_remove+total_remain, total_remain, total_remove)) 82 | 83 | with open(output, 'wb') as f: 84 | for i, urls in enumerate(index_to_urls): 85 | if urls is not None: 86 | if len(urls) > 1: 87 | myjson = json.dumps({str(i): list(urls)}, 88 | 
ensure_ascii=False) 89 | f.write(myjson.encode('utf-8')) 90 | f.write('\n'.encode('utf-8')) 91 | -------------------------------------------------------------------------------- /MolBART/eval_megatron_retrieval_controlled.sh: -------------------------------------------------------------------------------- 1 | GPUS_PER_NODE=1 # 4 2 | # Change for multinode config 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=6003 5 | NNODES=1 6 | NODE_RANK=0 7 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 8 | 9 | export DLWS_NUM_WORKER=${NNODES} 10 | export DLWS_NUM_GPU_PER_WORKER=${GPUS_PER_NODE} 11 | 12 | script_path=$(realpath $0) 13 | script_dir=$(dirname $script_path) 14 | # config_json="$script_dir/megatron_molbart/ds_config.json" 15 | config_json="megatron_molbart/ds_config.json" 16 | 17 | #ZeRO Configs 18 | stage=1 19 | reduce_scatter=true 20 | contigious_gradients=false 21 | rbs=50000000 22 | agbs=5000000000 23 | 24 | chkp_layers=1 25 | PA=true 26 | PA_CPU=false 27 | CC=true 28 | SYNCHRONIZE=true 29 | PROFILE=false 30 | 31 | # Megatron Model Parallelism 32 | mp_size=1 33 | # DeepSpeed Pipeline parallelism 34 | pp_size=0 35 | 36 | 37 | ####### 38 | ## JACKMOD: add two options: 1 for data, 1 for tensorboard 39 | megatron_options=" \ 40 | --model-parallel-size ${mp_size} \ 41 | --pipe-parallel-size ${pp_size} \ 42 | --num-layers 4 \ 43 | --hidden-size 256 \ 44 | --num-attention-heads 8 \ 45 | --seq-length 512 \ 46 | --max-position-embeddings 512 \ 47 | --batch-size 320 \ 48 | --gas 16 \ 49 | --train-iters 320000 \ 50 | --lr-decay-iters 320000 \ 51 | --data-impl mmap \ 52 | --distributed-backend nccl \ 53 | --lr 0.0001 \ 54 | --lr-decay-style cosine \ 55 | --min-lr 1.0e-5 \ 56 | --weight-decay 0 \ 57 | --clip-grad 1.0 \ 58 | --warmup 0.01 \ 59 | --checkpoint-activations \ 60 | --log-interval 1 \ 61 | --save-interval 1000 \ 62 | --eval-interval 100000 \ 63 | --eval-iters 10 \ 64 | --save megatron_molbart_100m_checkpoint 65 | --dataset_path ../data/zinc.tab 66 | --load 
/mol-gen/drug/models/megamolbart/checkpoints 67 | " 68 | 69 | deepspeed_options=" \ 70 | --deepspeed \ 71 | --deepspeed_config ${config_json} \ 72 | --zero-stage ${stage} \ 73 | --zero-reduce-bucket-size ${rbs} \ 74 | --zero-allgather-bucket-size ${agbs} 75 | " 76 | 77 | if [ "${contigious_gradients}" = "true" ]; then 78 | deepspeed_options="${deepspeed_options} \ 79 | --zero-contigious-gradients" 80 | fi 81 | 82 | if [ "${reduce_scatter}" = "true" ]; then 83 | deepspeed_options="${deepspeed_options} \ 84 | --zero-reduce-scatter" 85 | fi 86 | 87 | chkp_opt=" \ 88 | --checkpoint-activations \ 89 | --checkpoint-num-layers ${chkp_layers}" 90 | 91 | if [ "${PA}" = "true" ]; then 92 | chkp_opt="${chkp_opt} \ 93 | --partition-activations" 94 | fi 95 | 96 | if [ "${PA_CPU}" = "true" ]; then 97 | chkp_opt="${chkp_opt} \ 98 | --checkpoint-in-cpu" 99 | fi 100 | 101 | if [ "${SYNCHRONIZE}" = "true" ]; then 102 | chkp_opt="${chkp_opt} \ 103 | --synchronize-each-layer" 104 | fi 105 | 106 | if [ "${CC}" = "true" ]; then 107 | chkp_opt="${chkp_opt} \ 108 | --contigious-checkpointing" 109 | fi 110 | 111 | if [ "${PROFILE}" = "true" ]; then 112 | chkp_opt="${chkp_opt} \ 113 | --profile-backward" 114 | fi 115 | 116 | full_options="${megatron_options} ${deepspeed_options} ${chkp_opt}" 117 | 118 | run_cmd="deepspeed --include localhost:4 --master_port=${MASTER_PORT} megatron_molbart/eval_retrieval_controlled.py $@ ${full_options}" 119 | echo ${run_cmd} 120 | eval ${run_cmd} 121 | 122 | set +x -------------------------------------------------------------------------------- /inference/cheminformatics/common/cuchemcommon/utils/logger.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/NVIDIA/cheminformatics/blob/master/common/cuchemcommon/utils/logger.py 4 | # 5 | # The license for the original version of this file 
can be 6 | # found in this directory (LICENSE_CHEMINFORMATICS). 7 | # --------------------------------------------------------------- 8 | 9 | import logging 10 | import os 11 | from datetime import datetime 12 | 13 | from cuchemcommon.context import Context 14 | 15 | from .sysinfo import get_machine_config, print_machine_config 16 | 17 | BENCHMARK_FILE = '/data/benchmark.csv' 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | def initialize_logfile(benchmark_file=BENCHMARK_FILE): 23 | """Initialize benchmark file with header if needed""" 24 | 25 | config = get_machine_config() 26 | config_message = print_machine_config(config) 27 | 28 | if not os.path.exists(benchmark_file): 29 | with open(benchmark_file, 'w') as fh: 30 | fh.write(f'# {config_message}\n') 31 | fh.write('date,benchmark_type,step,time(hh:mm:ss.ms),n_molecules,n_workers,metric_name,metric_value\n') 32 | return benchmark_file 33 | 34 | 35 | class MetricsLogger(object): 36 | 37 | def __init__(self, 38 | task_name, 39 | n_molecules): 40 | 41 | self.task_name = task_name 42 | self.n_molecules = n_molecules 43 | self.start_time = None 44 | self.metric_name = None 45 | self.metric_value = None 46 | 47 | self.metric_func = None 48 | self.metric_func_args = None 49 | self.metric_func_kwargs = {} 50 | 51 | def __enter__(self): 52 | self.start_time = datetime.now() 53 | 54 | return self 55 | 56 | def __exit__(self, type, value, traceback): 57 | context = Context() 58 | 59 | runtime = datetime.now() - self.start_time 60 | logger.info('### Runtime {} time (hh:mm:ss.ms) {}'.format(self.task_name, runtime)) 61 | n_workers = len(context.dask_client.cluster.workers) 62 | 63 | if self.metric_func and context.is_benchmark: 64 | self.metric_value = self.metric_func(*self.metric_func_args, 65 | **self.metric_func_kwargs) 66 | 67 | if self.metric_value is None: 68 | self.metric_name = '' 69 | self.metric_value = '' 70 | else: 71 | logger.info('Calculated {} is {}'.format(self.metric_name, self.metric_value)) 72 
| 73 | log_results(self.start_time, context.compute_type, self.task_name, 74 | runtime, 75 | n_molecules=self.n_molecules, 76 | n_workers=n_workers, 77 | metric_name=self.metric_name, 78 | metric_value=self.metric_value, 79 | benchmark_file=context.benchmark_file) 80 | 81 | 82 | def log_results(date, 83 | benchmark_type, 84 | step, 85 | time, 86 | n_molecules, 87 | n_workers, 88 | metric_name='', 89 | metric_value='', 90 | benchmark_file=BENCHMARK_FILE): 91 | """Log benchmark results to a file""" 92 | 93 | out_list = [date, benchmark_type, step, time, n_molecules, n_workers, metric_name, metric_value] 94 | out_fmt = ','.join(['{}'] * len(out_list)) + '\n' 95 | 96 | with open(benchmark_file, 'a') as fh: 97 | out_string = out_fmt.format(*out_list) 98 | fh.write(out_string) 99 | -------------------------------------------------------------------------------- /inference/cheminformatics/common/cuchemcommon/fingerprint.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/NVIDIA/cheminformatics/blob/master/common/cuchemcommon/fingerprint.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_CHEMINFORMATICS). 
7 | # --------------------------------------------------------------- 8 | 9 | import logging 10 | import os 11 | from abc import ABC 12 | from enum import Enum 13 | 14 | import numpy as np 15 | import pandas as pd 16 | from cddd.inference import InferenceModel 17 | from cuchem.utils.data_peddler import download_cddd_models 18 | from rdkit import Chem 19 | from rdkit.Chem import AllChem 20 | 21 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | def calc_morgan_fingerprints(dataframe, smiles_col='canonical_smiles'): 26 | """Calculate Morgan fingerprints on SMILES strings 27 | 28 | Args: 29 | dataframe (pd.DataFrame): dataframe containing a SMILES column for calculation 30 | 31 | Returns: 32 | pd.DataFrame: new dataframe containing fingerprints 33 | """ 34 | mf = MorganFingerprint() 35 | fp = mf.transform(dataframe, col_name=smiles_col) 36 | fp = pd.DataFrame(fp) 37 | fp.index = dataframe.index 38 | return fp 39 | 40 | 41 | class TransformationDefaults(Enum): 42 | MorganFingerprint = {'radius': 2, 'nBits': 512} 43 | Embeddings = {} 44 | 45 | 46 | class BaseTransformation(ABC): 47 | def __init__(self, **kwargs): 48 | self.name = None 49 | self.kwargs = None 50 | self.func = None 51 | 52 | def transform(self, data): 53 | return NotImplemented 54 | 55 | def transform_many(self, data): 56 | return list(map(self.transform, data)) 57 | 58 | def __len__(self): 59 | return NotImplemented 60 | 61 | 62 | class MorganFingerprint(BaseTransformation): 63 | 64 | def __init__(self, **kwargs): 65 | self.name = __class__.__name__.split('.')[-1] 66 | self.kwargs = TransformationDefaults[self.name].value 67 | self.kwargs.update(kwargs) 68 | self.func = AllChem.GetMorganFingerprintAsBitVect 69 | 70 | def transform(self, data, col_name='transformed_smiles'): 71 | data = data[col_name] 72 | fp_array = [] 73 | for mol in data: 74 | m = Chem.MolFromSmiles(mol) 75 | fp = self.func(m, **self.kwargs) 76 | fp_array.append(list(fp.ToBitString())) 
77 | fp_array = np.asarray(fp_array) 78 | return fp_array 79 | 80 | def __len__(self): 81 | return self.kwargs['nBits'] 82 | 83 | 84 | class Embeddings(BaseTransformation): 85 | 86 | def __init__(self, use_gpu=True, cpu_threads=5, model_dir=None, **kwargs): 87 | self.name = __class__.__name__.split('.')[-1] 88 | self.kwargs = TransformationDefaults[self.name].value 89 | self.kwargs.update(kwargs) 90 | model_dir = download_cddd_models() 91 | self.func = InferenceModel(model_dir, use_gpu=use_gpu, cpu_threads=cpu_threads) 92 | 93 | def transform(self, data): 94 | data = data['transformed_smiles'] 95 | return self.func.seq_to_emb(data).squeeze() 96 | 97 | def inverse_transform(self, embeddings): 98 | "Embedding array -- individual compound embeddings are in rows" 99 | embeddings = np.asarray(embeddings) 100 | return self.func.emb_to_seq(embeddings) 101 | 102 | def __len__(self): 103 | return self.func.hparams.emb_size 104 | -------------------------------------------------------------------------------- /guacamol/guacamol/guacamol/assess_goal_directed_generation.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # Taken from the following link as is from: 3 | # https://github.com/BenevolentAI/guacamol/blob/master/guacamol/assess_goal_directed_generation.py 4 | # 5 | # The license for the original version of this file can be 6 | # found in this directory (LICENSE_GUACAMOL). 
7 | # --------------------------------------------------------------- 8 | 9 | import datetime 10 | import json 11 | import logging 12 | from collections import OrderedDict 13 | from typing import List, Any, Dict 14 | 15 | import guacamol 16 | from guacamol.goal_directed_benchmark import GoalDirectedBenchmark, GoalDirectedBenchmarkResult 17 | from guacamol.goal_directed_generator import GoalDirectedGenerator 18 | from guacamol.benchmark_suites import goal_directed_benchmark_suite 19 | from guacamol.utils.data import get_time_string 20 | 21 | logger = logging.getLogger(__name__) 22 | logger.addHandler(logging.NullHandler()) 23 | 24 | from pdb import set_trace 25 | 26 | def assess_goal_directed_generation(goal_directed_molecule_generator: GoalDirectedGenerator, 27 | json_output_file='output_goal_directed.json', 28 | benchmark_version='v1') -> None: 29 | """ 30 | Assesses a distribution-matching model for de novo molecule design. 31 | 32 | Args: 33 | goal_directed_molecule_generator: Model to evaluate 34 | json_output_file: Name of the file where to save the results in JSON format 35 | benchmark_version: which benchmark suite to execute 36 | """ 37 | logger.info(f'Benchmarking goal-directed molecule generation, version {benchmark_version}') 38 | benchmarks = goal_directed_benchmark_suite(version_name=benchmark_version) 39 | set_trace() 40 | 41 | results = _evaluate_goal_directed_benchmarks( 42 | goal_directed_molecule_generator=goal_directed_molecule_generator, 43 | benchmarks=benchmarks) 44 | 45 | benchmark_results: Dict[str, Any] = OrderedDict() 46 | benchmark_results['guacamol_version'] = guacamol.__version__ 47 | benchmark_results['benchmark_suite_version'] = benchmark_version 48 | benchmark_results['timestamp'] = get_time_string() 49 | benchmark_results['results'] = [vars(result) for result in results] 50 | 51 | logger.info(f'Save results to file {json_output_file}') 52 | with open(json_output_file, 'wt') as f: 53 | f.write(json.dumps(benchmark_results, indent=4)) 
54 | 55 | 56 | def _evaluate_goal_directed_benchmarks(goal_directed_molecule_generator: GoalDirectedGenerator, 57 | benchmarks: List[GoalDirectedBenchmark] 58 | ) -> List[GoalDirectedBenchmarkResult]: 59 | """ 60 | Evaluate a model with the given benchmarks. 61 | Should not be called directly except for testing purposes. 62 | 63 | Args: 64 | goal_directed_molecule_generator: model to assess 65 | benchmarks: list of benchmarks to evaluate 66 | json_output_file: Name of the file where to save the results in JSON format 67 | """ 68 | 69 | logger.info(f'Number of benchmarks: {len(benchmarks)}') 70 | 71 | results = [] 72 | for i, benchmark in enumerate(benchmarks, 1): 73 | logger.info(f'Running benchmark {i}/{len(benchmarks)}: {benchmark.name}') 74 | result = benchmark.assess_model(goal_directed_molecule_generator) 75 | logger.info(f'Results for the benchmark "{result.benchmark_name}":') 76 | logger.info(f' Score: {result.score:.6f}') 77 | logger.info(f' Execution time: {str(datetime.timedelta(seconds=int(result.execution_time)))}') 78 | logger.info(f' Metadata: {result.metadata}') 79 | results.append(result) 80 | 81 | logger.info('Finished execution of the benchmarks') 82 | 83 | return results 84 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/mpu/tests/test_initialize.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from commons import print_separator 17 | from commons import initialize_distributed 18 | import mpu 19 | import torch 20 | import sys 21 | sys.path.append("../..") 22 | 23 | 24 | def test_initialize_model_parallel(model_parallel_size): 25 | 26 | if torch.distributed.get_rank() == 0: 27 | print('> testing initialize_model_parallel with size {} ...'.format( 28 | model_parallel_size)) 29 | model_parallel_size_ = min(model_parallel_size, 30 | torch.distributed.get_world_size()) 31 | assert not mpu.model_parallel_is_initialized() 32 | mpu.initialize_model_parallel(model_parallel_size_) 33 | assert mpu.model_parallel_is_initialized() 34 | 35 | # Checks. 36 | def check(group, world_size, rank): 37 | assert world_size == torch.distributed.get_world_size(group=group) 38 | assert rank == torch.distributed.get_rank(group=group) 39 | 40 | # Model parallel. 41 | world_size = model_parallel_size_ 42 | rank = torch.distributed.get_rank() % model_parallel_size_ 43 | assert world_size == mpu.get_model_parallel_world_size() 44 | assert rank == mpu.get_model_parallel_rank() 45 | check(mpu.get_model_parallel_group(), world_size, rank) 46 | 47 | # Data parallel. 
48 | world_size = torch.distributed.get_world_size() // model_parallel_size_ 49 | rank = torch.distributed.get_rank() // model_parallel_size 50 | assert world_size == mpu.get_data_parallel_world_size() 51 | assert rank == mpu.get_data_parallel_rank() 52 | check(mpu.get_data_parallel_group(), world_size, rank) 53 | 54 | # Reset groups 55 | mpu.destroy_model_parallel() 56 | 57 | torch.distributed.barrier() 58 | if torch.distributed.get_rank() == 0: 59 | print('>> passed the test :-)') 60 | 61 | 62 | def test_get_model_parallel_src_rank(model_parallel_size_): 63 | 64 | if torch.distributed.get_rank() == 0: 65 | print('> testing get_model_parallel_src_rank with size {} ...'.format( 66 | model_parallel_size_)) 67 | model_parallel_size = min(model_parallel_size_, 68 | torch.distributed.get_world_size()) 69 | assert not mpu.model_parallel_is_initialized() 70 | mpu.initialize_model_parallel(model_parallel_size) 71 | assert mpu.model_parallel_is_initialized() 72 | 73 | # Checks 74 | src_rank = torch.distributed.get_rank() - mpu.get_model_parallel_rank() 75 | assert mpu.get_model_parallel_src_rank() == src_rank 76 | 77 | # Reset groups 78 | mpu.destroy_model_parallel() 79 | 80 | torch.distributed.barrier() 81 | if torch.distributed.get_rank() == 0: 82 | print('>> passed the test :-)') 83 | 84 | 85 | if __name__ == '__main__': 86 | 87 | initialize_distributed() 88 | world_size = torch.distributed.get_world_size() 89 | model_parallel_size = 1 90 | while model_parallel_size <= world_size: 91 | print_separator('test initialize model parallel') 92 | test_initialize_model_parallel(model_parallel_size) 93 | print_separator('test model parallel source rank') 94 | test_get_model_parallel_src_rank(model_parallel_size) 95 | model_parallel_size *= 2 96 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/build/lib/megatron/mpu/tests/test_initialize.py: 
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from commons import print_separator 17 | from commons import initialize_distributed 18 | import mpu 19 | import torch 20 | import sys 21 | sys.path.append("../..") 22 | 23 | 24 | def test_initialize_model_parallel(model_parallel_size): 25 | 26 | if torch.distributed.get_rank() == 0: 27 | print('> testing initialize_model_parallel with size {} ...'.format( 28 | model_parallel_size)) 29 | model_parallel_size_ = min(model_parallel_size, 30 | torch.distributed.get_world_size()) 31 | assert not mpu.model_parallel_is_initialized() 32 | mpu.initialize_model_parallel(model_parallel_size_) 33 | assert mpu.model_parallel_is_initialized() 34 | 35 | # Checks. 36 | def check(group, world_size, rank): 37 | assert world_size == torch.distributed.get_world_size(group=group) 38 | assert rank == torch.distributed.get_rank(group=group) 39 | 40 | # Model parallel. 41 | world_size = model_parallel_size_ 42 | rank = torch.distributed.get_rank() % model_parallel_size_ 43 | assert world_size == mpu.get_model_parallel_world_size() 44 | assert rank == mpu.get_model_parallel_rank() 45 | check(mpu.get_model_parallel_group(), world_size, rank) 46 | 47 | # Data parallel. 
48 | world_size = torch.distributed.get_world_size() // model_parallel_size_ 49 | rank = torch.distributed.get_rank() // model_parallel_size 50 | assert world_size == mpu.get_data_parallel_world_size() 51 | assert rank == mpu.get_data_parallel_rank() 52 | check(mpu.get_data_parallel_group(), world_size, rank) 53 | 54 | # Reset groups 55 | mpu.destroy_model_parallel() 56 | 57 | torch.distributed.barrier() 58 | if torch.distributed.get_rank() == 0: 59 | print('>> passed the test :-)') 60 | 61 | 62 | def test_get_model_parallel_src_rank(model_parallel_size_): 63 | 64 | if torch.distributed.get_rank() == 0: 65 | print('> testing get_model_parallel_src_rank with size {} ...'.format( 66 | model_parallel_size_)) 67 | model_parallel_size = min(model_parallel_size_, 68 | torch.distributed.get_world_size()) 69 | assert not mpu.model_parallel_is_initialized() 70 | mpu.initialize_model_parallel(model_parallel_size) 71 | assert mpu.model_parallel_is_initialized() 72 | 73 | # Checks 74 | src_rank = torch.distributed.get_rank() - mpu.get_model_parallel_rank() 75 | assert mpu.get_model_parallel_src_rank() == src_rank 76 | 77 | # Reset groups 78 | mpu.destroy_model_parallel() 79 | 80 | torch.distributed.barrier() 81 | if torch.distributed.get_rank() == 0: 82 | print('>> passed the test :-)') 83 | 84 | 85 | if __name__ == '__main__': 86 | 87 | initialize_distributed() 88 | world_size = torch.distributed.get_world_size() 89 | model_parallel_size = 1 90 | while model_parallel_size <= world_size: 91 | print_separator('test initialize model parallel') 92 | test_initialize_model_parallel(model_parallel_size) 93 | print_separator('test model parallel source rank') 94 | test_get_model_parallel_src_rank(model_parallel_size) 95 | model_parallel_size *= 2 96 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_pretrain_gpt2.sh: 
-------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | export DLWS_NUM_WORKER=${NNODES} 12 | export DLWS_NUM_GPU_PER_WORKER=${GPUS_PER_NODE} 13 | 14 | DATA_PATH=data/webtext/webtext_text_document 15 | VOCAB_PATH=data/gpt2-vocab.json 16 | MERGE_PATH=data/gpt2-merges.txt 17 | CHECKPOINT_PATH=checkpoints/gpt2_345m_ds 18 | 19 | script_path=$(realpath $0) 20 | script_dir=$(dirname $script_path) 21 | config_json="$script_dir/ds_zero_stage_2_config.json" 22 | 23 | # Megatron Model Parallelism 24 | mp_size=4 25 | 26 | NLAYERS=24 27 | NHIDDEN=1024 28 | BATCHSIZE=9 29 | LOGDIR="tensorboard_data/${NLAYERS}l_${NHIDDEN}h_${NNODES}n_${GPUS_PER_NODE}g_${mp_size}mp_${BATCHSIZE}b_ds4" 30 | 31 | #ZeRO Configs 32 | stage=0 33 | reduce_scatter=true 34 | contigious_gradients=true 35 | rbs=50000000 36 | agbs=5000000000 37 | 38 | #Actication Checkpointing and Contigious Memory 39 | chkp_layers=1 40 | PA=true 41 | PA_CPU=false 42 | CC=true 43 | SYNCHRONIZE=true 44 | PROFILE=false 45 | 46 | 47 | gpt_options=" \ 48 | --model-parallel-size ${mp_size} \ 49 | --num-layers $NLAYERS \ 50 | --hidden-size $NHIDDEN \ 51 | --num-attention-heads 16 \ 52 | --seq-length 1024 \ 53 | --max-position-embeddings 1024 \ 54 | --batch-size $BATCHSIZE \ 55 | --train-iters 320000 \ 56 | --lr-decay-iters 320000 \ 57 | --save $CHECKPOINT_PATH \ 58 | --load $CHECKPOINT_PATH \ 59 | --data-path $DATA_PATH \ 60 | --vocab-file $VOCAB_PATH \ 61 | --merge-file $MERGE_PATH \ 62 | --data-impl mmap \ 63 | --split 949,50,1 \ 64 | --distributed-backend nccl \ 65 | --lr 1.5e-4 \ 66 | --lr-decay-style cosine \ 67 | --min-lr 1.0e-5 \ 68 | --weight-decay 1e-2 \ 69 | --clip-grad 1.0 \ 70 | --warmup 0.01 \ 71 | --checkpoint-activations \ 72 | --log-interval 100 \ 73 | --save-interval 10000 \ 74 | 
--eval-interval 1000 \ 75 | --eval-iters 10 \ 76 | --fp16 \ 77 | --tensorboard-dir ${LOGDIR} 78 | " 79 | 80 | deepspeed_options=" \ 81 | --deepspeed \ 82 | --deepspeed_config ${config_json} \ 83 | --zero-stage ${stage} \ 84 | --zero-reduce-bucket-size ${rbs} \ 85 | --zero-allgather-bucket-size ${agbs} 86 | " 87 | 88 | if [ "${contigious_gradients}" = "true" ]; then 89 | deepspeed_options="${deepspeed_options} \ 90 | --zero-contigious-gradients" 91 | fi 92 | 93 | if [ "${reduce_scatter}" = "true" ]; then 94 | deepspeed_options="${deepspeed_options} \ 95 | --zero-reduce-scatter" 96 | fi 97 | 98 | chkp_opt=" \ 99 | --checkpoint-activations \ 100 | --checkpoint-num-layers ${chkp_layers}" 101 | 102 | if [ "${PA}" = "true" ]; then 103 | chkp_opt="${chkp_opt} \ 104 | --partition-activations" 105 | fi 106 | 107 | if [ "${PA_CPU}" = "true" ]; then 108 | chkp_opt="${chkp_opt} \ 109 | --checkpoint-in-cpu" 110 | fi 111 | 112 | if [ "${SYNCHRONIZE}" = "true" ]; then 113 | chkp_opt="${chkp_opt} \ 114 | --synchronize-each-layer" 115 | fi 116 | 117 | if [ "${CC}" = "true" ]; then 118 | chkp_opt="${chkp_opt} \ 119 | --contigious-checkpointing" 120 | fi 121 | 122 | if [ "${PROFILE}" = "true" ]; then 123 | chkp_opt="${chkp_opt} \ 124 | --profile-backward" 125 | fi 126 | 127 | full_options="${gpt_options} ${deepspeed_options} ${chkp_opt}" 128 | 129 | run_cmd="deepspeed --num_nodes ${DLWS_NUM_WORKER} --num_gpus ${DLWS_NUM_GPU_PER_WORKER} pretrain_gpt2.py $@ ${full_options}" 130 | echo ${run_cmd} 131 | eval ${run_cmd} 132 | 133 | set +x 134 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/megatron/fused_kernels/scaled_masked_softmax_cuda.cu: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "THC/THC.h" 23 | #include 24 | #include 25 | #include "scaled_masked_softmax.h" 26 | 27 | namespace multihead_attn { 28 | namespace fused_softmax { 29 | namespace scaled_masked_softmax { 30 | 31 | torch::Tensor fwd_cuda( 32 | torch::Tensor const& input, 33 | torch::Tensor const& mask, 34 | float scale_factor) 35 | { 36 | // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 37 | const int batches = input.size(0); 38 | const int pad_batches = mask.size(0); 39 | const int attn_heads = input.size(1); 40 | const int seq_len = input.size(2); 41 | TORCH_INTERNAL_ASSERT(seq_len <= 2048); 42 | TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches); 43 | TORCH_INTERNAL_ASSERT(mask.size(1) == 1); 44 | TORCH_INTERNAL_ASSERT(mask.size(2) == seq_len); 45 | TORCH_INTERNAL_ASSERT(mask.size(3) == seq_len); 46 | 47 | // Output 48 | auto act_options = input.options().requires_grad(false); 49 | torch::Tensor softmax_results = 50 | torch::empty({batches, attn_heads, seq_len, seq_len}, act_options); 51 | 52 | // Softmax Intermediate Result Ptr 53 | void* input_ptr = static_cast(input.data_ptr()); 54 | void* mask_ptr = static_cast(mask.data_ptr()); 55 | void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); 56 | 57 | dispatch_scaled_masked_softmax_forward( 
58 | reinterpret_cast(softmax_results_ptr), 59 | reinterpret_cast(input_ptr), 60 | reinterpret_cast(mask_ptr), 61 | scale_factor, 62 | seq_len, 63 | seq_len, 64 | batches, 65 | attn_heads, 66 | pad_batches); 67 | return softmax_results; 68 | } 69 | 70 | torch::Tensor bwd_cuda( 71 | torch::Tensor const& output_grads_, 72 | torch::Tensor const& softmax_results_, 73 | float scale_factor) { 74 | 75 | auto output_grads = output_grads_.contiguous(); 76 | auto softmax_results = softmax_results_.contiguous(); 77 | 78 | //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 79 | const int batches = output_grads.size(0); 80 | const int attn_heads = output_grads.size(1); 81 | const int seq_len = output_grads.size(2); 82 | TORCH_INTERNAL_ASSERT(output_grads.size(2) == output_grads.size(3)); 83 | 84 | void* output_grads_ptr = static_cast(output_grads.data_ptr()); 85 | 86 | //Softmax Grad 87 | dispatch_scaled_masked_softmax_backward( 88 | reinterpret_cast(output_grads_ptr), 89 | reinterpret_cast(output_grads_ptr), 90 | reinterpret_cast(softmax_results.data_ptr()), 91 | scale_factor, 92 | seq_len, 93 | seq_len, 94 | batches, 95 | attn_heads); 96 | 97 | //backward pass is completely in-place 98 | return output_grads; 99 | } 100 | } 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /MolBART/eval_megatron_retrieval.sh: -------------------------------------------------------------------------------- 1 | GPUS_PER_NODE=1 # 4 2 | # Change for multinode config 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=6004 5 | NNODES=1 6 | NODE_RANK=0 7 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 8 | 9 | export DLWS_NUM_WORKER=${NNODES} 10 | export DLWS_NUM_GPU_PER_WORKER=${GPUS_PER_NODE} 11 | 12 | script_path=$(realpath $0) 13 | script_dir=$(dirname $script_path) 14 | # config_json="$script_dir/megatron_molbart/ds_config.json" 15 | config_json="megatron_molbart/ds_config.json" 16 | 17 | #ZeRO Configs 18 | stage=1 19 | 
reduce_scatter=true 20 | contigious_gradients=false 21 | rbs=50000000 22 | agbs=5000000000 23 | 24 | chkp_layers=1 25 | PA=true 26 | PA_CPU=false 27 | CC=true 28 | SYNCHRONIZE=true 29 | PROFILE=false 30 | 31 | # Megatron Model Parallelism 32 | mp_size=1 33 | # DeepSpeed Pipeline parallelism 34 | pp_size=0 35 | 36 | 37 | ####### 38 | ## JACKMOD: add two options: 1 for data, 1 for tensorboard 39 | megatron_options=" \ 40 | --model-parallel-size ${mp_size} \ 41 | --pipe-parallel-size ${pp_size} \ 42 | --num-layers 4 \ 43 | --hidden-size 256 \ 44 | --num-attention-heads 8 \ 45 | --seq-length 512 \ 46 | --max-position-embeddings 512 \ 47 | --batch-size 320 \ 48 | --gas 16 \ 49 | --train-iters 320000 \ 50 | --lr-decay-iters 320000 \ 51 | --data-impl mmap \ 52 | --distributed-backend nccl \ 53 | --lr 0.0001 \ 54 | --lr-decay-style cosine \ 55 | --min-lr 1.0e-5 \ 56 | --weight-decay 0 \ 57 | --clip-grad 1.0 \ 58 | --warmup 0.01 \ 59 | --checkpoint-activations \ 60 | --log-interval 1 \ 61 | --save-interval 1000 \ 62 | --eval-interval 100000 \ 63 | --eval-iters 10 \ 64 | --save megatron_molbart_100m_checkpoint 65 | --dataset_path ../data/zinc.tab 66 | --load /mol-gen/drug/models/megamolbart/checkpoints 67 | " 68 | 69 | deepspeed_options=" \ 70 | --deepspeed \ 71 | --deepspeed_config ${config_json} \ 72 | --zero-stage ${stage} \ 73 | --zero-reduce-bucket-size ${rbs} \ 74 | --zero-allgather-bucket-size ${agbs} 75 | " 76 | 77 | if [ "${contigious_gradients}" = "true" ]; then 78 | deepspeed_options="${deepspeed_options} \ 79 | --zero-contigious-gradients" 80 | fi 81 | 82 | if [ "${reduce_scatter}" = "true" ]; then 83 | deepspeed_options="${deepspeed_options} \ 84 | --zero-reduce-scatter" 85 | fi 86 | 87 | chkp_opt=" \ 88 | --checkpoint-activations \ 89 | --checkpoint-num-layers ${chkp_layers}" 90 | 91 | if [ "${PA}" = "true" ]; then 92 | chkp_opt="${chkp_opt} \ 93 | --partition-activations" 94 | fi 95 | 96 | if [ "${PA_CPU}" = "true" ]; then 97 | chkp_opt="${chkp_opt} \ 98 | 
--checkpoint-in-cpu" 99 | fi 100 | 101 | if [ "${SYNCHRONIZE}" = "true" ]; then 102 | chkp_opt="${chkp_opt} \ 103 | --synchronize-each-layer" 104 | fi 105 | 106 | if [ "${CC}" = "true" ]; then 107 | chkp_opt="${chkp_opt} \ 108 | --contigious-checkpointing" 109 | fi 110 | 111 | if [ "${PROFILE}" = "true" ]; then 112 | chkp_opt="${chkp_opt} \ 113 | --profile-backward" 114 | fi 115 | 116 | full_options="${megatron_options} ${deepspeed_options} ${chkp_opt}" 117 | 118 | 119 | custom_train_options=" \ 120 | --stage 1 \ 121 | --train_from pretrain \ 122 | --model_ckpt_itr 134000 \ 123 | --attr logp-sa \ 124 | --attr_offset 0 \ 125 | --data_source jtnn \ 126 | --enumeration_input false \ 127 | --retriever_rule random \ 128 | --pred_target nearestn \ 129 | --n_retrievals 10 \ 130 | --n_neighbors 100 131 | " 132 | 133 | 134 | run_cmd="deepspeed --include localhost:0 --master_port=${MASTER_PORT} megatron_molbart/eval_retrieval.py $@ ${full_options} ${custom_train_options}" 135 | echo ${run_cmd} 136 | eval ${run_cmd} 137 | 138 | set +x -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_pretrain_gpt2_pipe.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | GPUS_PER_NODE=16 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | export DLWS_NUM_WORKER=${NNODES} 12 | export DLWS_NUM_GPU_PER_WORKER=${GPUS_PER_NODE} 13 | 14 | DATA_PATH=data/webtext/webtext_text_document 15 | VOCAB_PATH=data/gpt2-vocab.json 16 | MERGE_PATH=data/gpt2-merges.txt 17 | CHECKPOINT_PATH=checkpoints/gpt2_345m_ds 18 | 19 | script_path=$(realpath $0) 20 | script_dir=$(dirname $script_path) 21 | #config_json="$script_dir/ds_zero_stage_2_config.json" 22 | config_json="$script_dir/ds_config.json" 23 | 24 | # Megatron Model Parallelism 25 | mp_size=2 26 | # DeepSpeed Pipeline parallelism 27 | pp_size=2 28 | 29 | NLAYERS=24 30 | NHIDDEN=1024 31 | BATCHSIZE=4 32 | LOGDIR="tensorboard_data/${NLAYERS}l_${NHIDDEN}h_${NNODES}n_${GPUS_PER_NODE}g_${pp_size}pp_${mp_size}mp_${BATCHSIZE}b_ds4" 33 | 34 | GAS=16 35 | 36 | #ZeRO Configs 37 | stage=0 38 | reduce_scatter=true 39 | contigious_gradients=true 40 | rbs=50000000 41 | agbs=5000000000 42 | 43 | #Actication Checkpointing and Contigious Memory 44 | chkp_layers=1 45 | PA=true 46 | PA_CPU=false 47 | CC=true 48 | SYNCHRONIZE=true 49 | PROFILE=false 50 | 51 | 52 | gpt_options=" \ 53 | --model-parallel-size ${mp_size} \ 54 | --pipe-parallel-size ${pp_size} \ 55 | --num-layers $NLAYERS \ 56 | --hidden-size $NHIDDEN \ 57 | --num-attention-heads 16 \ 58 | --seq-length 1024 \ 59 | --max-position-embeddings 1024 \ 60 | --batch-size $BATCHSIZE \ 61 | --gas $GAS \ 62 | --train-iters 320000 \ 63 | --lr-decay-iters 320000 \ 64 | --save $CHECKPOINT_PATH \ 65 | --load $CHECKPOINT_PATH \ 66 | --data-path $DATA_PATH \ 67 | --vocab-file $VOCAB_PATH \ 68 | --merge-file $MERGE_PATH \ 69 | --data-impl mmap \ 70 | --split 949,50,1 \ 71 | --distributed-backend nccl \ 72 | --lr 1.5e-4 \ 73 | --lr-decay-style cosine \ 74 | --min-lr 1.0e-5 \ 75 | --weight-decay 1e-2 \ 76 | --clip-grad 1.0 \ 77 | --warmup 0.01 \ 
78 | --checkpoint-activations \ 79 | --log-interval 1 \ 80 | --save-interval 500 \ 81 | --eval-interval 100 \ 82 | --eval-iters 10 \ 83 | --fp16 \ 84 | --tensorboard-dir ${LOGDIR} 85 | " 86 | 87 | deepspeed_options=" \ 88 | --deepspeed \ 89 | --deepspeed_config ${config_json} \ 90 | --zero-stage ${stage} \ 91 | --zero-reduce-bucket-size ${rbs} \ 92 | --zero-allgather-bucket-size ${agbs} 93 | " 94 | 95 | if [ "${contigious_gradients}" = "true" ]; then 96 | deepspeed_options="${deepspeed_options} \ 97 | --zero-contigious-gradients" 98 | fi 99 | 100 | if [ "${reduce_scatter}" = "true" ]; then 101 | deepspeed_options="${deepspeed_options} \ 102 | --zero-reduce-scatter" 103 | fi 104 | 105 | chkp_opt=" \ 106 | --checkpoint-activations \ 107 | --checkpoint-num-layers ${chkp_layers}" 108 | 109 | if [ "${PA}" = "true" ]; then 110 | chkp_opt="${chkp_opt} \ 111 | --partition-activations" 112 | fi 113 | 114 | if [ "${PA_CPU}" = "true" ]; then 115 | chkp_opt="${chkp_opt} \ 116 | --checkpoint-in-cpu" 117 | fi 118 | 119 | if [ "${SYNCHRONIZE}" = "true" ]; then 120 | chkp_opt="${chkp_opt} \ 121 | --synchronize-each-layer" 122 | fi 123 | 124 | if [ "${CC}" = "true" ]; then 125 | chkp_opt="${chkp_opt} \ 126 | --contigious-checkpointing" 127 | fi 128 | 129 | if [ "${PROFILE}" = "true" ]; then 130 | chkp_opt="${chkp_opt} \ 131 | --profile-backward" 132 | fi 133 | 134 | full_options="${gpt_options} ${deepspeed_options} ${chkp_opt}" 135 | 136 | run_cmd="deepspeed --num_nodes ${DLWS_NUM_WORKER} --num_gpus ${DLWS_NUM_GPU_PER_WORKER} pretrain_gpt2.py $@ ${full_options}" 137 | echo ${run_cmd} 138 | eval ${run_cmd} 139 | 140 | set +x 141 | -------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/tasks/data_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """ Tasks data utility.""" 17 | 18 | import re 19 | import numpy as np 20 | 21 | 22 | def clean_text(text): 23 | """Remove new lines and multiple spaces and adjust end of sentence dot.""" 24 | 25 | text = text.replace("\n", " ") 26 | text = re.sub(r'\s+', ' ', text) 27 | for _ in range(3): 28 | text = text.replace(' . ', '. ') 29 | 30 | return text 31 | 32 | 33 | def build_sample(ids, types, paddings, label, unique_id): 34 | """Convert to numpy and return a sample consumed by the batch producer.""" 35 | 36 | ids_np = np.array(ids, dtype=np.int64) 37 | types_np = np.array(types, dtype=np.int64) 38 | paddings_np = np.array(paddings, dtype=np.int64) 39 | sample = ({'text': ids_np, 40 | 'types': types_np, 41 | 'padding_mask': paddings_np, 42 | 'label': int(label), 43 | 'uid': int(unique_id)}) 44 | 45 | return sample 46 | 47 | 48 | def build_tokens_types_paddings_from_text(text_a, text_b, 49 | tokenizer, max_seq_length): 50 | """Build token types and paddings, trim if needed, and pad if needed.""" 51 | 52 | text_a_ids = tokenizer.tokenize(text_a) 53 | text_b_ids = None 54 | if text_b is not None: 55 | text_b_ids = tokenizer.tokenize(text_b) 56 | 57 | return build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, 58 | max_seq_length, tokenizer.cls, 59 | tokenizer.sep, tokenizer.pad) 60 | 61 | 62 | def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, 
max_seq_length, 63 | cls_id, sep_id, pad_id): 64 | """Build token types and paddings, trim if needed, and pad if needed.""" 65 | 66 | ids = [] 67 | types = [] 68 | paddings = [] 69 | 70 | # [CLS]. 71 | ids.append(cls_id) 72 | types.append(0) 73 | paddings.append(1) 74 | 75 | # A. 76 | len_text_a = len(text_a_ids) 77 | ids.extend(text_a_ids) 78 | types.extend([0] * len_text_a) 79 | paddings.extend([1] * len_text_a) 80 | 81 | # [SEP]. 82 | ids.append(sep_id) 83 | types.append(0) 84 | paddings.append(1) 85 | 86 | # B. 87 | if text_b_ids is not None: 88 | len_text_b = len(text_b_ids) 89 | ids.extend(text_b_ids) 90 | types.extend([1] * len_text_b) 91 | paddings.extend([1] * len_text_b) 92 | 93 | # Cap the size. 94 | trimmed = False 95 | if len(ids) >= max_seq_length: 96 | max_seq_length_m1 = max_seq_length - 1 97 | ids = ids[0:max_seq_length_m1] 98 | types = types[0:max_seq_length_m1] 99 | paddings = paddings[0:max_seq_length_m1] 100 | trimmed = True 101 | 102 | # [SEP]. 103 | if (text_b_ids is not None) or trimmed: 104 | ids.append(sep_id) 105 | if text_b_ids is None: 106 | types.append(0) 107 | else: 108 | types.append(1) 109 | paddings.append(1) 110 | 111 | # Padding. 112 | padding_length = max_seq_length - len(ids) 113 | if padding_length > 0: 114 | ids.extend([pad_id] * padding_length) 115 | types.extend([pad_id] * padding_length) 116 | paddings.extend([0] * padding_length) 117 | 118 | return ids, types, paddings 119 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | NVIDIA Source Code License for RetMol 2 | 3 | 1. Definitions 4 | 5 | “Licensor” means any person or entity that distributes its Work. 6 | 7 | “Software” means the original work of authorship made available under this License. 8 | 9 | “Work” means the Software and any additions to or derivative works of the Software that are made available under 10 | this License. 
11 | 12 | The terms “reproduce,” “reproduction,” “derivative works,” and “distribution” have the meaning as provided under 13 | U.S. copyright law; provided, however, that for the purposes of this License, derivative works shall not include 14 | works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work. 15 | 16 | Works, including the Software, are “made available” under this License by including in or with the Work either 17 | (a) a copyright notice referencing the applicability of this License to the Work, or (b) a copy of this License. 18 | 19 | 2. License Grant 20 | 21 | 2.1 Copyright Grant. Subject to the terms and conditions of this License, each Licensor grants to you a perpetual, 22 | worldwide, non-exclusive, royalty-free, copyright license to reproduce, prepare derivative works of, publicly 23 | display, publicly perform, sublicense and distribute its Work and any resulting derivative works in any form. 24 | 25 | 3. Limitations 26 | 27 | 3.1 Redistribution. You may reproduce or distribute the Work only if (a) you do so under this License, (b) you 28 | include a complete copy of this License with your distribution, and (c) you retain without modification any 29 | copyright, patent, trademark, or attribution notices that are present in the Work. 30 | 31 | 3.2 Derivative Works. You may specify that additional or different terms apply to the use, reproduction, and 32 | distribution of your derivative works of the Work (“Your Terms”) only if (a) Your Terms provide that the use 33 | limitation in Section 3.3 applies to your derivative works, and (b) you identify the specific derivative works 34 | that are subject to Your Terms. Notwithstanding Your Terms, this License (including the redistribution 35 | requirements in Section 3.1) will continue to apply to the Work itself. 36 | 37 | 3.3 Use Limitation. The Work and any derivative works thereof only may be used or intended for use 38 | non-commercially. 
Notwithstanding the foregoing, NVIDIA and its affiliates may use the Work and any derivative 39 | works commercially. As used herein, “non-commercially” means for research or evaluation purposes only. 40 | 41 | 3.4 Patent Claims. If you bring or threaten to bring a patent claim against any Licensor (including any claim, 42 | cross-claim or counterclaim in a lawsuit) to enforce any patents that you allege are infringed by any Work, then 43 | your rights under this License from such Licensor (including the grant in Section 2.1) will terminate immediately. 44 | 45 | 3.5 Trademarks. This License does not grant any rights to use any Licensor’s or its affiliates’ names, logos, 46 | or trademarks, except as necessary to reproduce the notices described in this License. 47 | 48 | 3.6 Termination. If you violate any term of this License, then your rights under this License (including the 49 | grant in Section 2.1) will terminate immediately. 50 | 51 | 4. Disclaimer of Warranty. 52 | 53 | THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING 54 | WARRANTIES OR CONDITIONS OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT. YOU 55 | BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE. 56 | 57 | 5. Limitation of Liability. 58 | 59 | EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING 60 | NEGLIGENCE), CONTRACT, OR OTHERWISE SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, 61 | INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR RELATED TO THIS LICENSE, THE USE OR 62 | INABILITY TO USE THE WORK (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS OR 63 | DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN 64 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
-------------------------------------------------------------------------------- /MolBART/megatron_molbart/Megatron-LM-v1.1.5-3D_parallelism/tools/openwebtext/find_duplicates.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import itertools 18 | import json 19 | from lsh import cache, minhash 20 | import time 21 | import sys 22 | 23 | 24 | # This function is adapted from: 25 | # https://github.com/mattilyra/LSH/blob/master/examples/Introduction.ipynb 26 | def shingles(text, char_ngram=5): 27 | return set(text[head:head + char_ngram] 28 | for head in range(0, len(text) - char_ngram)) 29 | 30 | 31 | # This function is adapted from: 32 | # https://github.com/mattilyra/LSH/blob/master/examples/Introduction.ipynb 33 | def jaccard(set_a, set_b): 34 | intersection = set_a & set_b 35 | union = set_a | set_b 36 | return len(intersection) / len(union) 37 | 38 | 39 | if __name__ == '__main__': 40 | 41 | print('finding possible duplicate content ...') 42 | 43 | input = sys.argv[1] 44 | output = sys.argv[2] 45 | 46 | hasher = minhash.MinHasher(seeds=100, char_ngram=5, hashbytes=4) 47 | lshcache = cache.Cache(bands=10, hasher=hasher) 48 | 49 | counter = 0 50 | url_doc = {} 51 | start_time = time.time() 52 | with open(input, 'r') as f: 53 | for line in f: 54 | 
try: 55 | myjson = json.loads(line) 56 | url = myjson['url'] 57 | text = myjson['text'] 58 | counter += 1 59 | url_doc[url] = text 60 | lshcache.add_fingerprint(hasher.fingerprint(text), url) 61 | except Exception as e: 62 | print('Error:', e) 63 | if counter % 10000 == 0: 64 | print(' [read]> processed {} documents in {:.2f} seconds ...'. 65 | format(counter, time.time() - start_time), flush=True) 66 | 67 | counter = 0 68 | start_time = time.time() 69 | deduped = 0 70 | with open(output, 'wb') as f: 71 | for b in lshcache.bins: 72 | for bucket_id in b: 73 | if len(b[bucket_id]) > 1: 74 | items = list(b[bucket_id]) 75 | main_url = items[0] 76 | main_dhingles = shingles(url_doc[main_url]) 77 | remove_urls = [] 78 | for i in range(1, len(items)): 79 | counter += 1 80 | other_url= items[i] 81 | other_shingles = shingles(url_doc[other_url]) 82 | try: 83 | jaccard_sim = jaccard(main_dhingles, other_shingles) 84 | except Exception as e: 85 | print('Error:', e) 86 | if jaccard_sim > 0.5: 87 | remove_urls.append({other_url: jaccard_sim}) 88 | deduped += 1 89 | if counter % 10000 == 0: 90 | print(' [write]> processed {} documents in {:.2f} ' 91 | 'seoncds and deduped {} documents ...'. 92 | format(counter, time.time() - start_time, 93 | deduped), flush=True) 94 | if len(remove_urls) > 0: 95 | myjson = json.dumps({main_url: remove_urls}, 96 | ensure_ascii=False) 97 | f.write(myjson.encode('utf-8')) 98 | f.write('\n'.encode('utf-8')) 99 | 100 | print('done :-)') 101 | --------------------------------------------------------------------------------