├── .here ├── data ├── raw │ └── .gitkeep ├── external │ └── .gitkeep ├── interim │ └── .gitkeep └── processed │ └── .gitkeep ├── models └── .gitkeep ├── reports └── .gitkeep ├── tests ├── __init__.py ├── models │ ├── __init__.py │ └── test_rnn_generators.py ├── test_utils.py ├── data │ ├── test_smiles_voc.txt │ ├── test_selfies_voc.txt │ └── test.smi └── test_tokenizers.py ├── experiments ├── .gitkeep ├── seh_frag.py └── train_lstm_prior.py ├── references └── .gitkeep ├── rxitect ├── __init__.py ├── utils │ ├── __init__.py │ ├── transforms.py │ ├── multiprocessing_proxy.py │ └── metrics.py ├── trainers │ ├── __init__.py │ └── gfn_trainer.py ├── algorithms │ ├── __init__.py │ ├── gfn_algorithm.py │ └── trajectory_balance.py ├── tasks │ ├── __init__.py │ ├── gfn_task.py │ └── original_task.py ├── envs │ ├── __init__.py │ ├── contexts │ │ ├── __init__.py │ │ ├── fragment_env_context.py │ │ └── graph_env_context.py │ └── fragment_env.py ├── models │ ├── __init__.py │ ├── gflownet.py │ ├── transformers.py │ ├── lstm_generator.py │ └── bengio2021flow.py ├── data │ ├── __init__.py │ ├── datamodules.py │ ├── datasets.py │ └── iterators.py ├── utils.py └── tokenizers.py ├── docs ├── source │ ├── modules.rst │ ├── tests.models.rst │ ├── index.rst │ ├── rxitect.envs.contexts.rst │ ├── rxitect.envs.rst │ ├── rxitect.data.rst │ ├── rxitect.models.rst │ ├── tests.rst │ ├── rxitect.rst │ └── conf.py ├── Makefile └── make.bat ├── environment.yml ├── pyproject.toml ├── LICENSE ├── README.rst └── .gitignore /.here: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/raw/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/.gitkeep: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /reports/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/external/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/interim/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/processed/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/seh_frag.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /references/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rxitect/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rxitect/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/tests/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rxitect/trainers/__init__.py: -------------------------------------------------------------------------------- 1 | from rxitect.trainers.gfn_trainer import GFNTrainer 2 | -------------------------------------------------------------------------------- /rxitect/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | from rxitect.algorithms.trajectory_balance import TrajectoryBalance 2 | -------------------------------------------------------------------------------- /rxitect/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | from rxitect.tasks.gfn_task import FlatRewards, GFNTask, ScalarReward 2 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | final 2 | ===== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | rxitect 8 | tests 9 | -------------------------------------------------------------------------------- /rxitect/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from rxitect.envs.contexts.fragment_env_context import FragmentEnvContext 2 | from rxitect.envs.fragment_env import FragmentEnv 3 | -------------------------------------------------------------------------------- /rxitect/models/__init__.py: -------------------------------------------------------------------------------- 1 | """Sub-package containing implementations of various Molecular Generator models""" 2 | from rxitect.models.gflownet import FragmentBasedGFN 3 | from rxitect.models.lstm_generator import LSTMGenerator 4 | -------------------------------------------------------------------------------- /rxitect/data/__init__.py: -------------------------------------------------------------------------------- 1 | """Sub-package containing implementations of various datasets for each representation type""" 2 | from rxitect.data.datamodules import SmilesDataModule 3 | from rxitect.data.datasets import SelfiesDataset, SmilesDataset 4 | -------------------------------------------------------------------------------- /rxitect/envs/contexts/__init__.py: -------------------------------------------------------------------------------- 1 | from rxitect.envs.contexts.graph_env_context import (Action, ActionCategorical, 2 | ActionIndex, ActionType, 3 | Graph, GraphEnvContext, 4 | StateActionPair) 5 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from rxitect import utils 4 | 5 | 6 | def test_that_is_valid_smiles_works_as_expected(): 7 | valid_smiles = "O=C(Nc1cccc(F)c1)N1CCN(c2ccnc(Cl)n2)CC1" 8 | invalid_smiles = "O=C(Nc1cccc(F)c1)N1CCN(c2cDnc(Cl)n2)CC1" 9 | assert 
utils.is_valid_smiles(valid_smiles) 10 | assert not utils.is_valid_smiles(invalid_smiles) 11 | -------------------------------------------------------------------------------- /docs/source/tests.models.rst: -------------------------------------------------------------------------------- 1 | tests.models package 2 | ==================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | tests.models.test\_rnn\_generators module 8 | ----------------------------------------- 9 | 10 | .. automodule:: tests.models.test_rnn_generators 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: tests.models 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: rx 2 | 3 | channels: 4 | - pyg 5 | - pytorch 6 | - gpytorch 7 | - conda-forge 8 | - defaults 9 | 10 | dependencies: 11 | - python=3.9 12 | - pytorch=1.11 13 | - botorch=0.7.2 14 | - cvxopt=1.3.0 15 | - torchmetrics=0.9.2 16 | - pyg=2.0.4 17 | - cudatoolkit=11.3 18 | - rdkit=2022.03.04 19 | - poetry>=1.1.4,<2.0 20 | - pip>=20.0 21 | - h5py=3.7.0 22 | - pip: 23 | - https://github.com/MolecularAI/aizynthfinder/archive/v3.4.0.tar.gz 24 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. Rxitect documentation master file, created by 2 | sphinx-quickstart on Mon Oct 3 23:40:20 2022. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Rxitect's documentation! 7 | =================================== 8 | 9 | .. 
toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /docs/source/rxitect.envs.contexts.rst: -------------------------------------------------------------------------------- 1 | rxitect.envs.contexts package 2 | ============================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | rxitect.envs.contexts.graph\_env\_context module 8 | ------------------------------------------------ 9 | 10 | .. automodule:: rxitect.envs.contexts.graph_env_context 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: rxitect.envs.contexts 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/rxitect.envs.rst: -------------------------------------------------------------------------------- 1 | rxitect.envs package 2 | ==================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | rxitect.envs.contexts 11 | 12 | Submodules 13 | ---------- 14 | 15 | rxitect.envs.fragment\_env module 16 | --------------------------------- 17 | 18 | .. automodule:: rxitect.envs.fragment_env 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: rxitect.envs 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/source/rxitect.data.rst: -------------------------------------------------------------------------------- 1 | rxitect.data package 2 | ==================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | rxitect.data.datamodules module 8 | ------------------------------- 9 | 10 | .. 
automodule:: rxitect.data.datamodules 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | rxitect.data.datasets module 16 | ---------------------------- 17 | 18 | .. automodule:: rxitect.data.datasets 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: rxitect.data 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/source/rxitect.models.rst: -------------------------------------------------------------------------------- 1 | rxitect.models package 2 | ====================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | rxitect.models.gflownet module 8 | ------------------------------ 9 | 10 | .. automodule:: rxitect.models.gflownet 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | rxitect.models.lstm\_generator module 16 | ------------------------------------- 17 | 18 | .. automodule:: rxitect.models.lstm_generator 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: rxitect.models 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/source/tests.rst: -------------------------------------------------------------------------------- 1 | tests package 2 | ============= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | tests.models 11 | 12 | Submodules 13 | ---------- 14 | 15 | tests.test\_tokenizers module 16 | ----------------------------- 17 | 18 | .. automodule:: tests.test_tokenizers 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | tests.test\_utils module 24 | ------------------------ 25 | 26 | .. 
automodule:: tests.test_utils 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | Module contents 32 | --------------- 33 | 34 | .. automodule:: tests 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/source/rxitect.rst: -------------------------------------------------------------------------------- 1 | rxitect package 2 | =============== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | rxitect.data 11 | rxitect.envs 12 | rxitect.models 13 | 14 | Submodules 15 | ---------- 16 | 17 | rxitect.tokenizers module 18 | ------------------------- 19 | 20 | .. automodule:: rxitect.tokenizers 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | rxitect.utils module 26 | -------------------- 27 | 28 | .. automodule:: rxitect.utils 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | 33 | Module contents 34 | --------------- 35 | 36 | .. 
automodule:: rxitect 37 | :members: 38 | :undoc-members: 39 | :show-inheritance: 40 | -------------------------------------------------------------------------------- /tests/data/test_smiles_voc.txt: -------------------------------------------------------------------------------- 1 | ( 2 | ) 3 | [OH+] 4 | 4 5 | [I+] 6 | O 7 | [IH] 8 | [BH2-] 9 | [As+] 10 | [CH] 11 | L 12 | [te+] 13 | F 14 | 6 15 | [O] 16 | [te] 17 | [S+] 18 | [se+] 19 | [S-] 20 | [C+] 21 | I 22 | 1 23 | [TeH] 24 | 7 25 | [SeH] 26 | - 27 | b 28 | [c-] 29 | 0 30 | [BH3-] 31 | [Se] 32 | s 33 | [B-] 34 | [nH+] 35 | [c+] 36 | [SiH2] 37 | = 38 | B 39 | N 40 | [PH] 41 | [BH-] 42 | 9 43 | [CH2-] 44 | c 45 | p 46 | [o+] 47 | [SH+] 48 | 2 49 | [CH2] 50 | # 51 | [NH+] 52 | [s+] 53 | o 54 | [O-] 55 | 8 56 | n 57 | [N+] 58 | [Te] 59 | [SH2] 60 | [n-] 61 | [P+] 62 | [As] 63 | 3 64 | [NH2+] 65 | [N-] 66 | [Si] 67 | [SiH] 68 | [C-] 69 | C 70 | 5 71 | [cH-] 72 | P 73 | % 74 | [O+] 75 | [SH] 76 | [NH-] 77 | S 78 | [Se+] 79 | [b-] 80 | R 81 | [se] 82 | [nH] 83 | [n+] -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "rxitect" 3 | version = "0.1.0" 4 | description = "Code for Julius Cathalina's M.Sc. 
Thesis" 5 | authors = ["Julius Cathalina "] 6 | 7 | [tool.poetry.dependencies] 8 | python = "^3.9" 9 | seaborn = "^0.11.2" 10 | hydra-core = "^1.2.0" 11 | hydra-colorlog = "^1.2.0" 12 | dvc = "^2.17.0" 13 | wandb = "^0.13.1" 14 | graphviz = "^0.20.1" 15 | pyprojroot = "^0.2.0" 16 | selfies = "^2.1.1" 17 | jsonargparse = {extras = ["signatures"], version = "^4.14.0"} 18 | 19 | [tool.poetry.dev-dependencies] 20 | pytest = "^7.1.2" 21 | jupyterlab = "^3.4.4" 22 | ipywidgets = "^7.7.1" 23 | black = "^22.6.0" 24 | isort = "^5.10.1" 25 | Sphinx = "^5.1.1" 26 | 27 | [build-system] 28 | requires = ["poetry-core>=1.0.0"] 29 | build-backend = "poetry.core.masonry.api" 30 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
def thermometer(
    v: Tensor, n_bins: int = 50, vmin: float = 0.0, vmax: float = 1.0
) -> Tensor:
    """Thermometer encoding of a scalar quantity.

    Each value is compared against ``n_bins`` evenly spaced thresholds between
    ``vmin`` and ``vmax``; every bin holds how far the value has risen past that
    bin's threshold, expressed as a fraction of the bin spacing and clipped
    to [0, 1].

    Parameters
    ----------
    v: Tensor
        Value(s) to encode. Can be any shape
    n_bins: int
        The number of dimensions to encode the values into (at least 2, since the
        bin spacing is measured between the first two thresholds)
    vmin: float
        The smallest value, below which the encoding is equal to torch.zeros(n_bins)
    vmax: float
        The largest value, beyond which the encoding is equal to torch.ones(n_bins)

    Returns
    -------
    encoding: Tensor
        The encoded values, shape: `v.shape + (n_bins,)`, on the same device as `v`

    Raises
    ------
    ValueError
        If `n_bins` is smaller than 2 or `vmax` is not strictly greater than `vmin`.
    """
    if n_bins < 2:
        raise ValueError(f"n_bins must be at least 2, got {n_bins}")
    if not vmax > vmin:
        raise ValueError(
            f"vmax must be strictly greater than vmin, got vmin={vmin}, vmax={vmax}"
        )
    # Build the thresholds on the input's device so accelerator tensors can be
    # encoded without a device-mismatch error.
    bins = torch.linspace(vmin, vmax, n_bins, device=v.device)
    gap = bins[1] - bins[0]
    # Broadcast the thresholds against a new trailing axis of `v`.
    offsets = v[..., None] - bins.reshape((1,) * v.ndim + (-1,))
    return offsets.clamp(0, gap.item()) / gap
if __name__ == "__main__":
    # Entry point: fit the LSTM prior on the cleaned ChEMBL v30 SMILES file
    # with a profiler attached.
    # (A LightningCLI-based entry point, `LightningCLI(LSTMGenerator)`, was
    # sketched here earlier but is not active.)
    lr = 1e-3  # declared for the run, but not handed to anything below
    epochs = 5

    project_root = here()

    model = LSTMGenerator(
        vocabulary_filepath=project_root / "data/processed/chembl_v30_smi_voc.txt",
    )
    # The data module reuses the tokenizer owned by the model.
    datamodule = SmilesDataModule(
        dataset_filepath=project_root / "data/processed/chembl_v30_clean.smi",
        tokenizer=model.tokenizer,
        num_workers=4,
    )

    perf_profiler = AdvancedProfiler(
        dirpath=project_root / "logs", filename="perf_logs_lstm"
    )

    lightning_trainer = pl.Trainer(
        accelerator="gpu",
        devices=1,
        max_epochs=epochs,
        profiler=perf_profiler,
        check_val_every_n_epoch=1,
    )
    lightning_trainer.fit(model, datamodule=datamodule)
@pytest.fixture()
def smiles_tokenizer() -> SmilesTokenizer:
    """Tokenizer obtained from `get_tokenizer` for the bundled SMILES test vocabulary."""
    vocabulary_path = here() / "tests/data/test_smiles_voc.txt"
    return get_tokenizer("smiles", vocabulary_path, 100)


@pytest.fixture()
def smiles_dataloader(smiles_tokenizer: SmilesTokenizer) -> DataLoader:
    """Shuffling DataLoader over the bundled `test.smi` file, using the fixture tokenizer."""
    smiles_dataset = SmilesDataset(
        dataset_filepath=here() / "tests/data/test.smi",
        tokenizer=smiles_tokenizer,
    )
    loader_options = dict(
        batch_size=128,
        num_workers=1,
        shuffle=True,
        pin_memory=True,
        collate_fn=SmilesDataset.collate_fn,
    )
    return DataLoader(dataset=smiles_dataset, **loader_options)


def test_dataloader_loads_dataset_in_properly(smiles_dataloader: DataLoader):
    """The dataset behind the loader must expose all 500 entries of the test file."""
    n_structures = len(smiles_dataloader.dataset)
    assert n_structures == 500
@pytest.fixture
def smiles_tokenizer() -> SmilesTokenizer:
    """Tokenizer obtained from `get_tokenizer` for the bundled SMILES test vocabulary."""
    vocabulary_path = here() / "tests/data/test_smiles_voc.txt"
    return get_tokenizer("smiles", vocabulary_path, 100)


@pytest.fixture
def selfies_tokenizer() -> SelfiesTokenizer:
    """Tokenizer obtained from `get_tokenizer` for the bundled SELFIES test vocabulary."""
    vocabulary_path = here() / "tests/data/test_selfies_voc.txt"
    return get_tokenizer("selfies", vocabulary_path, 100)


def test_decoding_encoded_smiles_reconstructs_smiles_correctly(smiles_tokenizer):
    """encode -> decode must give back the original SMILES string."""
    original = "CCBr[nH]"
    round_tripped = smiles_tokenizer.decode(smiles_tokenizer.encode(original))

    assert round_tripped == original


def test_decoding_encoded_selfies_reconstructs_selfies_correctly(selfies_tokenizer):
    """encode -> decode must give back the original SELFIES string."""
    original = "[C][C][Br][NH1]"
    round_tripped = selfies_tokenizer.decode(selfies_tokenizer.encode(original))

    assert round_tripped == original
Introduction
============

This library was made for my M.Sc. thesis research with the aim of understanding
how computational chemists can incorporate synthesis planning into de-novo drug design
systems. Many molecule generators propose interesting but impractical molecules, which is why we need
to design them with synthesizability in mind. Modern Computer-Assisted Synthesis Planning (CASP) tools are quite powerful
but are of limited use in algorithms that need to call them many times (e.g., more than 100,000 calls)
because of the time it takes, on average, to solve a single molecule. This research aims to
create a proxy that is cheap to call yet robust. By combining such a proxy with techniques
that are known to be effective at searching the vast molecular search space, such as Reinforcement Learning (RL)
and Generative Flow Networks (GFlowNets), we can experimentally test whether these proxies help to propose more
practical and synthesizable molecules.

Quickstart
----------
Run the following commands to get up and running:

.. code-block:: bash

    conda env create -f environment.yml
    # alternatively you can use mamba, which I recommend
    conda activate rx
    poetry install

Examples
--------
Coming Soon!
35 | -------------------------------------------------------------------------------- /rxitect/tasks/gfn_task.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Any, Dict, List, NewType, Tuple, Union 5 | 6 | import torch 7 | from rdkit.Chem.rdchem import Mol 8 | from torch import nn 9 | 10 | # This type represents an unprocessed list of reward signals/conditioning information 11 | FlatRewards = NewType("FlatRewards", torch.Tensor) # type: ignore 12 | 13 | # This type represents the outcome for a multi-objective task of 14 | # converting FlatRewards to a scalar, e.g. (sum R_i omega_i) ** beta 15 | ScalarReward = NewType("ScalarReward", torch.Tensor) # type: ignore 16 | 17 | 18 | class GFNTask(ABC): 19 | @abstractmethod 20 | def flat_reward_transform(self, y: Union[float, torch.Tensor]) -> FlatRewards: 21 | pass 22 | 23 | @abstractmethod 24 | def inverse_flat_reward_transform( 25 | self, rp: FlatRewards 26 | ) -> Union[float, torch.Tensor]: 27 | pass 28 | 29 | @abstractmethod 30 | def _load_task_models(self) -> Dict[str, nn.Module]: 31 | pass 32 | 33 | @abstractmethod 34 | def sample_conditional_information(self, n: int) -> Dict[str, Any]: 35 | """ 36 | Parameters 37 | ---------- 38 | n: size of random sample 39 | 40 | Returns 41 | ------- 42 | Dictionary containing conditional information 43 | """ 44 | pass 45 | 46 | @abstractmethod 47 | def cond_info_to_reward( 48 | self, cond_info: Dict[str, torch.Tensor], flat_rewards: FlatRewards 49 | ) -> ScalarReward: 50 | """Combines a minibatch of reward signal vectors and conditional information into a scalar reward. 51 | Parameters 52 | ---------- 53 | cond_info: Dict[str, Tensor] 54 | A dictionary with various conditional information (e.g. temperature) 55 | flat_rewards: FlatRewards 56 | A 2d tensor where each row represents a series of flat rewards. 
class SmilesDataModule(LightningDataModule):
    """Lightning data module serving a SMILES file as train/val/test loaders.

    The file is wrapped in a `SmilesDataset` and partitioned with
    `torch.utils.data.random_split`, seeded by `random_state`.

    Parameters
    ----------
    dataset_filepath: str
        Location of the SMILES file handed to `SmilesDataset`.
    tokenizer
        Tokenizer handed to `SmilesDataset`.
        NOTE(review): the default is the `SmilesTokenizer` class itself, not an
        instance -- confirm that callers always pass a constructed tokenizer.
    train_val_test_split: Tuple[int, int, int]
        The `lengths` argument given to `random_split`, in train/val/test order.
    batch_size: int
        Batch size used by every loader.
    num_workers: int
        Worker count used by every loader.
    num_partitions: Optional[int]
        Stored on the instance; not read anywhere else in this class.
    pin_memory: bool
        `pin_memory` flag used by every loader.
    random_state: int
        Seed for the generator that drives the split.
    """

    def __init__(
        self,
        dataset_filepath: str,
        tokenizer=SmilesTokenizer,
        train_val_test_split: Tuple[int, int, int] = (1_500_000, 185_000, 186_227),
        batch_size: int = 128,
        num_workers: int = 0,
        num_partitions: Optional[int] = None,
        pin_memory: bool = False,
        random_state: int = 42,
    ) -> None:
        super().__init__()
        self.save_hyperparameters()

        # Where the data comes from and how it is tokenized.
        self.dataset_filepath = dataset_filepath
        self.tokenizer = tokenizer

        # How the data is split.
        self.train_val_test_split = train_val_test_split
        self.random_state = random_state
        self.num_partitions = num_partitions

        # Settings shared by all loaders.
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.pin_memory = pin_memory

    def prepare_data(self) -> None:
        """Currently a no-op."""
        # TODO: Download the tokenized ChEMBL file here
        # saves us the params if we init the vocab internally as well.
        return None

    def setup(self, stage: Optional[str] = None) -> None:
        """Load the dataset and create the seeded train/val/test subsets.

        `stage` is accepted but not consulted.
        """
        # TODO: Make ChEMBL v30 a downloadable dataset like MNIST from torch and simplify
        full_dataset = SmilesDataset(self.dataset_filepath, self.tokenizer)
        split_rng = torch.Generator().manual_seed(self.random_state)
        train_part, val_part, test_part = random_split(
            dataset=full_dataset,
            lengths=self.train_val_test_split,
            generator=split_rng,
        )
        self.train_data = train_part
        self.val_data = val_part
        self.test_data = test_part

    def _make_loader(self, split, shuffle: bool) -> DataLoader:
        """Build a DataLoader over `split` with this module's shared settings."""
        return DataLoader(
            dataset=split,
            batch_size=self.batch_size,
            pin_memory=self.pin_memory,
            num_workers=self.num_workers,
            collate_fn=SmilesDataset.collate_fn,
            shuffle=shuffle,
        )

    def train_dataloader(self) -> TRAIN_DATALOADERS:
        """Shuffled loader over the training subset."""
        return self._make_loader(self.train_data, shuffle=True)

    def val_dataloader(self) -> EVAL_DATALOADERS:
        """Unshuffled loader over the validation subset."""
        return self._make_loader(self.val_data, shuffle=False)

    def test_dataloader(self) -> EVAL_DATALOADERS:
        """Unshuffled loader over the test subset."""
        return self._make_loader(self.test_data, shuffle=False)
class FragmentBasedGFN(nn.Module):
    """GraphTransformer-backed GFlowNet policy for fragment-wise generation.

    The forward pass produces an ``ActionCategorical`` holding logits for
    - STOP
    - ADD_NODE
    - SET_EDGE_ATTR
    together with a per-graph output of the ``emb2reward`` head.
    """

    def __init__(
        self,
        ctx: FragmentEnvContext,
        num_emb: int = 64,
        num_layers: int = 3,
        num_heads: int = 2,
    ):
        """
        Parameters
        ----------
        ctx: FragmentEnvContext
            Supplies the input/output sizes (``num_node_dim``, ``num_edge_dim``,
            ``num_cond_dim``, ``num_new_node_values``, ``num_edge_attr_logits``)
            and ``action_type_order``.
        num_emb: int
            Embedding width passed to the transformer and used by every head.
        num_layers: int
            Number of transformer layers.
        num_heads: int
            Number of attention heads per transformer layer.
        """
        super().__init__()
        # NOTE: sub-modules are created in a fixed order; reordering would
        # change both parameter registration order and random initialisation.
        self.transformer = GraphTransformer(
            x_dim=ctx.num_node_dim,
            e_dim=ctx.num_edge_dim,
            g_dim=ctx.num_cond_dim,
            num_emb=num_emb,
            num_layers=num_layers,
            num_heads=num_heads,
        )
        node_width = num_emb * 2
        graph_width = num_emb * 3
        hidden_layers = 0  # every head below is a single linear layer

        self.emb2add_node = create_mlp(
            node_width, num_emb, ctx.num_new_node_values, hidden_layers
        )
        # Edge attr logits are "sided": one head is applied once per side,
        # hence half of the logits per call.
        self.emb2set_edge_attr = create_mlp(
            num_emb + node_width,
            num_emb,
            ctx.num_edge_attr_logits // 2,
            hidden_layers,
        )
        self.emb2stop = create_mlp(graph_width, num_emb, 1, hidden_layers)
        self.emb2reward = create_mlp(graph_width, num_emb, 1, hidden_layers)
        self.edge2emb = create_mlp(node_width, num_emb, num_emb, hidden_layers)
        self.log_z = create_mlp(ctx.num_cond_dim, num_emb * 2, 1, 2)
        self.action_type_order = ctx.action_type_order

    @staticmethod
    def _masked(logits, mask):
        """Keep ``logits`` where ``mask`` is 1, use -1000 (a tiny log-value) where it is 0."""
        return logits * mask + -1000 * (1 - mask)

    def forward(self, g: Batch, cond: torch.Tensor):
        """See `GraphTransformer` for argument values"""
        node_emb, graph_emb = self.transformer(g, cond)

        # Edges are duplicated to make graphs undirected; only the even
        # columns of `edge_index` are kept.
        src, dst = g.edge_index[:, ::2]
        edge_emb = self.edge2emb(node_emb[src] + node_emb[dst])
        src_side = torch.cat([edge_emb, node_emb[src]], 1)
        dst_side = torch.cat([edge_emb, node_emb[dst]], 1)
        src_anchor_logits = self.emb2set_edge_attr(src_side)
        dst_anchor_logits = self.emb2set_edge_attr(dst_side)

        stop_logits = self.emb2stop(graph_emb)
        add_node_logits = self._masked(self.emb2add_node(node_emb), g.add_node_mask)
        set_edge_attr_logits = self._masked(
            torch.cat([src_anchor_logits, dst_anchor_logits], 1),
            g.set_edge_attr_mask,
        )

        categorical = ActionCategorical(
            g,
            logits=[stop_logits, add_node_logits, set_edge_attr_logits],
            keys=[None, "x", "edge_index"],
            types=self.action_type_order,
        )
        return categorical, self.emb2reward(graph_emb)
class SelfiesDataset(Dataset):
    """Dataset of SELFIES strings read from a whitespace-delimited text file.

    Only the first whitespace-separated field of every line is kept; items are
    encoded with the tokenizer when they are accessed.
    """

    def __init__(self, dataset_filepath: str, tokenizer: SelfiesTokenizer) -> None:
        """
        Parameters
        ----------
        dataset_filepath: str
            Text file with one structure per line (first field is used).
        tokenizer: SelfiesTokenizer
            Object whose ``encode`` turns a SELFIES string into a tensor.
        """
        self.tokenizer = tokenizer
        with open(dataset_filepath, "r") as f:
            # Blank lines (e.g. a trailing newline at end of file) have no
            # first field and are skipped instead of raising IndexError.
            self.selfies = [fields[0] for fields in map(str.split, f) if fields]

    def __getitem__(self, index: int) -> torch.Tensor:
        return self.tokenizer.encode(self.selfies[index])

    def __len__(self):
        return len(self.selfies)

    def __str__(self) -> str:
        return f"SELFIES Dataset containing {len(self)} structures"

    @classmethod
    def collate_fn(cls, arr: torch.Tensor) -> torch.Tensor:
        """Right-pad a list of encoded sequences with zeros into one batch.

        Parameters
        ----------
        arr:
            Sequence of 1d tensors of token indices.

        Returns
        -------
        torch.Tensor
            Tensor of shape ``(len(arr), longest sequence)`` with dtype
            ``torch.long`` (same as ``SmilesDataset.collate_fn``), so the
            batch can be used directly as index input.
        """
        max_len = max(seq.size(0) for seq in arr)
        # dtype must be integral: a float batch cannot index an embedding.
        collated_arr = torch.zeros(len(arr), max_len, dtype=torch.long)
        for i, seq in enumerate(arr):
            collated_arr[i, : seq.size(0)] = seq
        return collated_arr


class QM9Dataset(Dataset):
    """QM9 molecules with one regression target, split into train/held-out parts.

    The table is loaded either from an HDF5 store (key ``'df'``) or from a tar
    archive of QM9 ``.xyz`` files; rows are shuffled with a seeded generator
    and the first ``ratio`` fraction forms the training part.
    """

    def __init__(self, h5_file=None, xyz_file=None, train=True, target='gap', split_seed=142857, ratio=0.9):
        """
        Parameters
        ----------
        h5_file:
            Path of an HDF5 store holding the table under key ``'df'``.
            Takes precedence over ``xyz_file``.
        xyz_file:
            Path of a tar archive of ``.xyz`` files (see ``load_tar``).
        train: bool
            Select the first ``ratio`` fraction of the shuffled rows if True,
            the remainder otherwise.
        target: str
            Column returned as the label by ``__getitem__``.
        split_seed: int
            Seed of the shuffling generator.
        ratio: float
            Fraction of rows assigned to the training part.

        Raises
        ------
        ValueError
            If neither ``h5_file`` nor ``xyz_file`` is given.
        """
        if h5_file is not None:
            # The frame is read into memory, so the store can be closed again.
            with pd.HDFStore(h5_file, 'r') as store:
                self.df = store['df']
        elif xyz_file is not None:
            self.load_tar(xyz_file)
        else:
            raise ValueError("One of `h5_file` or `xyz_file` must be given.")
        rng = np.random.default_rng(split_seed)
        idcs = np.arange(len(self.df))
        rng.shuffle(idcs)
        self.target = target
        num_train = int(np.floor(ratio * len(self.df)))
        self.idcs = idcs[:num_train] if train else idcs[num_train:]

    def get_stats(self, percentile=0.95):
        """Return ``(min, max, value at `percentile`)`` of the target column.

        The third entry is the element at position ``int(n * percentile)`` of
        the sorted column, clamped to the last element so that
        ``percentile=1.0`` is valid.
        """
        y = self.df[self.target]
        position = min(int(y.shape[0] * percentile), y.shape[0] - 1)
        return y.min(), y.max(), np.sort(y)[position]

    def load_tar(self, xyz_file):
        """Build ``self.df`` from a tar archive of ``.xyz`` files.

        For every regular file in the archive the first field of the
        second-to-last line is taken as SMILES and the fields after the first
        two on the second line as the 15 float properties.
        """
        labels = ['rA', 'rB', 'rC', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']
        all_mols = []
        # tarfile.open detects compression transparently and the context
        # manager releases the file handle.
        with tarfile.open(xyz_file, 'r') as archive:
            for member in archive:
                handle = archive.extractfile(member)
                if handle is None:
                    # Directories and other non-file members carry no data.
                    continue
                data = handle.read().decode().splitlines()
                all_mols.append(data[-2].split()[:1] + list(map(float, data[1].split()[2:])))
        self.df = pd.DataFrame(all_mols, columns=['SMILES'] + labels)

    def __len__(self):
        return len(self.idcs)

    def __getitem__(self, idx):
        # `idcs` holds row positions, so index positionally rather than by label.
        row = self.idcs[idx]
        return Chem.MolFromSmiles(self.df['SMILES'].iloc[row]), self.df[self.target].iloc[row]
@abstractmethod 28 | def create_training_data_from_own_samples( 29 | self, model: SamplingModel, n: int, cond_info: torch.Tensor 30 | ) -> List[Dict]: 31 | """Generate trajectories by sampling a model 32 | Parameters 33 | ---------- 34 | model: SamplingModel 35 | The model being sampled 36 | n: int 37 | Number of samples 38 | cond_info: torch.Tensor 39 | Conditional information, shape (N, n_info) 40 | Returns 41 | ------- 42 | data: List[Dict] 43 | A list of trajectories. Each trajectory is a dict with keys 44 | - trajs: List[Tuple[Graph, GraphAction]] 45 | - reward_pred: float, -100 if an illegal action is taken, predicted R(x) if bootstrapping, None otherwise 46 | - fwd_logprob: log Z + sum logprobs P_F 47 | - bck_logprob: sum logprobs P_B 48 | - logZ: predicted log Z 49 | - loss: predicted loss (if bootstrapping) 50 | - is_valid: is the generated graph valid according to the env & ctx 51 | """ 52 | pass 53 | 54 | @staticmethod 55 | def create_training_data_from_graphs(graphs: List[Graph]) -> List[Trajectory]: 56 | """Generate trajectories from known endpoints 57 | Parameters 58 | ---------- 59 | graphs: List[Graph] 60 | List of Graph endpoints 61 | Returns 62 | ------- 63 | trajs: List[Dict{'traj': List[tuple[Graph, GraphAction]]}] 64 | A list of trajectories. 65 | """ 66 | return [{"traj": generate_forward_trajectory(i)} for i in graphs] 67 | 68 | def construct_batch(self, trajs, cond_info, rewards) -> Batch: 69 | """Construct a batch from a list of trajectories and their information 70 | Parameters 71 | ---------- 72 | trajs: List[List[tuple[Graph, GraphAction]]] 73 | A list of N trajectories. 74 | cond_info: Tensor 75 | The conditional info that is considered for each trajectory. Shape (N, n_info) 76 | rewards: Tensor 77 | The transformed reward (e.g. R(x) ** beta) for each trajectory. 
Shape (N,) 78 | Returns 79 | ------- 80 | batch: Batch 81 | A (CPU) Batch object with relevant attributes added 82 | """ 83 | pass 84 | 85 | @abstractmethod 86 | def compute_batch_losses( 87 | self, model: nn.Module, batch: Batch, num_bootstrap: Optional[int] = 0 88 | ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: 89 | """Computes the loss for a batch of data, and proves logging information 90 | Parameters 91 | ---------- 92 | model: nn.Module 93 | The model being trained or evaluated 94 | batch: gd.Batch 95 | A batch of graphs 96 | num_bootstrap: Optional[int] 97 | The number of trajectories with reward targets in the batch (if applicable). 98 | Returns 99 | ------- 100 | loss: Tensor 101 | The loss for that batch 102 | info: Dict[str, Tensor] 103 | Logged information about model predictions. 104 | """ 105 | pass 106 | 107 | 108 | Trajectory = Dict[str, List[StateActionPair]] 109 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # Visual Studio Code 156 | .vscode/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
class MPModelPlaceholder:
    """This class can be used as a Model in a worker process, and
    translates calls to queries to the main process"""

    def __init__(self, in_queues, out_queues):
        self.qs = in_queues, out_queues
        self.device = torch.device("cpu")
        self._is_init = False

    def _check_init(self):
        """Bind this placeholder to the queue pair of the current worker (once)."""
        if self._is_init:
            return
        info = get_worker_info()
        if info is None:
            # get_worker_info() returns None outside a DataLoader worker.
            raise RuntimeError(
                "MPModelPlaceholder can only be called from inside a DataLoader worker process"
            )
        self.in_queue = self.qs[0][info.id]
        self.out_queue = self.qs[1][info.id]
        self._is_init = True

    def _request(self, attr, *a):
        """Send ``(attr, *a)`` to the main process and return its reply.

        Raises
        ------
        Exception
            If the reply is an exception instance (the proxied call failed in
            the main process), it is raised here instead of being returned.
        """
        self._check_init()
        self.in_queue.put((attr, *a))
        reply = self.out_queue.get()
        if isinstance(reply, Exception):
            raise reply
        return reply

    # TODO: make a generic method for this based on __getattr__
    def log_z(self, *a):
        return self._request("log_z", *a)

    def __call__(self, *a):
        return self._request("__call__", *a)


class MPModelProxy:
    """This class maintains a reference to an in-cuda-memory model, and
    creates a `placeholder` attribute which can be safely passed to
    multiprocessing DataLoader workers.
    This placeholder model sends messages across multiprocessing
    queues, which are received by this proxy instance, which calls the
    model and sends the return value back to the worker.
    Starts its own (daemon) thread. Always passes CPU tensors between
    processes.
    """

    def __init__(self, model: nn.Module, num_workers: int, cast_types: Tuple):
        """Construct a multiprocessing model proxy for torch DataLoaders.
        Parameters
        ----------
        model: torch.nn.Module
            A torch model which lives in the main process to which method calls are passed
        num_workers: int
            Number of DataLoader workers
        cast_types: tuple
            Types that will be cast to cuda when received as arguments of method calls.
            torch.Tensor is cast by default.
        """
        self.in_queues = [mp.Queue() for _ in range(num_workers)]
        self.out_queues = [mp.Queue() for _ in range(num_workers)]
        self.placeholder = MPModelPlaceholder(self.in_queues, self.out_queues)
        self.model = model
        self.device = next(model.parameters()).device
        self.cuda_types = (torch.Tensor,) + cast_types
        self.stop = threading.Event()
        self.thread = threading.Thread(target=self.run, daemon=True)
        self.thread.start()

    def __del__(self):
        self.stop.set()

    def _to_cpu(self, value):
        """Detach and move ``value`` to the CPU if it is one of ``cuda_types``."""
        if isinstance(value, self.cuda_types):
            return value.detach().to(torch.device("cpu"))
        return value

    def _serve(self, attr, args):
        """Call ``self.model.<attr>(*args)`` and return the CPU-side reply.

        Arguments of a type in ``cuda_types`` are moved to the model's device
        first. A list/tuple result is returned as a list with each element
        converted individually; any other result is converted as a whole.
        If the lookup or the call raises, a ``RuntimeError`` describing the
        failure is returned as the reply, so the serving thread stays alive
        and the waiting worker is not left blocked on its response queue.
        """
        try:
            f = getattr(self.model, attr)
            args = [
                i.to(self.device) if isinstance(i, self.cuda_types) else i
                for i in args
            ]
            result = f(*args)
        except Exception as exc:
            # A plain RuntimeError with a string payload can always be pickled.
            return RuntimeError(
                f"{attr!r} failed in the main process: {type(exc).__name__}: {exc}"
            )
        if isinstance(result, (list, tuple)):
            return [self._to_cpu(i) for i in result]
        return self._to_cpu(result)

    def run(self) -> None:
        """Serve requests from all worker queues until ``stop`` is set."""
        while not self.stop.is_set():
            for qi, q in enumerate(self.in_queues):
                try:
                    r = q.get(True, 1e-5)
                except queue.Empty:
                    continue
                except ConnectionError:
                    break
                attr, *args = r
                self.out_queues[qi].put(self._serve(attr, args))


def wrap_model_mp(model: nn.Module, num_workers: int, cast_types: Tuple) -> MPModelPlaceholder:
    """Construct a multiprocessing model proxy for torch DataLoaders so
    that only one process ends up making cuda calls and holding cuda
    tensors in memory.
    Parameters
    ----------
    model: nn.Module
        A torch model which lives in the main process to which method calls are passed
    num_workers: int
        Number of DataLoader workers
    cast_types: tuple
        Types that will be cast to cuda when received as arguments of method calls.
        torch.Tensor is cast by default.
    Returns
    -------
    placeholder: MPModelPlaceholder
        A placeholder model whose method calls route arguments to the main process
    """
    return MPModelProxy(model, num_workers, cast_types).placeholder
def mol_from_fragments(
    jbonds: ArrayLike,
    frags: Optional[List[Chem.rdchem.Mol]] = None,
    frag_smiles: Optional[List[str]] = None,
    optimize: bool = False,
) -> Tuple[Chem.rdchem.Mol, List[int]]:
    """Joins 2 or more fragments into a single molecule

    Args:
        jbonds (ArrayLike): An array-like (e.g., a list) object containing junction bonds.
            Each row is read as (fragment a, fragment b, atom offset in a, atom offset in b).
        frags (Optional[List[Mol]]): A list of RDKit Mol objects to be combined. Should be given if frag_smiles is not present
        frag_smiles (Optional[List[str]]): A list of SMILES strings to be made into RDKit Mol objects and combined. Must be present if frags is not
        optimize (bool): If the molecule's 3D structure should be optimized. Defaults to False

    Returns:
        Tuple[Mol, List[int]]: A tuple containing the combined molecule as an RDKit Mol object, and the
            junction bonds as pairs of atom indices in the combined molecule (an empty list when
            `jbonds` is empty). `(None, None)` is returned when there are no fragments.

    Raises:
        ValueError: If neither `frags` nor `frag_smiles` is given.
    """
    jbonds = np.asarray(jbonds)

    if frags is None:
        if frag_smiles is None:
            raise ValueError("At least one of `frags` or `frag_smiles` should be given.")
        frags = [Chem.MolFromSmiles(smi) for smi in frag_smiles]

    if len(frags) == 0:
        return None, None

    # combine fragments into a single molecule
    mol = frags[0]
    for frag in frags[1:]:
        mol = Chem.CombineMols(mol, frag)

    # index of the first atom of every fragment inside the combined molecule
    frag_start_idx = np.concatenate(
        [[0], np.cumsum([frag.GetNumAtoms() for frag in frags])], 0
    )[:-1]

    # translate (fragment, offset) pairs into atom indices of the combined molecule
    if jbonds.size == 0:
        mol_bonds = []
    else:
        mol_bonds = frag_start_idx[jbonds[:, 0:2]] + jbonds[:, 2:4]

    # add junction bonds between fragments
    rw_mol = Chem.EditableMol(mol)
    for bond in mol_bonds:
        rw_mol.AddBond(int(bond[0]), int(bond[1]), Chem.BondType.SINGLE)
    mol = rw_mol.GetMol()
    atoms = list(mol.GetAtoms())

    def _pop_H(atom):
        # every new bond uses up one explicit hydrogen, if the atom has any
        num_h = atom.GetNumExplicitHs()
        if num_h > 0:
            atom.SetNumExplicitHs(num_h - 1)

    for bond in mol_bonds:
        _pop_H(atoms[bond[0]])
        _pop_H(atoms[bond[1]])
    Chem.SanitizeMol(mol)

    # create and optimize 3D structure
    if optimize:
        assert "h" not in {
            atom.GetSymbol().lower() for atom in mol.GetAtoms()
        }, "can't optimize molecule with h"
        # AddHs / RemoveHs return new molecules instead of editing in place,
        # so their results have to be kept for the hydrogens to be used.
        mol_with_hs = Chem.AddHs(mol)
        AllChem.EmbedMolecule(mol_with_hs)
        AllChem.MMFFOptimizeMolecule(mol_with_hs)
        mol = Chem.RemoveHs(mol_with_hs)
    return mol, mol_bonds
136 | """ 137 | stem_types = [ 138 | mdp.stem_type_offset[t[cmol.block_idxs[i[0]]]] + i[1] for i in cmol.stems 139 | ] 140 | 141 | data = Data( 142 | x=long_tensor([t[i] for i in cmol.block_idxs]), 143 | edge_index=long_tensor(edges).T if len(edges) else long_tensor([[], []]), 144 | edge_attr=long_tensor(edge_attrs) 145 | if len(edges) 146 | else long_tensor([]).reshape((0, 2)), 147 | stems=long_tensor(cmol.stems) if len(cmol.stems) else long_tensor([(0, 0)]), 148 | stem_types=long_tensor(stem_types) 149 | if len(cmol.stems) 150 | else long_tensor([mdp.num_stem_types]), 151 | ) 152 | data.to(mdp.device) 153 | return data 154 | 155 | 156 | def mols2batch(mols: List[Data], mdp: MarkovDecisionProcess) -> Batch: 157 | """ 158 | TODO 159 | """ 160 | batch = Batch.from_data_list(mols, follow_batch=["stems"]) 161 | batch.to(mdp.device) 162 | return batch 163 | -------------------------------------------------------------------------------- /rxitect/envs/fragment_env.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import networkx as nx 4 | 5 | from rxitect.envs.contexts import Action, ActionType, Graph, StateActionPair 6 | 7 | 8 | class FragmentEnv: 9 | """ 10 | A Graph building environment which induces a DAG state space, compatible with GFlowNet. 11 | Supports forward and backward actions, with a `parents` function that list parents of 12 | forward actions. 13 | Edges and nodes can have attributes added to them in a key:value style. 14 | Edges and nodes are created with _implicit_ default attribute 15 | values (e.g. chirality, single/double bondness) so that: 16 | - an agent gets to do an extra action to set that attribute, but only 17 | if it is still default-valued (DAG property preserved) 18 | - we can generate a legal action for any attribute that isn't a default one. 
19 | """ 20 | 21 | def __init__( 22 | self, 23 | allow_add_edge: bool = True, 24 | allow_node_attr: bool = True, 25 | allow_edge_attr: bool = True, 26 | ): 27 | """A graph building environment instance 28 | Parameters 29 | ---------- 30 | allow_add_edge: bool 31 | if True, allows this action and computes AddEdge parents (i.e. if False, this 32 | env only allows for tree generation) 33 | allow_node_attr: bool 34 | if True, allows this action and computes SetNodeAttr parents 35 | allow_edge_attr: bool 36 | if True, allows this action and computes SetEdgeAttr parents 37 | """ 38 | self.allow_add_edge = allow_add_edge 39 | self.allow_node_attr = allow_node_attr 40 | self.allow_edge_attr = allow_edge_attr 41 | 42 | @staticmethod 43 | def new(): 44 | return Graph() 45 | 46 | def step(self, g: Graph, action: Action) -> Graph: 47 | """Step forward the given graph state with an action 48 | Parameters 49 | ---------- 50 | g: Graph 51 | the graph to be modified 52 | action: GraphAction 53 | the action taken on the graph, indices must match 54 | Returns 55 | ------- 56 | gp: Graph 57 | the new graph 58 | """ 59 | gp = g.copy() 60 | if action.act_type is ActionType.ADD_EDGE: 61 | a, b = action.source, action.target 62 | assert self.allow_add_edge 63 | assert a in g and b in g 64 | if a > b: 65 | a, b = b, a 66 | assert a != b 67 | assert not g.has_edge(a, b) 68 | # Ideally the FA underlying this must only be able to send 69 | # create_edge actions which respect this a List[StateActionPair]: 103 | """List possible parents of graph `g` 104 | Parameters 105 | ---------- 106 | g: Graph 107 | graph 108 | Returns 109 | ------- 110 | parents: List[Pair(GraphAction, Graph)] 111 | The list of parent-action pairs that lead to `g`. 
112 | """ 113 | raise NotImplementedError() 114 | 115 | @staticmethod 116 | def count_backward_transitions(g: Graph): 117 | """Counts the number of parents of g without checking for isomorphisms""" 118 | c = 0 119 | deg = [g.degree[i] for i in range(len(g.nodes))] 120 | for a, b in g.edges: 121 | if deg[a] > 1 and deg[b] > 1 and len(g.edges[(a, b)]) == 0: 122 | # Can only remove edges connected to non-leaves and without 123 | # attributes (the agent has to remove the attrs, then remove 124 | # the edge). Removal cannot disconnect the graph. 125 | new_g = graph_without_edge(g, (a, b)) 126 | if nx.algorithms.is_connected(new_g): 127 | c += 1 128 | c += len(g.edges[(a, b)]) # One action per edge attr 129 | for i in g.nodes: 130 | if ( 131 | deg[i] == 1 132 | and len(g.nodes[i]) == 1 133 | and len(g.edges[list(g.edges(i))[0]]) == 0 134 | ): 135 | c += 1 136 | c += len(g.nodes[i]) - 1 # One action per node attr, except 'v' 137 | if len(g.nodes) == 1 and len(g.nodes[i]) == 1: 138 | # special case if last node in graph 139 | c += 1 140 | return c 141 | 142 | 143 | # TODO: Move these to a graph utils file 144 | def graph_without_edge(g, e): 145 | gp = g.copy() 146 | gp.remove_edge(*e) 147 | return gp 148 | -------------------------------------------------------------------------------- /rxitect/models/transformers.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING, Callable 4 | 5 | import torch 6 | import torch_geometric.nn as gnn 7 | from torch import nn 8 | from torch_geometric.data import Batch 9 | from torch_geometric.utils import add_self_loops 10 | 11 | 12 | class GraphTransformer(nn.Module): 13 | """An agnostic GraphTransformer class, and the main model used by other model classes 14 | This graph model takes in node features, edge features, and graph features (referred to as 15 | conditional information, since they condition the output). 
The graph features are projected to 16 | virtual nodes (one per graph), which are fully connected. 17 | 18 | The per node outputs are the concatenation of the final (post graph-convolution) node embeddings 19 | and of the final virtual node embedding of the graph each node corresponds to. 20 | The per graph outputs are the concatenation of a global mean pooling operation, of the final 21 | virtual node embeddings, and of the conditional information embedding. 22 | """ 23 | 24 | def __init__( 25 | self, 26 | x_dim: int, 27 | e_dim: int, 28 | g_dim: int, 29 | num_emb: int = 64, 30 | num_layers: int = 3, 31 | num_heads: int = 2, 32 | ): 33 | """ 34 | Parameters 35 | ---------- 36 | x_dim: int 37 | The number of node features 38 | e_dim: int 39 | The number of edge features 40 | g_dim: int 41 | The number of graph-level features 42 | num_emb: int 43 | The number of hidden dimensions, i.e. embedding size. Default 64. 44 | num_layers: int 45 | The number of Transformer layers. 46 | num_heads: int 47 | The number of Transformer heads per layer. 48 | """ 49 | super().__init__() 50 | self.num_layers = num_layers 51 | 52 | self.x2h = create_mlp(x_dim, num_emb, num_emb, 2) 53 | self.e2h = create_mlp(e_dim, num_emb, num_emb, 2) 54 | self.c2h = create_mlp(g_dim, num_emb, num_emb, 2) 55 | self.graph2emb = nn.ModuleList( 56 | sum( 57 | [ 58 | [ 59 | gnn.GENConv( 60 | num_emb, num_emb, num_layers=1, aggr="add", norm=None 61 | ), 62 | gnn.TransformerConv( 63 | num_emb * 2, num_emb, edge_dim=num_emb, heads=num_heads 64 | ), 65 | nn.Linear(num_heads * num_emb, num_emb), 66 | gnn.LayerNorm(num_emb, affine=False), 67 | create_mlp(num_emb, num_emb * 4, num_emb, 1), 68 | gnn.LayerNorm(num_emb, affine=False), 69 | ] 70 | for i in range(self.num_layers) 71 | ], 72 | [], 73 | ) 74 | ) 75 | 76 | def forward(self, g: Batch, cond: torch.Tensor): 77 | """Forward pass 78 | Parameters 79 | ---------- 80 | g: Batch 81 | A standard torch_geometric Batch object. Expects `edge_attr` to be set. 
def create_mlp(
    n_in: int,
    n_hid: int,
    n_out: int,
    n_layer: int,
    activation_fn: Callable = nn.LeakyReLU,
):
    """Build a fully-connected network without an activation after its last layer.

    With ``n_layer == 0`` the result wraps a single ``nn.Linear(n_in, n_out)``.

    Parameters
    ----------
    n_in: int
        Number of input features of the first linear layer.
    n_hid: int
        Width of every hidden layer.
    n_out: int
        Number of output features of the last linear layer.
    n_layer: int
        Number of hidden layers; the network has ``n_layer + 1`` linear layers.
    activation_fn: Callable
        Zero-argument factory that is called once per linear layer to create
        the activation placed after it (the one after the last layer is dropped).

    Returns
    -------
    nn.Sequential
        Linear layers interleaved with activations.
    """
    widths = [n_in, *([n_hid] * n_layer), n_out]
    modules = []
    for layer_idx in range(n_layer + 1):
        modules.append(nn.Linear(widths[layer_idx], widths[layer_idx + 1]))
        modules.append(activation_fn())
    # Drop the trailing activation so the output stays linear.
    return nn.Sequential(*modules[:-1])
    def __init__(
        self,
        vocabulary_filepath: str,
        molecule_repr: str = "smiles",
        embedding_size: int = 128,
        hidden_size: int = 512,
        num_layers: int = 3,
        lr: float = 1e-3,
        weight_decay: float = 0,
    ) -> None:
        """Build the tokenizer, embedding, LSTM stack and output projection.

        Parameters
        ----------
        vocabulary_filepath : str
            Path of the vocabulary file; forwarded to `get_tokenizer` to build
            the tokenizer whose vocabulary size fixes the embedding and output sizes.
        molecule_repr : str, optional
            The type of molecular (string) representation to use (default is "smiles")
        embedding_size : int, optional
            The size of the embedding layer (default is 128)
        hidden_size : int, optional
            The size of the hidden layer (default is 512)
        num_layers : int, optional
            Number of stacked LSTM layers, passed to `nn.LSTM` (default is 3)
        lr: float, optional
            The learning rate for the LSTM generator (default is 1e-3)
        weight_decay: float, optional
            Weight decay stored on the module and handed to the Adam optimizer
            in `configure_optimizers` (default is 0)
        """
        super().__init__()
        self.save_hyperparameters()
        self.tokenizer = get_tokenizer(
            molecule_repr,
            vocabulary_filepath=vocabulary_filepath,
        )
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        # One embedding row per vocabulary token.
        self.embedding_layer = nn.Embedding(
            num_embeddings=self.tokenizer.vocabulary_size_, embedding_dim=embedding_size
        )
        self.num_layers = num_layers
        # batch_first=True: inputs to the LSTM are laid out as (batch, seq, feature).
        self.lstm = nn.LSTM(
            embedding_size, hidden_size, num_layers=num_layers, batch_first=True
        )
        # Projects each hidden state to one logit per vocabulary token.
        self.output_layer = nn.Linear(hidden_size, self.tokenizer.vocabulary_size_)
        self.lr = lr
        self.weight_decay = weight_decay
88 | loss = self.likelihood(batch) 89 | loss = -loss.mean() 90 | self.log( 91 | "train/loss", loss, on_step=False, on_epoch=True, prog_bar=True, logger=True 92 | ) 93 | return loss 94 | 95 | def validation_step( 96 | self, batch: torch.Tensor, batch_idx: torch.Tensor 97 | ) -> torch.Tensor: 98 | loss = self.likelihood(batch) 99 | loss = -loss.mean() 100 | self.log( 101 | "val/loss", loss, on_step=False, on_epoch=True, prog_bar=True, logger=True 102 | ) 103 | return loss 104 | 105 | def on_validation_epoch_end(self) -> None: 106 | sequences = self.sample(1024) 107 | sequences = utils.filter_duplicate_tensors(sequences) 108 | valid_arr = [ 109 | utils.is_valid_smiles(smi) for smi in self.tokenizer.batch_decode(sequences) 110 | ] 111 | frac_valid = sum(valid_arr) / len(valid_arr) 112 | frac_unique = sum(valid_arr) / 1024 113 | self.log("frac_valid_smiles", frac_valid) 114 | self.log("frac_unique_smiles", frac_unique) 115 | 116 | def configure_optimizers(self): 117 | optimizer = torch.optim.Adam( 118 | self.parameters(), lr=self.lr, weight_decay=self.weight_decay 119 | ) 120 | return optimizer 121 | 122 | def init_hidden(self, batch_size: int) -> Tuple[torch.Tensor, torch.Tensor]: 123 | h = torch.rand( 124 | self.num_layers, batch_size, self.hidden_size, device=self.device 125 | ) 126 | c = torch.rand( 127 | self.num_layers, batch_size, self.hidden_size, device=self.device 128 | ) 129 | return h, c 130 | 131 | def likelihood(self, target: torch.Tensor) -> torch.Tensor: 132 | batch_size, seq_len = target.size() 133 | x = torch.tensor( 134 | [self.tokenizer.tk2ix_[self.tokenizer.start_token]] * batch_size, 135 | device=self.device, 136 | dtype=torch.long, 137 | ) 138 | h = self.init_hidden(batch_size) 139 | scores = torch.zeros(batch_size, seq_len, device=self.device) 140 | for step in range(seq_len): 141 | logits, h = self(x, h) 142 | logits = logits.log_softmax(dim=-1) 143 | score = logits.gather(1, target[:, step : step + 1]).squeeze() 144 | scores[:, step] = score 
class Tokenizer(ABC):
    """Abstract base for tokenizers that map molecule strings to index tensors."""

    start_token: str
    stop_token: str
    pad_token: str

    # Inferred attrs
    vocabulary_size_: int
    tk2ix_: Dict[str, int]
    ix2tk_: Dict[int, str]

    @abstractmethod
    def encode(self, molecules: List[str]) -> torch.Tensor:
        """Turn molecule string(s) into a tensor of token indices."""

    @abstractmethod
    def decode(self, encoded_molecules: torch.Tensor) -> List[str]:
        """Turn a tensor of token indices back into molecule string(s)."""

    def _get_vocabulary_from_file(self, vocabulary_filepath: str) -> List[str]:
        """Read one token per line from `vocabulary_filepath` and return them sorted."""
        with open(vocabulary_filepath, "r") as handle:
            tokens = handle.read().splitlines()
        tokens.sort()
        return tokens
vocabulary_filepath: str) -> None: 36 | self.pad_token = "" 37 | self.start_token = "GO" 38 | self.stop_token = "EOS" 39 | SENTINEL_TOKENS = [self.pad_token, self.start_token, self.stop_token] 40 | self.vocabulary = SENTINEL_TOKENS + self._get_vocabulary_from_file( 41 | vocabulary_filepath 42 | ) 43 | self.vocabulary_size_ = len(self.vocabulary) 44 | self.tk2ix_ = dict(zip(self.vocabulary, range(self.vocabulary_size_))) 45 | self.ix2tk_ = {ix: tk for tk, ix in self.tk2ix_.items()} 46 | 47 | def encode(self, molecule: str) -> torch.Tensor: 48 | tokenized_smiles = self._tokenize(molecule) 49 | encoded_smiles = torch.zeros(len(tokenized_smiles), dtype=torch.long) 50 | for i, token in enumerate(tokenized_smiles): 51 | encoded_smiles[i] = self.tk2ix_[token] 52 | return encoded_smiles 53 | 54 | def batch_encode(self, molecules: List[str]) -> torch.Tensor: 55 | max_len = max([len(mol) for mol in molecules]) 56 | encoded_smiles = torch.zeros(len(molecules), max_len, dtype=torch.long) 57 | for i, smi in enumerate(molecules): 58 | tokenized_smi = self._tokenize(smi) 59 | for j, token in enumerate(tokenized_smi): 60 | encoded_smiles[i, j] = self.tk2ix_[token] 61 | return encoded_smiles 62 | 63 | def decode(self, encoded_molecule: torch.Tensor) -> List[str]: 64 | encoded_molecule = encoded_molecule.cpu().detach().numpy() 65 | chars = [] 66 | for i in encoded_molecule: 67 | if i == self.tk2ix_[self.stop_token]: 68 | break 69 | chars.append(self.ix2tk_[i]) 70 | smiles = "".join(chars) 71 | smiles = smiles.replace("L", "Cl").replace("R", "Br") 72 | return smiles 73 | 74 | def batch_decode(self, encoded_molecules: torch.Tensor) -> List[str]: 75 | decoded_smiles = [] 76 | encoded_molecules = encoded_molecules.cpu().detach().numpy() 77 | for enc_smiles in encoded_molecules: 78 | chars = [] 79 | for i in enc_smiles: 80 | if i == self.tk2ix_[self.stop_token]: 81 | break 82 | chars.append(self.ix2tk_[i]) 83 | smiles = "".join(chars) 84 | smiles = smiles.replace("L", "Cl").replace("R", 
"Br") 85 | decoded_smiles.append(smiles) 86 | return decoded_smiles 87 | 88 | def _tokenize(self, smiles: str) -> List[str]: 89 | """ 90 | Takes a SMILES string and returns a list containing the tokens its composed of. 91 | SOURCE: https://github.com/MarcusOlivecrona/REINVENT/ 92 | 93 | Parameters 94 | ---------- 95 | smiles: A SMILES string representing a molecule 96 | """ 97 | regex = "(\[[^\[\]]{1,6}\])" 98 | smiles = self._replace_halogen(smiles) 99 | char_list = re.split(regex, smiles) 100 | tokenized = [] 101 | for char in char_list: 102 | if char.startswith("["): 103 | tokenized.append(char) 104 | else: 105 | chars = [unit for unit in char] 106 | [tokenized.append(unit) for unit in chars] 107 | tokenized.append(self.stop_token) 108 | return tokenized 109 | 110 | def _replace_halogen(self, smiles: str) -> str: 111 | """Regex to replace Br and Cl with single letters""" 112 | br = re.compile("Br") 113 | cl = re.compile("Cl") 114 | smiles = br.sub("R", smiles) 115 | smiles = cl.sub("L", smiles) 116 | 117 | return smiles 118 | 119 | 120 | class SelfiesTokenizer(Tokenizer): 121 | def __init__(self, vocabulary_filepath: str, max_len: int) -> None: 122 | self.start_token = "[GO]" 123 | self.stop_token = "[EOS]" 124 | self.pad_token = "[nop]" 125 | SENTINEL_TOKENS = [self.pad_token, self.start_token, self.stop_token] 126 | self.vocabulary = SENTINEL_TOKENS + self._get_vocabulary_from_file( 127 | vocabulary_filepath 128 | ) 129 | self.vocabulary_size_ = len(self.vocabulary) 130 | self.max_len = max_len 131 | self.tk2ix_ = dict(zip(self.vocabulary, range(self.vocabulary_size_))) 132 | self.ix2tk_ = {ix: tk for tk, ix in self.tk2ix_.items()} 133 | 134 | def encode(self, molecule: List[str]) -> torch.Tensor: 135 | print("Encoding some SELFIES!") 136 | encoded_smiles = torch.zeros(self.max_len, dtype=torch.long) 137 | tokenized_smiles = self._tokenize(molecule) 138 | for i, token in enumerate(tokenized_smiles): 139 | encoded_smiles[i] = self.tk2ix_[token] 140 | return 
def get_tokenizer(
    molecule_repr: str, vocabulary_filepath: str, max_len: "int | None" = None
) -> Tokenizer:
    """Create the tokenizer matching a molecular string representation.

    Parameters
    ----------
    molecule_repr: str
        Either "smiles" or "selfies".
    vocabulary_filepath: str
        Path of the vocabulary file handed to the tokenizer.
    max_len: int, optional
        Fixed encoding length required by `SelfiesTokenizer`. Ignored for
        "smiles". Previously the "selfies" branch never supplied it, so
        building a SELFIES tokenizer always failed with a ``TypeError``.

    Raises
    ------
    ValueError
        If `molecule_repr` is unknown, or if "selfies" is requested without
        `max_len`.
    """
    if molecule_repr == "smiles":
        return SmilesTokenizer(vocabulary_filepath=vocabulary_filepath)
    if molecule_repr == "selfies":
        if max_len is None:
            raise ValueError("max_len is required for the 'selfies' representation")
        return SelfiesTokenizer(
            vocabulary_filepath=vocabulary_filepath, max_len=max_len
        )
    raise ValueError(molecule_repr)
wrap_model_mp 18 | 19 | if TYPE_CHECKING: 20 | from rxitect.algorithms.gfn_algorithm import GFNAlgorithm 21 | from rxitect.envs.contexts import GraphEnvContext 22 | from rxitect.tasks.gfn_task import GFNTask 23 | 24 | 25 | class GFNTrainer: 26 | def __init__(self, hps: Dict[str, Any], device: Device): 27 | """A GFlowNet trainer. Contains the main training loop in `run` and should be subclassed. 28 | Parameters 29 | ---------- 30 | hps: Dict[str, Any] 31 | A dictionary of hyperparameters. These override default values obtained by the `default_hps` method. 32 | device: Device 33 | The torch device of the main worker. 34 | """ 35 | # self.setup should at least set these up: 36 | self.training_data: Dataset = None 37 | self.test_data: Dataset = None 38 | self.model: nn.Module = None 39 | # `sampling_model` is used by the data workers to sample new objects from the model. Can be 40 | # the same as `model`. 41 | self.sampling_model: nn.Module = None 42 | self.mb_size: int = None 43 | self.ctx: GraphEnvContext = None 44 | self.task: GFNTask = None 45 | self.algo: GFNAlgorithm = None 46 | 47 | # Override default hyperparameters with the constructor arguments 48 | self.hps = {**self.default_hps(), **hps} 49 | self.device = device 50 | # The number of processes spawned to sample object and do CPU work 51 | self.num_workers: int = self.hps.get("num_data_loader_workers", 0) 52 | # The offline_ratio of samples drawn from `self.training_data` during training. The rest is drawn from 53 | # `self.sampling_model`. 54 | self.offline_ratio: float = 0.5 55 | # idem, but from `self.test_data` during validation. 
56 | self.valid_offline_ratio: float = 1 57 | # If True, print messages during training 58 | self.verbose: bool = False 59 | # These hooks allow us to compute extra quantities when sampling data 60 | self.sampling_hooks: List[Callable] = [] 61 | 62 | self.setup() 63 | 64 | def default_hps(self) -> Dict[str, Any]: 65 | raise NotImplementedError() 66 | 67 | def setup(self): 68 | raise NotImplementedError() 69 | 70 | def step(self, loss: torch.Tensor): 71 | raise NotImplementedError() 72 | 73 | def _wrap_model_mp(self, model): 74 | """Wraps a nn.Module instance so that it can be shared to `DataLoader` workers.""" 75 | model.to(self.device) 76 | if self.num_workers > 0: 77 | placeholder = wrap_model_mp( 78 | model, self.num_workers, cast_types=(Batch, ActionCategorical) 79 | ) 80 | return placeholder, torch.device("cpu") 81 | return model, self.device 82 | 83 | def build_training_data_loader(self) -> DataLoader: 84 | model, dev = self._wrap_model_mp(self.sampling_model) 85 | iterator = SamplingIterator( 86 | self.training_data, 87 | model, 88 | self.mb_size * 2, 89 | self.ctx, 90 | self.algo, 91 | self.task, 92 | dev, 93 | offline_ratio=self.offline_ratio, 94 | log_dir=self.hps["log_dir"], 95 | ) 96 | for hook in self.sampling_hooks: 97 | iterator.add_log_hook(hook) 98 | return torch.utils.data.DataLoader( 99 | iterator, 100 | batch_size=None, 101 | num_workers=self.num_workers, 102 | persistent_workers=self.num_workers > 0, 103 | ) 104 | 105 | def build_validation_data_loader(self) -> DataLoader: 106 | model, dev = self._wrap_model_mp(self.model) 107 | iterator = SamplingIterator( 108 | self.test_data, 109 | model, 110 | self.mb_size, 111 | self.ctx, 112 | self.algo, 113 | self.task, 114 | dev, 115 | offline_ratio=self.valid_offline_ratio, 116 | stream=False, 117 | ) 118 | return torch.utils.data.DataLoader( 119 | iterator, 120 | batch_size=None, 121 | num_workers=self.num_workers, 122 | persistent_workers=self.num_workers > 0, 123 | ) 124 | 125 | def train_batch( 126 
| self, batch: Batch, epoch_idx: int, batch_idx: int 127 | ) -> Dict[str, Any]: 128 | loss, info = self.algo.compute_batch_losses( 129 | self.model, batch, num_bootstrap=self.mb_size 130 | ) 131 | self.step(loss) 132 | if hasattr(batch, "extra_info"): 133 | info.update(batch.extra_info) 134 | return {k: v.item() if hasattr(v, "item") else v for k, v in info.items()} 135 | 136 | def evaluate_batch( 137 | self, batch: Batch, epoch_idx: int = 0, batch_idx: int = 0 138 | ) -> Dict[str, Any]: 139 | loss, info = self.algo.compute_batch_losses( 140 | self.model, batch, num_bootstrap=batch.num_offline 141 | ) 142 | return {k: v.item() if hasattr(v, "item") else v for k, v in info.items()} 143 | 144 | def run(self): 145 | """Trains the GFN for `num_training_steps` minibatches, performing 146 | validation every `validate_every` minibatches. 147 | """ 148 | self.model.to(self.device) 149 | self.sampling_model.to(self.device) 150 | epoch_length = max(len(self.training_data), 1) 151 | train_dl = self.build_training_data_loader() 152 | valid_dl = self.build_validation_data_loader() 153 | for it, batch in zip(range(1, 1 + self.hps["num_training_steps"]), train_dl): 154 | epoch_idx = it // epoch_length 155 | batch_idx = it % epoch_length 156 | info = self.train_batch(batch.to(self.device), epoch_idx, batch_idx) 157 | if self.verbose: 158 | print(it, " ".join(f"{k}:{v:.2f}" for k, v in info.items())) 159 | self.log(info, it, "train") 160 | 161 | if it % self.hps["validate_every"] == 0: 162 | for val_batch in valid_dl: 163 | info = self.evaluate_batch( 164 | val_batch.to(self.device), epoch_idx, batch_idx 165 | ) 166 | self.log(info, it, "valid") 167 | torch.save( 168 | { 169 | "models_state_dict": [self.model.state_dict()], 170 | "hps": self.hps, 171 | }, 172 | open(Path(self.hps["log_dir"]) / "model_state.pt", "wb"), 173 | ) 174 | 175 | def log(self, info, index, key): 176 | if not hasattr(self, "_summary_writer"): 177 | self._summary_writer = SummaryWriter(self.hps["log_dir"]) 
178 | for k, v in info.items(): 179 | self._summary_writer.add_scalar(f"{key}_{k}", v, index) 180 | -------------------------------------------------------------------------------- /rxitect/tasks/original_task.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import copy 3 | 4 | import numpy as np 5 | from rdkit import RDLogger 6 | from rdkit.Chem.rdchem import Mol 7 | from scipy.stats import stats 8 | from torch.types import Device 9 | 10 | from rxitect.algorithms import TrajectoryBalance 11 | from rxitect.envs import FragmentEnv, FragmentEnvContext 12 | from rxitect.models import bengio2021flow, FragmentBasedGFN 13 | from rxitect.tasks import GFNTask, FlatRewards, ScalarReward 14 | import torch 15 | from torch_geometric.data import Batch 16 | from torch.utils.data import DataLoader, Dataset 17 | from torch import nn 18 | from typing import TYPE_CHECKING, Tuple, Union, List, Callable, Dict, Any 19 | 20 | from rxitect.trainers import GFNTrainer 21 | from rxitect.utils.transforms import thermometer 22 | 23 | 24 | class SEHTask(GFNTask): 25 | """Sets up a task where the reward is computed using a proxy for the binding energy of a molecule to 26 | Soluble Epoxide Hydrolases. 27 | The proxy is pretrained, and obtained from the original GFlowNet paper, see `gflownet.models.bengio2021flow`. 28 | This setup essentially reproduces the results of the Trajectory Balance paper when using the TB 29 | objective, or of the original paper when using Flow Matching (TODO: port to this repo). 
30 | """ 31 | def __init__(self, dataset: Dataset, temperature_distribution: str, temperature_parameters: Tuple[float], 32 | wrap_model: Callable[[nn.Module], nn.Module] = None): 33 | self._wrap_model = wrap_model 34 | self.models = self._load_task_models() 35 | self.dataset = dataset 36 | self.temperature_sample_dist = temperature_distribution 37 | self.temperature_dist_params = temperature_parameters 38 | 39 | def flat_reward_transform(self, y: Union[float, torch.Tensor]) -> FlatRewards: 40 | return FlatRewards(torch.as_tensor(y) / 8) 41 | 42 | def inverse_flat_reward_transform(self, rp): 43 | return rp * 8 44 | 45 | def _load_task_models(self): 46 | model = bengio2021flow.load_original_model() 47 | model, self.device = self._wrap_model(model) 48 | return {'seh': model} 49 | 50 | def sample_conditional_information(self, n): 51 | beta = None 52 | if self.temperature_sample_dist == 'gamma': 53 | loc, scale = self.temperature_dist_params 54 | beta = self.rng.gamma(loc, scale, n).astype(np.float32) 55 | upper_bound = stats.gamma.ppf(0.95, loc, scale=scale) 56 | elif self.temperature_sample_dist == 'uniform': 57 | beta = self.rng.uniform(*self.temperature_dist_params, n).astype(np.float32) 58 | upper_bound = self.temperature_dist_params[1] 59 | elif self.temperature_sample_dist == 'beta': 60 | beta = self.rng.beta(*self.temperature_dist_params, n).astype(np.float32) 61 | upper_bound = 1 62 | else: 63 | raise ValueError() 64 | beta_enc = thermometer(torch.tensor(beta), 32, 0, upper_bound) # TODO: hyperparameters 65 | return {'beta': torch.tensor(beta), 'encoding': beta_enc} 66 | 67 | def cond_info_to_reward(self, cond_info: Dict[str, torch.Tensor], flat_reward: FlatRewards) -> ScalarReward: 68 | if isinstance(flat_reward, list): 69 | flat_reward = torch.tensor(flat_reward) 70 | return flat_reward**cond_info['beta'] 71 | 72 | def compute_flat_rewards(self, mols: List[Mol]) -> Tuple[FlatRewards, torch.Tensor]: 73 | graphs = [bengio2021flow.mol2graph(i) for i in mols] 74 
    def default_hps(self) -> Dict[str, Any]:
        """Return the default hyperparameters of the sEH fragment trainer.

        Values passed to the constructor override these defaults. Keys without
        a comment below are not read in this class.
        NOTE(review): those are presumably consumed by the algorithm, which
        receives the whole dict in `setup` - confirm.
        """
        return {
            'bootstrap_own_reward': False,
            # Initial learning rate of both Adam optimizers built in `setup`.
            'learning_rate': 1e-4,
            # Becomes `self.mb_size`.
            'global_batch_size': 64,
            # Embedding width and depth handed to `FragmentBasedGFN`.
            'num_emb': 128,
            'num_layers': 4,
            # May be given as a string; `setup` parses strings with `ast.literal_eval`.
            'tb_epsilon': None,
            'illegal_action_logreward': -75,
            'reward_loss_multiplier': 1,
            # Distribution name and parameters for the sampled temperature.
            # The parameters are a string that `setup` parses with `ast.literal_eval`.
            'temperature_sample_dist': 'uniform',
            'temperature_dist_params': '(.5, 32)',
            # Adam settings for the non-Z parameters; `momentum` is the first beta.
            'weight_decay': 1e-8,
            # Read by `GFNTrainer.__init__` as the number of data-loader workers.
            'num_data_loader_workers': 1,
            'momentum': 0.9,
            'adam_eps': 1e-8,
            # Learning rates are scaled by 2 ** (-steps / decay), i.e. halved every `decay` steps.
            'lr_decay': 20_000,
            'Z_lr_decay': 20_000,
            # One of 'value', 'norm' or 'none'; selects the clipping callback in `setup`.
            'clip_grad_type': 'norm',
            'clip_grad_param': 10,
            'random_action_prob': 0.,
            # When > 0, `setup` deep-copies the model as a separate sampling model
            # that `step` blends towards the trained model.
            'sampling_tau': 0.,
            # Passed to `FragmentEnvContext` as `num_cond_dim`.
            'num_cond_dim': 32,
        }
    def step(self, loss: torch.Tensor):
        """Run one optimisation step for `loss`.

        Back-propagates, clips gradients, steps and resets both optimizers,
        advances both learning-rate schedulers and, when `sampling_tau > 0`,
        updates the sampling model as a moving average of the trained model.

        Parameters
        ----------
        loss: torch.Tensor
            The loss to back-propagate.
        """
        loss.backward()
        # The clipping callback receives one parameter at a time, so with
        # 'norm' clipping each tensor's gradient norm is bounded separately.
        for i in self.model.parameters():
            self.clip_grad_callback(i)
        self.opt.step()
        self.opt.zero_grad()
        self.opt_Z.step()
        self.opt_Z.zero_grad()
        self.lr_sched.step()
        self.lr_sched_Z.step()
        if self.sampling_tau > 0:
            # In-place update: b <- tau * b + (1 - tau) * a
            for a, b in zip(self.model.parameters(), self.sampling_model.parameters()):
                b.data.mul_(self.sampling_tau).add_(a.data * (1 - self.sampling_tau))
be run outside Determined""" 172 | from pyprojroot import here 173 | log_dir = str(here() / 'scratch/logs/seh_frag/run_0/') 174 | hps = { 175 | 'lr_decay': 10, 176 | 'qm9_h5_path': 'data/chem/qm9/qm9.h5', 177 | 'log_dir': log_dir, 178 | 'num_training_steps': 10, 179 | 'validate_every': 5, 180 | 'sampling_tau': 0.99, 181 | 'temperature_dist_params': '(0, 64)', 182 | } 183 | trial = SEHFragTrainer(hps, torch.device('cpu')) 184 | trial.verbose = True 185 | print(f"params: {trial.hps}") 186 | trial.run() 187 | 188 | 189 | if __name__ == "__main__": 190 | main() 191 | -------------------------------------------------------------------------------- /rxitect/models/bengio2021flow.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is code adapted from Bengio et al. (2021), 'Flow Network based 3 | Generative Models for Non-Iterative Diverse Candidate Generation', 4 | from 5 | https://github.com/GFNOrg/gflownet 6 | In particular, this model class allows us to compare to the same 7 | target proxy used in that paper (sEH binding affinity prediction). 8 | """ 9 | import gzip 10 | import os 11 | import pickle # nosec 12 | 13 | import numpy as np 14 | import requests # type: ignore 15 | import torch 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | from rdkit import RDConfig 19 | from rdkit.Chem import ChemicalFeatures 20 | from rdkit.Chem.rdchem import BondType as BT 21 | from rdkit.Chem.rdchem import HybridizationType 22 | from torch_geometric.data import Batch, Data 23 | from torch_geometric.nn import NNConv, Set2Set 24 | from torch_sparse import coalesce 25 | 26 | NUM_ATOMIC_NUMBERS = 56 # Number of atoms used in the molecules (i.e. 
def load_original_model(timeout: float = 60.0):
    """Download the pretrained sEH proxy weights and load them into an `MPNNet`.

    Parameters
    ----------
    timeout: float
        Seconds passed to `requests.get` as its timeout, so a stalled
        connection raises instead of hanging forever. Default 60.

    Returns
    -------
    MPNNet
        The proxy network with the downloaded parameters assigned.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Previously an error page
        would have been handed straight to gzip/pickle.

    Notes
    -----
    SECURITY: the payload is unpickled. `pickle.load` can execute arbitrary
    code, so this is only as trustworthy as the remote repository and the
    transport; consider pinning a checksum or vendoring the weights.
    """
    num_feat = 14 + 1 + NUM_ATOMIC_NUMBERS
    mpnn = MPNNet(
        num_feat=num_feat,
        num_vec=0,
        dim=64,
        num_out_per_mol=1,
        num_out_per_stem=105,
        num_conv_steps=12,
    )
    # Context managers release the connection and the gzip stream; both were
    # left open before.
    with requests.get(
        "https://github.com/GFNOrg/gflownet/raw/master/mols/data/pretrained_proxy/best_params.pkl.gz",
        stream=True,
        timeout=timeout,
    ) as f:
        f.raise_for_status()
        with gzip.open(f.raw) as params_file:
            params = pickle.load(params_file)  # nosec
    # Position of each tensor in the pickled list, keyed by parameter name.
    param_map = {
        "lin0.weight": params[0],
        "lin0.bias": params[1],
        "conv.bias": params[3],
        "conv.nn.0.weight": params[4],
        "conv.nn.0.bias": params[5],
        "conv.nn.2.weight": params[6],
        "conv.nn.2.bias": params[7],
        "conv.lin.weight": params[2],
        "gru.weight_ih_l0": params[8],
        "gru.weight_hh_l0": params[9],
        "gru.bias_ih_l0": params[10],
        "gru.bias_hh_l0": params[11],
        "set2set.lstm.weight_ih_l0": params[16],
        "set2set.lstm.weight_hh_l0": params[17],
        "set2set.lstm.bias_ih_l0": params[18],
        "set2set.lstm.bias_hh_l0": params[19],
        "lin3.weight": params[20],
        "lin3.bias": params[21],
    }
    for k, v in param_map.items():
        mpnn.get_parameter(k).data = torch.tensor(v)
    return mpnn
]) 141 | 142 | nfeat = ntypes + 1 + 8 143 | if one_hot_atom: 144 | nfeat += NUM_ATOMIC_NUMBERS 145 | atmfeat = np.zeros((natm, nfeat)) 146 | 147 | # featurize 148 | for i, atom in enumerate(mol.GetAtoms()): 149 | type_idx = atomtypes.get(atom.GetSymbol(), 5) 150 | atmfeat[i, type_idx] = 1 151 | if one_hot_atom: 152 | atmfeat[i, ntypes + 9 + atom.GetAtomicNum() - 1] = 1 153 | else: 154 | atmfeat[i, ntypes + 1] = (atom.GetAtomicNum() % 16) / 2.0 155 | atmfeat[i, ntypes + 4] = atom.GetIsAromatic() 156 | hybridization = atom.GetHybridization() 157 | atmfeat[i, ntypes + 5] = hybridization == HybridizationType.SP 158 | atmfeat[i, ntypes + 6] = hybridization == HybridizationType.SP2 159 | atmfeat[i, ntypes + 7] = hybridization == HybridizationType.SP3 160 | atmfeat[i, ntypes + 8] = atom.GetTotalNumHs(includeNeighbors=True) 161 | 162 | # get donors and acceptors 163 | if donor_features: 164 | if _mpnn_feat_cache[0] is None: 165 | fdef_name = os.path.join(RDConfig.RDDataDir, "BaseFeatures.fdef") 166 | factory = ChemicalFeatures.BuildFeatureFactory(fdef_name) 167 | _mpnn_feat_cache[0] = factory 168 | else: 169 | factory = _mpnn_feat_cache[0] 170 | feats = factory.GetFeaturesForMol(mol) 171 | for j in range(0, len(feats)): 172 | if feats[j].GetFamily() == "Donor": 173 | node_list = feats[j].GetAtomIds() 174 | for k in node_list: 175 | atmfeat[k, ntypes + 3] = 1 176 | elif feats[j].GetFamily() == "Acceptor": 177 | node_list = feats[j].GetAtomIds() 178 | for k in node_list: 179 | atmfeat[k, ntypes + 2] = 1 180 | # get coord 181 | if ifcoord: 182 | coord = np.asarray( 183 | [mol.GetConformer(0).GetAtomPosition(j) for j in range(natm)] 184 | ) 185 | else: 186 | coord = None 187 | # get bonds and bond features 188 | bond = np.asarray( 189 | [[bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()] for bond in mol.GetBonds()] 190 | ) 191 | bondfeat = [bondtypes[bond.GetBondType()] for bond in mol.GetBonds()] 192 | bondfeat = onehot(bondfeat, num_classes=len(bondtypes) - 1) 193 | 194 | return 
def onehot(arr, num_classes, dtype=int):
    """One-hot encode a 1-D sequence of integer class labels.

    Parameters
    ----------
    arr: array_like
        1-D sequence of class indices; each index must be valid for an axis
        of length `num_classes`.
    num_classes: int
        Width of the encoding (number of columns in the result).
    dtype:
        dtype of the returned array. Defaults to the builtin `int`, which is
        what the removed `np.int` alias used to resolve to.

    Returns
    -------
    np.ndarray
        Array of shape (len(arr), num_classes) with a single 1 per row.

    Raises
    ------
    AssertionError
        If `arr` is not one-dimensional.
    """
    # `np.int` was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # `int` is the exact type the alias pointed to.
    arr = np.asarray(arr, dtype=int)
    assert len(arr.shape) == 1, "dims other than 1 not implemented"
    onehot_arr = np.zeros(arr.shape + (num_classes,), dtype=dtype)
    # Fancy indexing sets element (row i, column arr[i]) for every row at once.
    onehot_arr[np.arange(arr.shape[0]), arr] = 1
    return onehot_arr
class FragmentEnvContext(GraphEnvContext):
    """A specification of what is being generated for a GraphBuildingEnv.

    This context specifies how to create molecules fragment by fragment as encoded by a junction tree.
    Fragments are obtained from the original GFlowNet paper, Bengio et al., 2021.
    """

    def __init__(
        self,
        max_frags: int = 8,
        num_cond_dim: int = 0,
        frags_filepath: Union[str, Path] = here()
        / "data/processed/bengio_2021_fragments.txt",
        device: str = "cpu",
    ):
        """Construct a fragment environment

        Parameters
        ----------
        max_frags: int
            The maximum number of fragments the agent is allowed to insert.
        num_cond_dim: int
            The dimensionality of the observations' conditional information vector (if >0)
        frags_filepath: str
            The file containing the fragments available to the agent to construct molecules with
            (one SMILES string per line). Defaults to Bengio et al.'s (2021) original GFlowNet
            paper's fragments
        device: str
            The device to process the data on, can be either 'cpu' or 'cuda'. Defaults to 'cpu'
        """
        self.max_frags = max_frags
        # Use a context manager so the file handle is closed deterministically.
        with open(frags_filepath, "r") as frags_file:
            self.frags_smi = frags_file.read().splitlines()
        self.frags_mol = [Chem.MolFromSmiles(i) for i in self.frags_smi]
        # A "stem" is any atom that still carries at least one hydrogen, i.e.
        # an atom another fragment can be bonded to.
        self.frags_stems = [
            [
                atom_idx
                for atom_idx in range(m.GetNumAtoms())
                if m.GetAtomWithIdx(atom_idx).GetTotalNumHs() > 0
            ]
            for m in self.frags_mol
        ]
        self.frags_num_atoms = [m.GetNumAtoms() for m in self.frags_mol]
        self.num_stem_acts = most_stems = max(map(len, self.frags_stems))
        # Flat enumeration of every (fragment, stem) pair.
        self.action_map = [
            (frag_idx, stem_idx)
            for frag_idx in range(len(self.frags_stems))
            for stem_idx in range(len(self.frags_stems[frag_idx]))
        ]
        self.num_actions = len(self.action_map)

        # These values are used by Models to know how many inputs/logits to produce
        self.num_new_node_values = len(self.frags_smi)
        self.num_node_attr_logits = 0
        # One extra node feature flags the empty graph (see graph_to_data).
        self.num_node_dim = len(self.frags_smi) + 1
        # Each edge has one attachment choice per endpoint, hence the factor 2.
        self.num_edge_attr_logits = most_stems * 2
        self.num_edge_dim = most_stems * 2
        self.num_cond_dim = num_cond_dim
        self.num_stop_logits = 1

        # Order in which models have to output logits
        self.action_type_order = [
            ActionType.STOP,
            ActionType.ADD_NODE,
            ActionType.SET_EDGE_ATTR,
        ]
        self.device = torch.device(device)

    def idx_to_action(self, g: Data, action_idx: ActionIndex):
        """Translate an action index (e.g. from a GraphActionCategorical) to a GraphAction

        Parameters
        ----------
        g: Data
            The graph object on which this action would be applied.
        action_idx: ActionIndex
            A triple describing the type of action, and the corresponding row and column index for
            the corresponding Categorical matrix.

        Returns
        -------
        action: Action
            A graph action whose type is one of STOP, ADD_NODE, or SET_EDGE_ATTR.

        Raises
        ------
        ValueError
            If the decoded action type is not one of the three supported types.
        """
        act_type, act_row, act_col = [int(i) for i in action_idx]
        t = self.action_type_order[act_type]
        if t is ActionType.STOP:
            return Action(t)
        elif t is ActionType.ADD_NODE:
            return Action(t, source=act_row, value=act_col)
        elif t is ActionType.SET_EDGE_ATTR:
            a, b = g.edge_index[
                :, act_row * 2
            ]  # Edges are duplicated to get undirected GNN, deduplicated for logits
            # The first `num_stem_acts` columns address the source endpoint's
            # attachment point, the remaining ones the target endpoint's.
            if act_col < self.num_stem_acts:
                attr = f"{int(a)}_attach"
                val = act_col
            else:
                attr = f"{int(b)}_attach"
                val = act_col - self.num_stem_acts
            return Action(t, source=a.item(), target=b.item(), attr=attr, value=val)
        # Mirror action_to_idx: fail loudly instead of implicitly returning None.
        raise ValueError(f"Action type '{t}' is unsupported.")

    def action_to_idx(self, g: Data, action: Action) -> ActionIndex:
        """Translate a GraphAction to an index tuple

        Parameters
        ----------
        g: Data
            The graph object on which this action would be applied.
        action: Action
            A graph action whose type is one of Stop, AddNode, or SetEdgeAttr.

        Returns
        -------
        action_idx: ActionIndex
            A triple describing the type of action, and the corresponding row and column index for
            the corresponding Categorical matrix.

        Raises
        ------
        ValueError
            If the action's type is not supported by this context.
        """
        if action.act_type is ActionType.STOP:
            row = col = 0
        elif action.act_type is ActionType.ADD_NODE:
            row = action.source
            col = action.value
        elif action.act_type is ActionType.SET_EDGE_ATTR:
            # Here the edges are duplicated, both (i,j) and (j,i) are in edge_index
            # so no need for a double check.
            # NOTE(review): if the edge is absent, argmax over all-zero matches
            # yields 0 and row 0 is returned silently -- confirm callers never
            # pass a non-existent edge.
            row = (
                (g.edge_index.T == torch.tensor([(action.source, action.target)]))
                .prod(1)
                .argmax()
            )
            # Because edges are duplicated but logits aren't, divide by two
            row = row.div(2, rounding_mode="floor")  # type: ignore
            if action.attr == f"{int(action.source)}_attach":
                col = action.value
            else:
                col = action.value + self.num_stem_acts
        else:
            raise ValueError(f"Action type '{action.act_type}' is unsupported.")
        type_idx = self.action_type_order.index(action.act_type)
        return type_idx, int(row), int(col)

    def graph_to_data(self, g: Graph) -> Data:
        """Convert a networkx Graph to a torch geometric Data instance

        Parameters
        ----------
        g: Graph
            A Graph object representing a fragment junction tree

        Returns
        -------
        data: Data
            The corresponding torch_geometric object, carrying the extra
            `add_node_mask` and `set_edge_attr_mask` tensors.
        """
        # An empty graph is still given one (placeholder) node row.
        x = torch.zeros((max(1, len(g.nodes)), self.num_node_dim))
        # The last feature column flags the empty graph.
        x[0, -1] = len(g.nodes) == 0
        for i, n in enumerate(g.nodes):
            x[i, g.nodes[n]["v"]] = 1
        edge_attr = torch.zeros((len(g.edges) * 2, self.num_edge_dim))
        set_edge_attr_mask = torch.zeros((len(g.edges), self.num_edge_attr_logits))
        for i, e in enumerate(g.edges):
            ad = g.edges[e]
            for n, offset in zip(e, [0, self.num_stem_acts]):
                idx = ad.get(f"{int(n)}_attach", 0) + offset
                # Both directed copies of the edge share the same attributes.
                edge_attr[i * 2, idx] = 1
                edge_attr[i * 2 + 1, idx] = 1
                # An endpoint whose attachment is not chosen yet may still be
                # set: unmask one logit per stem of that endpoint's fragment.
                if f"{int(n)}_attach" not in ad:
                    set_edge_attr_mask[
                        i, offset : offset + len(self.frags_stems[g.nodes[n]["v"]])
                    ] = 1
        edge_index = (
            torch.tensor(
                [e for i, j in g.edges for e in [(i, j), (j, i)]], dtype=torch.long
            )
            .reshape((-1, 2))
            .T
        )
        # Adding nodes is masked out once the graph holds exactly max_frags rows.
        if x.shape[0] == self.max_frags:
            add_node_mask = torch.zeros((x.shape[0], 1))
        else:
            add_node_mask = torch.ones((x.shape[0], 1))

        return Data(
            x,
            edge_index,
            edge_attr,
            add_node_mask=add_node_mask,
            set_edge_attr_mask=set_edge_attr_mask,
        )

    def collate_fn(self, graphs: List[Data]) -> Batch:
        """Batch Data instances

        Parameters
        ----------
        graphs: List[gd.Data]
            A list of gd.Data objects (e.g. given by graph_to_data).

        Returns
        -------
        batch: gd.Batch
            A torch_geometric Batch object
        """
        return Batch.from_data_list(graphs, follow_batch=["edge_index"])

    def mol_to_graph(self, mol) -> Graph:
        """Convert an RDMol to a Graph (not implemented for fragment graphs)."""
        raise NotImplementedError()

    def graph_to_mol(self, g: Graph) -> Mol:
        """Convert a Graph to an RDKit molecule

        Parameters
        ----------
        g: Graph
            A Graph instance representing a fragment junction tree.

        Returns
        -------
        m: Mol
            The corresponding RDKit molecule
        """
        # offsets[i] is the index of the first atom of the i-th fragment in the
        # combined molecule.
        # NOTE(review): indexing offsets by node id below assumes node ids are
        # 0..n-1 in iteration order -- confirm against the environment.
        offsets = np.cumsum([0] + [self.frags_num_atoms[g.nodes[i]["v"]] for i in g])
        mol = None
        for i in g.nodes:
            if mol is None:
                mol = self.frags_mol[g.nodes[i]["v"]]
            else:
                mol = Chem.CombineMols(mol, self.frags_mol[g.nodes[i]["v"]])

        mol = Chem.EditableMol(mol)
        bond_atoms = []
        for a, b in g.edges:
            frag_a = g.nodes[a]["v"]
            frag_b = g.nodes[b]["v"]
            # Resolve each endpoint's chosen stem (default: first stem) to an
            # atom index in the combined molecule.
            u, v = (
                int(
                    self.frags_stems[frag_a][g.edges[(a, b)].get(f"{a}_attach", 0)]
                    + offsets[a]
                ),
                int(
                    self.frags_stems[frag_b][g.edges[(a, b)].get(f"{b}_attach", 0)]
                    + offsets[b]
                ),
            )
            bond_atoms += [u, v]
            mol.AddBond(u, v, Chem.BondType.SINGLE)
        # EditableMol.GetMol takes no arguments; passing one does not match
        # the wrapped C++ signature.
        mol = mol.GetMol()

        def _pop_hydrogen_atom(atom_idx: int) -> None:
            # Each new bond uses up one explicit hydrogen on the bonded atom.
            atom = mol.GetAtomWithIdx(atom_idx)
            nh = atom.GetNumExplicitHs()
            if nh > 0:
                atom.SetNumExplicitHs(nh - 1)

        for atom_idx in bond_atoms:
            _pop_hydrogen_atom(atom_idx)
        return mol

    def is_valid_graph(self, g: Graph) -> bool:
        """Verifies whether the given Graph is valid according to RDKit

        Returns
        -------
        bool
            True when the graph converts to a molecule whose SMILES can be
            parsed back by RDKit, False otherwise.
        """
        try:
            mol = self.graph_to_mol(g)
            smiles = Chem.MolToSmiles(mol)
        except Exception:
            # A graph that cannot even be assembled or serialised is invalid;
            # RDKit signals this with several unrelated exception types.
            return False
        # Report failure through the return value rather than an `assert`,
        # which would raise (or be stripped under `python -O`).
        return Chem.MolFromSmiles(smiles) is not None
26 | """ 27 | 28 | def __init__( 29 | self, 30 | dataset: Dataset, 31 | model: nn.Module, 32 | batch_size: int, 33 | ctx: GraphEnvContext, 34 | algo: GFNAlgorithm, 35 | task: GFNTask, 36 | device: Device, 37 | offline_ratio: float = 0.5, 38 | stream: bool = True, 39 | log_dir: Optional[str] = None, 40 | ): 41 | """Parameters 42 | ---------- 43 | dataset: Dataset 44 | A dataset instance 45 | model: nn.Module 46 | The model we sample from (must be on CUDA already or share_memory() must be called so that 47 | parameters are synchronized between each worker) 48 | batch_size: int 49 | The number of trajectories, each trajectory will be composed of many graphs, so this is 50 | _not_ the batch size in terms of the number of graphs (that will depend on the task) 51 | algo: 52 | The training algorithm, e.g. a TrajectoryBalance instance 53 | task: ConditionalTask 54 | offline_ratio: float 55 | The offline_ratio of offline trajectories in the batch. 56 | stream: bool 57 | If True, data is sampled iid for every batch. Otherwise, this is a normal in-order 58 | dataset iterator. 59 | log_dir: str 60 | If not None, logs each SamplingIterator worker's generated molecules to that file. 61 | """ 62 | self.data = dataset 63 | self.model = model 64 | self.batch_size = batch_size 65 | self.offline_batch_size = int(np.ceil(batch_size * offline_ratio)) 66 | self.online_batch_size = int(np.floor(batch_size * (1 - offline_ratio))) 67 | self.offline_ratio = offline_ratio 68 | self.ctx = ctx 69 | self.algo = algo 70 | self.task = task 71 | self.device = device 72 | self.stream = stream 73 | self.log_dir = log_dir if self.offline_ratio < 1 and self.stream else None 74 | # This SamplingIterator instance will be copied by torch DataLoaders for each worker, so we 75 | # don't want to initialize per-worker things just yet, such as the log the worker writes 76 | # to. 
def _idx_iterator(self):
    """Yield arrays of dataset indices, one array per batch.

    In streaming mode (`self.stream`), endlessly yields
    `self.offline_batch_size` indices drawn uniformly from
    `[0, len(self.data))` using `self.rng`.

    Otherwise the dataset is walked in order. With DataLoader workers, each
    worker walks only its own contiguous shard. Batches hold
    `self.offline_batch_size` indices, except possibly the last one of a
    shard, which holds the remainder. An empty dataset yields one empty array.
    """
    RDLogger.DisableLog("rdApp.*")
    if self.stream:
        # If we're streaming data, just sample `offline_batch_size` indices
        while True:
            yield self.rng.integers(0, len(self.data), self.offline_batch_size)
    else:
        # Otherwise, figure out which indices correspond to this worker
        worker_info = torch.utils.data.get_worker_info()
        n = len(self.data)
        if n == 0:
            yield np.arange(0, 0)
            return
        if worker_info is None:
            start, end = 0, n
        else:
            nw = worker_info.num_workers
            wid = worker_info.id
            # NOTE(review): floor/ceil bounds let neighbouring shards share an
            # index when n is not divisible by the worker count.
            start = int(np.floor(n / nw * wid))
            end = int(np.ceil(n / nw * (wid + 1)))
        bs = self.offline_batch_size
        # One pass covers full batches and the trailing remainder alike. The
        # previous two-step form left its loop variable unbound when the shard
        # was exactly one batch long and crashed with UnboundLocalError.
        for i in range(start, end, bs):
            yield np.arange(i, min(i + bs, end))
133 | num_offline = idcs.shape[0] # This is in [0, self.offline_batch_size] 134 | # Sample conditional info such as temperature, trade-off weights, etc. 135 | 136 | cond_info = self.task.sample_conditional_information( 137 | num_offline + self.online_batch_size 138 | ) 139 | is_valid = torch.ones(cond_info["beta"].shape[0]).bool() 140 | 141 | # Sample some dataset data 142 | mols, flat_rewards = ( 143 | map(list, zip(*[self.data[i] for i in idcs])) if len(idcs) else ([], []) 144 | ) 145 | flat_rewards = list( 146 | self.task.flat_reward_transform(torch.tensor(flat_rewards)) 147 | ) 148 | graphs = [self.ctx.mol_to_graph(m) for m in mols] 149 | trajs = self.algo.create_training_data_from_graphs(graphs) 150 | # Sample some on-policy data 151 | if self.online_batch_size > 0: 152 | with torch.no_grad(): 153 | trajs += self.algo.create_training_data_from_own_samples( 154 | self.model, 155 | self.online_batch_size, 156 | cond_info["encoding"][num_offline:], 157 | ) 158 | if self.algo.bootstrap_own_reward: 159 | # The model can be trained to predict its own reward, 160 | # i.e. 
predict the output of cond_info_to_reward 161 | pred_reward = [ 162 | i["reward_pred"].cpu().item() for i in trajs[num_offline:] 163 | ] 164 | flat_rewards += pred_reward 165 | else: 166 | # Otherwise, query the task for flat rewards 167 | valid_idcs = torch.tensor( 168 | [ 169 | i + num_offline 170 | for i in range(self.online_batch_size) 171 | if trajs[i + num_offline]["is_valid"] 172 | ] 173 | ).long() 174 | # fetch the valid trajectories endpoints 175 | mols = [ 176 | self.ctx.graph_to_mol(trajs[i]["traj"][-1][0]) 177 | for i in valid_idcs 178 | ] 179 | # ask the task to compute their reward 180 | preds, m_is_valid = self.task.compute_flat_rewards(mols) 181 | # The task may decide some mols are invalid, we have to again filter those 182 | valid_idcs = valid_idcs[m_is_valid] 183 | pred_reward = torch.zeros((self.online_batch_size, preds.shape[1])) 184 | pred_reward[valid_idcs - num_offline] = preds 185 | # if preds.shape[0] > 0: 186 | # for i in range(self.number_of_objectives): 187 | # pred_reward[valid_idcs - num_offline, i] = preds[range(preds.shape[0]), i] 188 | is_valid[num_offline:] = False 189 | is_valid[valid_idcs] = True 190 | flat_rewards += list(pred_reward) 191 | # Override the is_valid key in case the task made some mols invalid 192 | for i in range(self.online_batch_size): 193 | trajs[num_offline + i]["is_valid"] = is_valid[ 194 | num_offline + i 195 | ].item() 196 | flat_rewards = torch.stack(flat_rewards) 197 | # Compute scalar rewards from conditional information & flat rewards 198 | rewards = self.task.cond_info_to_reward(cond_info, flat_rewards) 199 | rewards[torch.logical_not(is_valid)] = np.exp( 200 | self.algo.illegal_action_logreward 201 | ) 202 | # Construct batch 203 | batch = self.algo.construct_batch(trajs, cond_info["encoding"], rewards) 204 | batch.num_offline = num_offline 205 | batch.num_online = self.online_batch_size 206 | batch.flat_rewards = flat_rewards 207 | batch.mols = mols 208 | 209 | if self.online_batch_size > 0 and 
def log_generated(self, trajs, rewards, flat_rewards, cond_info):
    """Assemble one log row per generated trajectory.

    Parameters
    ----------
    trajs:
        Sequence of trajectory dicts; each is read through its "is_valid" flag
        and, when valid, its final graph at `["traj"][-1][0]`.
    rewards: torch.Tensor
        Scalar reward per trajectory.
    flat_rewards: torch.Tensor
        Flat reward(s) per trajectory; flattened to one row per trajectory.
    cond_info: dict
        Conditional information tensors, indexed per trajectory. "encoding"
        is not logged; "preferences" is logged as separate `pref_*` columns.

    Notes
    -----
    The rows and column labels are built but not persisted: the logging
    backend call is still commented out, so this method returns None.
    """
    if len(trajs) == 0:
        # Nothing to log; the reshape and `[0]` look-ups below need >= 1 row.
        return

    mols = [
        Chem.MolToSmiles(self.ctx.graph_to_mol(trajs[i]["traj"][-1][0]))
        if trajs[i]["is_valid"]
        else ""
        for i in range(len(trajs))
    ]

    # `.detach().cpu()` makes the conversion work for tensors on any device
    # and with or without gradient tracking.
    flat_rewards = (
        flat_rewards.reshape((len(flat_rewards), -1)).detach().cpu().tolist()
    )
    rewards = rewards.detach().cpu().tolist()
    preferences = (
        cond_info.get("preferences", torch.zeros((len(mols), 0)))
        .detach()
        .cpu()
        .tolist()
    )
    logged_keys = [
        k for k in sorted(cond_info.keys()) if k not in ["encoding", "preferences"]
    ]

    data = [
        [mols[i], rewards[i]]
        + flat_rewards[i]
        + preferences[i]
        + [cond_info[k][i].item() for k in logged_keys]
        for i in range(len(trajs))
    ]
    data_labels = (
        ["smi", "r"]
        + [f"fr_{i}" for i in range(len(flat_rewards[0]))]
        + [f"pref_{i}" for i in range(len(preferences[0]))]
        + [f"ci_{k}" for k in logged_keys]
    )
    # self.log.insert_many(data, data_labels)
def pareto_frontier(obj_vals: "NDArray", maximize: bool = True):
    """
    Compute the Pareto frontier of a set of candidate solutions.

    Parameters
    ----------
    obj_vals: NDArray
        NumPy array of objective values, one candidate per row.
    maximize: bool
        If True (default), larger objective values are better. If False, the
        objectives are negated first so that smaller values are better.

    Returns
    -------
    NDArray
        The rows of `obj_vals` that are not dominated by any other row.
    """
    # The annotation is quoted because `NDArray` is imported only under
    # TYPE_CHECKING; an unquoted name is evaluated when the function is
    # defined and raises NameError on import.
    objectives = torch.from_numpy(obj_vals)
    if not maximize:
        # pareto utility assumes maximization
        objectives = -objectives
    pareto_mask = pareto.is_non_dominated(objectives)
    # Index with a NumPy boolean mask instead of relying on implicit
    # tensor-to-array conversion.
    return obj_vals[pareto_mask.numpy()]
def sharpe_ratio(p, Q, x, rf):
    """Compute the Sharpe ratio.

    Returns the Sharpe ratio given the expected return vector, p,
    the covariance matrix, Q, the investment column vector, x, and
    the return of the riskless asset, rf.

    Parameters
    ----------
    p : ndarray
        Expected return vector (of size n).
    Q : ndarray
        Covariance (n,n)-matrix.
    x : ndarray
        Investment vector of size (n,1). The sum of which should be 1.
    rf : float
        Return of a riskless asset.

    Returns
    -------
    sr : float or ndarray
        The HSR value: `(x^T p - rf) / sqrt(x^T Q x)`. For a column vector
        `x` of shape (n,1) this is a size-1 array.
    """
    excess_return = x.T.dot(p) - rf
    # `x^T Q x` is a size-1 array for a column vector `x`. Extract the scalar
    # explicitly: implicit conversion of a non-0-d array to a Python float
    # (what `math.sqrt(array)` relies on) is deprecated since NumPy 1.25.
    variance = np.asarray(x.T.dot(Q).dot(x)).item()
    return excess_return / math.sqrt(variance)
173 | """ 174 | y = _sharpe_ratio_qp_max(p, Q, rf) 175 | x = y / y.sum() 176 | x = np.where(x > 1e-9, x, 0) 177 | sr = sharpe_ratio(p, Q, x, rf) 178 | return sr, x 179 | 180 | 181 | # Assumes that l <= A << u 182 | # Assumes A, l, u are numpy arrays 183 | def _expected_return(A, low, up): 184 | """ 185 | Returns the expected return (computed as defined by the HSR indicator), as a 186 | column vector. 187 | """ 188 | A = np.array(A, dtype=np.double) # because of division operator in python 2.7 189 | return ((up - A).prod(axis=-1)) / ((up - low).prod()) 190 | 191 | 192 | def _covariance(A, low, up, p=None): 193 | """ Returns the covariance matrix (computed as defined by the HSR indicator). """ 194 | p = _expected_return(A, low, up) if p is None else p 195 | Pmax = np.maximum(A[:, np.newaxis, :], A[np.newaxis, ...]) 196 | P = _expected_return(Pmax, low, up) 197 | 198 | Q = P - p[:, np.newaxis] * p[np.newaxis, :] 199 | return Q 200 | 201 | 202 | def _argunique(pts): 203 | """ Find the unique points of a matrix. Returns their indexes. """ 204 | ix = np.lexsort(pts.T) 205 | diff = (pts[ix][1:] != pts[ix][:-1]).any(axis=1) 206 | un = np.ones(len(pts), dtype=bool) 207 | un[ix[1:]] = diff 208 | return un 209 | 210 | 211 | def HSRindicator(A, low, up, managedup=False): 212 | """ 213 | Compute the HSR indicator of the point set A given reference points l and u. 214 | Returns the HSR value of A given l and u, and returns the optimal investment. 215 | By default, points in A are assumed to be unique. 216 | Tip: Either ensure that A does not contain duplicated points 217 | (for example, remove them previously and then split the 218 | investment between the copies as you wish), or set the flag 219 | 'managedup' to True. 220 | Parameters 221 | ---------- 222 | A : ndarray 223 | Input matrix (n,d) with n points and d dimensions. 224 | low : array_like 225 | Lower reference point. 226 | up : array_like 227 | Upper reference point. 
class HSR_Calculator:
    """Convenience wrapper that evaluates the HSR indicator between two fixed reference points."""

    def __init__(self, lower_bound, upper_bound, max_obj_bool=None):
        '''
        Class to calculate HSR Indicator with assumption that assumes a maximization on all objectives.

        Parameters
        ----------
        lower_bound : array_like
            Lower reference point.
        upper_bound : array_like
            Upper reference point.
        max_obj_bool : iterable of int, optional
            Column indices of the objectives for which maximization is not the
            case; those columns are replaced by their reciprocal before the
            indicator is computed. None (default) leaves solutions untouched.
        '''
        self.lower_bound = lower_bound
        self.upper_bound = upper_bound
        self.max_obj_bool = max_obj_bool

    def reset_lower_bound(self, lower_bound):
        """Replace the lower reference point."""
        self.lower_bound = lower_bound

    def reset_upper_bound(self, upper_bound):
        """Replace the upper reference point."""
        self.upper_bound = upper_bound

    def make_max_problem(self, matrix):
        """Return `matrix` with the columns listed in `max_obj_bool` replaced by their reciprocal.

        The input is never modified. When `max_obj_bool` is None the input
        object itself is returned unchanged.
        """
        if self.max_obj_bool is None:
            return matrix

        # Work on a floating-point copy: NumPy refuses to raise integer arrays
        # to a negative power, and the caller's data must stay untouched.
        max_matrix = np.array(matrix)
        if not np.issubdtype(max_matrix.dtype, np.floating):
            max_matrix = max_matrix.astype(float)

        for dim in self.max_obj_bool:
            # Invert only the selected column. Assigning the reciprocal of the
            # whole matrix to one column is a shape mismatch.
            max_matrix[:, dim] = max_matrix[:, dim] ** -1

        return max_matrix

    def calculate_hsr(self, solutions):
        """Compute the HSR indicator of `solutions`.

        Returns
        -------
        hsr_indicator, hsr_invest
            The pair returned by `HSRindicator` for the (possibly transformed)
            solutions and the stored reference points.
        """
        max_solutions = self.make_max_problem(solutions)

        hsr_indicator, hsr_invest = HSRindicator(A=max_solutions, low=self.lower_bound, up=self.upper_bound)

        return hsr_indicator, hsr_invest
def get_topk(rewards, k):
    """
    Average of the K largest rewards, averaged over preferences.

    Parameters
    ----------
    rewards : torch.Tensor
        Rewards obtained after taking the convex combination.
        Shape: number_of_preferences x number_of_samples. A 1-D tensor is
        treated as the samples of a single preference.
    k : int
        Top-K value. If fewer than K samples are available, all are used.

    Returns
    ----------
    average top-K rewards across all preferences (0-d tensor)
    """
    if len(rewards.shape) < 2:
        # Samples live on axis 1, so a flat vector becomes one preference row.
        rewards = torch.unsqueeze(rewards, 0)
    # Sort descending so the leading K columns are the K largest rewards; an
    # ascending sort would select the K smallest instead.
    sorted_rewards = torch.sort(rewards, 1, descending=True).values
    topk_rewards = sorted_rewards[:, :k]
    mean_topk = torch.mean(topk_rewards.mean(-1))
    return mean_topk
class TrajectoryBalance(GFNAlgorithm):
    """Trajectory Balance (TB) objective for a GFlowNet over fragment graphs.

    Bundles the three steps visible in this class: sampling trajectories from
    the current model, collating trajectories into a batch, and computing the
    TB loss (plus an optional reward-bootstrap loss) for that batch.
    """

    def __init__(
        self,
        env: FragmentEnv,
        ctx: FragmentEnvContext,
        rng: np.random.RandomState,
        hps: Dict[str, Any],
        max_len=None,
        max_nodes=None,
    ):
        """TB implementation, see
        "Trajectory Balance: Improved Credit Assignment in GFlowNets Nikolay Malkin, Moksh Jain,
        Emmanuel Bengio, Chen Sun, Yoshua Bengio"
        https://arxiv.org/abs/2201.13259
        Hyperparameters used:
        random_action_prob: float, probability of taking a uniform random action when sampling
        illegal_action_logreward: float, log(R) given to the model for non-sane end states or illegal actions
        bootstrap_own_reward: bool, if True, uses the .reward batch data to predict rewards for sampled data
        tb_epsilon: float, if not None, adds this epsilon in the numerator and denominator of the log-ratio
        reward_loss_multiplier: float, multiplying constant for the bootstrap loss.
        Parameters
        ----------
        env: FragmentEnv
            A graph environment.
        ctx: FragmentEnvContext
            A context.
        rng: np.random.RandomState
            rng used to take random actions
        hps: Dict[str, Any]
            Hyperparameter dictionary, see above for used keys.
        max_len: int
            If not None, ends trajectories of more than max_len steps.
        max_nodes: int
            If not None, ends trajectories of graphs with more than max_nodes nodes (illegal action).
        """
        self.ctx = ctx
        self.env = env
        self.rng = rng
        self.max_len = max_len
        self.max_nodes = max_nodes
        self.random_action_prob = hps["random_action_prob"]
        self.illegal_action_logreward = hps["illegal_action_logreward"]
        self.bootstrap_own_reward = hps["bootstrap_own_reward"]
        self.sanitize_samples = True
        self.epsilon = hps["tb_epsilon"]
        self.reward_loss_multiplier = hps["reward_loss_multiplier"]
        # Experimental flags
        self.reward_loss_is_mae = True
        self.tb_loss_is_mae = False
        self.tb_loss_is_huber = False
        self.mask_invalid_rewards = False
        self.length_normalize_losses = False
        self.reward_normalize_losses = False
        self.sample_temp = 1

    def _corrupt_actions(self, actions: List[ActionIndex], cat: ActionCategorical):
        """Sample from the uniform policy with probability `self.random_action_prob`

        Mutates ``actions`` in place: each entry is independently replaced,
        with probability ``self.random_action_prob``, by a ``(type, row, col)``
        triple drawn uniformly over all logit entries that belong to that
        graph in ``cat``.
        """
        # Should this be a method of GraphActionCategorical?
        if self.random_action_prob <= 0:
            return
        (corrupted,) = (
            self.rng.uniform(size=len(actions)) < self.random_action_prob
        ).nonzero()
        for i in corrupted:
            # Number of rows of each logit tensor that belong to graph i
            n_in_batch = [int((b == i).sum()) for b in cat.batch]
            # Number of individual actions (rows * columns) per action type
            n_each = np.array(
                [
                    float(logit.shape[1]) * nb
                    for logit, nb in zip(cat.logits, n_in_batch)
                ]
            )
            # Pick the type proportionally to its action count, so that every
            # single action is equally likely overall
            which = self.rng.choice(len(n_each), p=n_each / n_each.sum())
            row = self.rng.choice(n_in_batch[which])
            col = self.rng.choice(cat.logits[which].shape[1])
            actions[i] = (which, row, col)

    def create_training_data_from_own_samples(
        self, model: SamplingModel, n: int, cond_info: torch.Tensor
    ):
        """Sample ``n`` trajectories from ``model`` and package them as training data.

        Parameters
        ----------
        model: SamplingModel
            Called as ``model(batch, cond_info)`` to obtain an action
            categorical and log-reward predictions; must also expose
            ``log_z(cond_info)``.
        n: int
            Number of trajectories to sample.
        cond_info: torch.Tensor
            Conditioning information, indexed per trajectory along dim 0.

        Returns
        -------
        data: List[Dict]
            One dict per trajectory with keys ``traj`` (list of
            ``(graph, action)`` pairs), ``reward_pred``, ``is_valid``,
            ``log_z``, ``fwd_logprob`` and ``bck_logprob``.
        """
        ctx = self.ctx
        env = self.env
        dev = self.ctx.device
        cond_info = cond_info.to(dev)
        log_z_pred = model.log_z(cond_info)
        # This will be returned as training data
        data = [{"traj": [], "reward_pred": None, "is_valid": True} for _ in range(n)]
        # Let's also keep track of trajectory statistics according to the model
        zero = torch.tensor([0], device=dev).float()
        fwd_logprob: List[List[torch.Tensor]] = [[] for _ in range(n)]
        bck_logprob: List[List[torch.Tensor]] = [
            [zero] for _ in range(n)
        ]  # zero in case there is a single invalid action

        graphs = [env.new() for _ in range(n)]
        done = [False] * n

        def not_done(lst):
            return [e for i, e in enumerate(lst) if not done[i]]

        # TODO report these stats:
        mol_too_big = 0
        mol_not_sane = 0
        invalid_act = 0
        logprob_of_illegal: List[torch.Tensor] = []

        illegal_action_logreward = torch.tensor(
            [self.illegal_action_logreward], device=dev
        )
        if self.epsilon is not None:
            epsilon = torch.tensor([self.epsilon], device=dev).float()
        for t in range(self.max_len) if self.max_len is not None else count(0):
            # Construct graphs for the trajectories that aren't yet done
            torch_graphs = [ctx.graph_to_data(i) for i in not_done(graphs)]
            not_done_mask = torch.tensor(done, device=dev).logical_not()
            # Forward pass to get ActionCategorical
            fwd_cat, log_reward_preds = model(
                ctx.collate_fn(torch_graphs).to(dev), cond_info[not_done_mask]
            )
            if self.sample_temp != 1:
                # Sample from tempered logits, but score with the untempered ones
                sample_cat = copy.copy(fwd_cat)
                sample_cat.logits = [i / self.sample_temp for i in fwd_cat.logits]
                actions = sample_cat.sample()
            else:
                actions = fwd_cat.sample()
            self._corrupt_actions(actions, fwd_cat)
            graph_actions = [
                ctx.idx_to_action(g, a) for g, a in zip(torch_graphs, actions)
            ]
            log_probs = fwd_cat.log_probability(actions)
            # i indexes the trajectory, j its position among the not-done ones.
            # not_done(...) is evaluated once here, before `done` is mutated below.
            for i, j in zip(not_done(range(n)), range(n)):
                # Step each trajectory, and accumulate statistics
                fwd_logprob[i].append(log_probs[j].unsqueeze(0))
                data[i]["traj"].append((graphs[i], graph_actions[j]))
                # Check if we're done
                if graph_actions[j].act_type is ActionType.STOP or (
                    self.max_len is not None and t == self.max_len - 1
                ):
                    done[i] = True
                    if self.sanitize_samples and not ctx.is_valid_graph(graphs[i]):
                        # check if the graph is sane (e.g. RDKit can
                        # construct a molecule from it) otherwise
                        # treat the done action as illegal
                        mol_not_sane += 1
                        data[i]["reward_pred"] = illegal_action_logreward.exp()
                        data[i]["is_valid"] = False
                    elif self.bootstrap_own_reward:
                        # if we're bootstrapping, extract reward prediction
                        data[i]["reward_pred"] = log_reward_preds[j].detach().exp()
                else:  # If not done, try to step the environment
                    gp = graphs[i]
                    try:
                        # env.step can raise AssertionError if the action is illegal
                        gp = env.step(graphs[i], graph_actions[j])
                        if self.max_nodes is not None:
                            assert len(gp.nodes) <= self.max_nodes
                    except AssertionError:
                        # max_nodes may be None (no size limit); comparing an
                        # int against None would raise TypeError, so guard it.
                        if (
                            self.max_nodes is not None
                            and len(gp.nodes) > self.max_nodes
                        ):
                            mol_too_big += 1
                        else:
                            invalid_act += 1
                        done[i] = True
                        data[i]["reward_pred"] = illegal_action_logreward.exp()
                        data[i]["is_valid"] = False
                        continue
                    # Add to the trajectory
                    # P_B = uniform backward
                    n_back = env.count_backward_transitions(gp)
                    bck_logprob[i].append(torch.tensor([1 / n_back], device=dev).log())
                    graphs[i] = gp
            if all(done):
                break

        for i in range(n):
            # If we're not bootstrapping, we could query the reward
            # model here, but this is expensive/impractical.
            # Instead, just report forward and backward flows
            data[i]["log_z"] = log_z_pred[i].item()
            data[i]["fwd_logprob"] = sum(fwd_logprob[i])
            data[i]["bck_logprob"] = sum(bck_logprob[i])
            if self.bootstrap_own_reward and False:  # TODO: verify
                if not data[i]["is_valid"]:
                    logprob_of_illegal.append(data[i]["fwd_logprob"].item())
                # If we are bootstrapping, we can report the theoretical loss as well
                numerator = data[i]["fwd_logprob"] + log_z_pred[i]
                denominator = data[i]["bck_logprob"] + data[i]["reward_pred"].log()
                if self.epsilon is not None:
                    numerator = torch.logaddexp(numerator, epsilon)
                    denominator = torch.logaddexp(denominator, epsilon)
                data[i]["loss"] = (numerator - denominator).pow(2)
        return data

    def construct_batch(self, trajs: List[Trajectory], cond_info, rewards):
        """Construct a batch from a list of trajectories and their information
        Parameters
        ----------
        trajs: List[Trajectory]
            A list of N trajectories.
        cond_info: Tensor
            The conditional info that is considered for each trajectory. Shape (N, n_info)
        rewards: Tensor
            The transformed reward (e.g. R(x) ** beta) for each trajectory. Shape (N,)
        Returns
        -------
        batch: gd.Batch
            A (CPU) Batch object with relevant attributes added
        """
        # Flatten all (graph, action) steps of all trajectories, in order
        torch_graphs = [
            self.ctx.graph_to_data(i[0]) for tj in trajs for i in tj["traj"]
        ]
        actions = [
            self.ctx.action_to_idx(g, a)
            for g, a in zip(torch_graphs, [i[1] for tj in trajs for i in tj["traj"]])
        ]
        num_backward = torch.tensor(
            [
                # Count the number of backward transitions from s_{t+1},
                # unless t+1 = T is the last time step
                self.env.count_backward_transitions(tj["traj"][i + 1][0])
                if i + 1 < len(tj["traj"])
                else 1
                for tj in trajs
                for i in range(len(tj["traj"]))
            ]
        )
        batch = self.ctx.collate_fn(torch_graphs)
        batch.traj_lens = torch.tensor([len(i["traj"]) for i in trajs])
        batch.num_backward = num_backward
        batch.actions = torch.tensor(actions)
        batch.rewards = rewards
        batch.cond_info = cond_info
        # Trajectories without an explicit flag are considered valid
        batch.is_valid = torch.tensor([i.get("is_valid", True) for i in trajs]).float()
        return batch

    def compute_batch_losses(
        self, model: SamplingModel, batch: Batch, num_bootstrap: int = 0
    ):
        """Compute the losses over trajectories contained in the batch
        Parameters
        ----------
        model: SamplingModel
            A GNN taking in a batch of graphs as input as per constructed by `self.construct_batch`.
            Must have a `log_z` method, which predicts log of Z(cond_info)
        batch: Batch
            batch of graphs inputs as per constructed by `self.construct_batch`.
            `batch.num_offline` and `batch.num_online` are also read here; they are
            not set by `construct_batch` and must be attached by the caller.
        num_bootstrap: int
            the number of trajectories for which the reward loss is computed. Ignored if 0.
        Returns
        -------
        loss: Tensor
            Mean trajectory loss plus the weighted reward loss.
        info: Dict
            Diagnostic scalars (per-source losses, invalid-trajectory stats, mean log Z).
        Raises
        ------
        ValueError
            If any per-trajectory loss is not finite.
        NotImplementedError
            If `self.tb_loss_is_huber` is set.
        """
        dev = batch.x.device
        # A single trajectory comprises many graphs
        num_trajs = int(batch.traj_lens.shape[0])
        rewards = batch.rewards
        cond_info = batch.cond_info

        # This index says which trajectory each graph belongs to, so
        # it will look like [0,0,0,0,1,1,1,2,...] if trajectory 0 is
        # of length 4, trajectory 1 of length 3, and so on.
        batch_idx = torch.arange(num_trajs, device=dev).repeat_interleave(
            batch.traj_lens
        )
        # The position of the last graph of each trajectory
        final_graph_idx = torch.cumsum(batch.traj_lens, 0) - 1

        # Forward pass of the model, returns a GraphActionCategorical and the optional bootstrap predictions
        fwd_cat, log_reward_preds = model(batch, cond_info[batch_idx])

        # Retrieve the reward predictions for the full graphs,
        # i.e. the final graph of each trajectory
        log_reward_preds = log_reward_preds[final_graph_idx, 0]
        # Compute trajectory balance objective
        Z = model.log_z(cond_info)[:, 0]
        # This is the log prob of each action in the trajectory
        log_prob = fwd_cat.log_probability(batch.actions)
        # The log prob of each backward action
        log_p_B = (1 / batch.num_backward).log()
        # Take log rewards, and clip
        assert rewards.ndim == 1
        Rp = torch.maximum(rewards.log(), torch.tensor(-100.0, device=dev))
        # This is the log probability of each trajectory
        traj_log_prob = scatter(
            log_prob, batch_idx, dim=0, dim_size=num_trajs, reduce="sum"
        )
        # Compute log numerator and denominator of the TB objective
        numerator = Z + traj_log_prob
        denominator = Rp + scatter(
            log_p_B, batch_idx, dim=0, dim_size=num_trajs, reduce="sum"
        )

        if self.epsilon is not None:
            # Numerical stability epsilon
            epsilon = torch.tensor([self.epsilon], device=dev).float()
            numerator = torch.logaddexp(numerator, epsilon)
            denominator = torch.logaddexp(denominator, epsilon)

        invalid_mask = 1 - batch.is_valid
        if self.mask_invalid_rewards:
            # Instead of being rude to the model and giving a
            # log-reward of -100 what if we say, whatever you think the
            # logprobability of this trajectory is it should be smaller
            # (thus the `numerator - 1`). Why 1? Intuition?
            denominator = denominator * (1 - invalid_mask) + invalid_mask * (
                numerator.detach() - 1
            )

        if self.tb_loss_is_mae:
            traj_losses = abs(numerator - denominator)
        elif self.tb_loss_is_huber:
            raise NotImplementedError("Huber loss is not supported yet")  # TODO
        else:
            traj_losses = (numerator - denominator).pow(2)

        # Normalize losses by trajectory length
        if self.length_normalize_losses:
            traj_losses = traj_losses / batch.traj_lens
        if self.reward_normalize_losses:
            # multiply each loss by how important it is, using R as the importance factor
            # factor = Rp.exp() / Rp.exp().sum()
            factor = -Rp.min() + Rp + 1
            factor = factor / factor.sum()
            assert factor.shape == traj_losses.shape
            # * num_trajs because `factor` sums to 1 (a convex combination) and a
            # .mean() is taken later, which would otherwise divide by num_trajs again
            traj_losses = factor * traj_losses * num_trajs

        if self.bootstrap_own_reward:
            num_bootstrap = num_bootstrap or len(rewards)
            if self.reward_loss_is_mae:
                reward_losses = abs(
                    rewards[:num_bootstrap] - log_reward_preds[:num_bootstrap].exp()
                )
            else:
                reward_losses = (
                    rewards[:num_bootstrap] - log_reward_preds[:num_bootstrap].exp()
                ).pow(2)
            reward_loss = reward_losses.mean()
        else:
            reward_loss = 0

        loss = traj_losses.mean() + reward_loss * self.reward_loss_multiplier
        # The slicing below treats the first `num_offline` trajectories as the
        # offline ones and the remainder as online.
        info = {
            "offline_loss": traj_losses[: batch.num_offline].mean()
            if batch.num_offline > 0
            else 0,
            "online_loss": traj_losses[batch.num_offline :].mean()
            if batch.num_online > 0
            else 0,
            "reward_loss": reward_loss,
            "invalid_trajectories": invalid_mask.sum() / batch.num_online
            if batch.num_online > 0
            else 0,
            # The 1e-4 avoids a division by zero when no trajectory is invalid
            "invalid_logprob": (invalid_mask * traj_log_prob).sum()
            / (invalid_mask.sum() + 1e-4),
            "invalid_losses": (invalid_mask * traj_losses).sum()
            / (invalid_mask.sum() + 1e-4),
            "log_z": Z.mean(),
        }

        if not torch.isfinite(traj_losses).all():
            raise ValueError("loss is not finite")
        return loss, info
@dataclass
class Action:
    """One graph-building step.

    Parameters
    ----------
    act_type: ActionType
        Which kind of step this is
    source: :obj:`int`, optional
        Node the step is applied on
    target: :obj:`int`, optional
        Second node; when given, the step concerns an edge
    value: :obj:`Any`, optional
        Value applied by the step (e.g. the type of a new node)
    attr: :obj:`str`, optional
        Name of the node/edge attribute being set
    """

    act_type: ActionType
    source: Optional[int] = None
    target: Optional[int] = None
    value: Optional[Any] = None
    attr: Optional[str] = None

    def __repr__(self) -> str:
        # Show only the fields that are set, in the fixed order
        # source, target, attr, value.
        shown = []
        for field_value in (self.source, self.target, self.attr, self.value):
            if field_value is None:
                continue
            shown.append(str(field_value))
        return "<{}, {}>".format(self.act_type, ", ".join(shown))
class ActionCategorical:
    def __init__(
        self,
        graphs: Batch,
        logits: List[torch.Tensor],
        keys: List[str],
        types: List[ActionType],
        deduplicate_edge_index: bool = True,
    ):
        """A multi-type Categorical compatible with generating structured actions.
        What is meant by type here is that there are multiple types of
        mutually exclusive actions, e.g. AddNode and AddEdge are
        mutually exclusive, but since their logits will be produced by
        different variable-sized tensors (corresponding to different
        elements of the graph, e.g. nodes or edges) it is inconvenient
        to stack them all into one single Categorical. This class
        provides this convenient interaction between torch_geometric
        Batch objects and lists of logit tensors.
        Parameters
        ----------
        graphs: Batch
            A Batch of graphs to which the logits correspond
        logits: List[torch.Tensor]
            A list of tensors of shape `(n, m)` representing logits
            over a variable number of graph elements (e.g. nodes) for
            which there are `m` possible actions. `n` should thus be
            equal to the sum of the number of such elements for each
            graph in the Batch object. The length of the `logits` list
            should thus be equal to the number of element types (in
            other words there should be one tensor per type).
        keys: List[Union[str, None]]
            The keys corresponding to the Graph elements for each
            tensor in the logits list. Used to extract the `_batch`
            and slice attributes. For example, if the first logit
            tensor is a per-node action logit, and the second is a
            per-edge, `keys` could be `['x', 'edge_index']`. If
            keys[i] is None, the corresponding logits are assumed to
            be graph-level (i.e. if there are `k` graphs in the Batch
            object, this logit tensor would have shape `(k, m)`)
        types: List[ActionType]
            The action type each logit corresponds to.
        deduplicate_edge_index: bool, default=True
            If true, this means that the 'edge_index' keys have been reduced
            by e_i[::2] (presumably because the graphs are undirected)
        """
        # TODO: handle legal action masks? (e.g. can't add a node attr to a node that already has an attr)
        self.num_graphs = graphs.num_graphs
        # The logits
        self.logits = logits
        self.types = types
        self.keys = keys
        self.dev = dev = graphs.x.device

        # I'm extracting batches and slices in a slightly hackish way,
        # but I'm not aware of a proper API to torch_geometric that
        # achieves this "neatly" without accessing private attributes

        # This is the minibatch index of each entry in the logits
        # i.e., if graph i in the Batch has N[i] nodes,
        #    g.batch == [0,0,0, ...,  1,1,1,1,1, ... ]
        #                 N[0] times    N[1] times
        # This generalizes to edges and non-edges.
        # Append '_batch' to keys except for 'x', since TG has a special case (done by default for 'x')
        self.batch = [
            getattr(graphs, f"{k}_batch" if k != "x" else "batch") if k is not None
            # None signals a global logit rather than a per-instance logit
            else torch.arange(graphs.num_graphs, device=dev)
            for k in keys
        ]
        # This is the cumulative sum (prefixed by 0) of N[i]s
        # (for graph-level logits, graph i simply starts at row i)
        self.slice = [
            graphs._slice_dict[k]
            if k is not None
            else torch.arange(graphs.num_graphs, device=dev)
            for k in keys
        ]
        # Cache filled by log_softmax() on first use
        self.log_probs = None

        if deduplicate_edge_index and "edge_index" in keys:
            # Keep every other batch entry and halve the slice offsets so both
            # line up with logits that hold one row per pair of edge_index entries
            idx = keys.index("edge_index")
            self.batch[idx] = self.batch[idx][::2]
            self.slice[idx] = self.slice[idx].div(2, rounding_mode="floor")

    def detach(self):
        """Return a shallow copy whose logits (and cached log-probs, if any) are detached."""
        new = copy(self)
        new.logits = [i.detach() for i in new.logits]
        if new.log_probs is not None:
            new.log_probs = [i.detach() for i in new.log_probs]
        return new

    def to(self, device):
        """Move all held tensors to `device` in place and return `self`."""
        self.dev = device
        self.logits = [i.to(device) for i in self.logits]
        self.batch = [i.to(device) for i in self.batch]
        self.slice = [i.to(device) for i in self.slice]
        if self.log_probs is not None:
            self.log_probs = [i.to(device) for i in self.log_probs]
        return self

    def log_softmax(self):
        """Compute log-probabilities given logits

        The normalisation is done per graph and jointly over every logit
        tensor. The result is cached in `self.log_probs` and returned as a
        list with one tensor per logit tensor, of matching shape.
        """
        if self.log_probs is not None:
            return self.log_probs
        # Use the `subtract by max` trick to avoid precision errors:
        # compute max (per graph, across all types and columns)
        maxl = (
            torch.cat(
                [
                    scatter(i, b, dim=0, dim_size=self.num_graphs, reduce="max")
                    for i, b in zip(self.logits, self.batch)
                ],
                dim=1,
            )
            .max(1)
            .values.detach()
        )
        # subtract by max then take exp
        # x[b, None] indexes by the batch to map back to each node/edge and adds a broadcast dim
        # the small additive constant keeps the later .log() finite
        exp_logits = [
            (i - maxl[b, None]).exp() + 1e-40 for i, b in zip(self.logits, self.batch)
        ]
        # sum corrected exponentiated logits, to get log(sum(exp(logits - max))) = log(Z) - max
        log_z = sum(
            [
                scatter(i, b, dim=0, dim_size=self.num_graphs, reduce="sum").sum(1)
                for i, b in zip(exp_logits, self.batch)
            ]
        ).log()
        # log probabilities is log(exp(logit) / Z)
        self.log_probs = [
            i.log() - log_z[b, None] for i, b in zip(exp_logits, self.batch)
        ]
        return self.log_probs

    def sample(self) -> List[ActionIndex]:
        """Draw one action per graph; returns a list of `(type, row, column)` int triples.

        Raises ValueError if the winning perturbed logit of any graph is not finite.
        """
        # Use the Gumbel trick to sample categoricals
        # i.e. if X ~ argmax(logits - log(-log(uniform(logits.shape))))
        # then  p(X = i) = exp(logits[i]) / Z
        # Here we have to do the argmax first over the variable number
        # of rows of each element type for each graph in the
        # minibatch, then over the different types (since they are
        # mutually exclusive).

        # Uniform noise
        u = [torch.rand(i.shape, device=self.dev) for i in self.logits]
        # Gumbel noise
        gumbel = [logit - (-noise.log()).log() for logit, noise in zip(self.logits, u)]
        # scatter_max and .max create a (values, indices) pair
        # These logits are 2d (num_obj_of_type, num_actions_of_type),
        # first reduce-max over the batch, which preserves the
        # columns, so we get (minibatch_size, num_actions_of_type).
        # First we prefill `out` with very negative values in case
        # there are no corresponding logits (this can happen if e.g. a
        # graph has no edges), we don't want to accidentally take the
        # max of that type.
        mnb_max = [
            torch.zeros(self.num_graphs, i.shape[1], device=self.dev) - 1e6
            for i in self.logits
        ]
        mnb_max = [
            scatter_max(i, b, dim=0, out=out)
            for i, b, out in zip(gumbel, self.batch, mnb_max)
        ]
        # Then over cols, this gets us which col holds the max value,
        # so we get (minibatch_size,)
        col_max = [values.max(1) for values, idx in mnb_max]
        # Now we look up which row in those argmax cols was the max:
        row_pos = [
            idx_mnb[torch.arange(len(idx_col)), idx_col]
            for (_, idx_mnb), (_, idx_col) in zip(mnb_max, col_max)
        ]
        # The maxes themselves
        maxs = [values for values, idx in col_max]
        # Now we need to check which type of logit has the actual max
        type_max_val, type_max_idx = torch.stack(maxs).max(0)
        if torch.isfinite(type_max_val).logical_not_().any():
            raise ValueError(
                "Non finite max value in sample", (type_max_val, self.logits)
            )

        # Now we can return the indices of where the actions occurred
        # in the form List[(type, row, column)]
        actions = []
        for i in range(type_max_idx.shape[0]):
            t = type_max_idx[i]
            # Subtract from the slice of that type and index, since the computed
            # row position is batch-wise rather than graph-wise
            actions.append(
                (int(t), int(row_pos[t][i] - self.slice[t][i]), int(col_max[t][1][i]))
            )
        # It's now up to the Context class to create GraphBuildingAction instances
        # if it wants to convert these indices to env-compatible actions
        return actions

    def log_probability(self, actions: List[ActionIndex]) -> torch.Tensor:
        """The log-probability of a list of action tuples
        Parameters
        ----------
        actions: ActionIndex
            A list of action indices (action index triples), where the
            i-th triple is the action taken in the i-th graph
        Returns
        -------
        A tensor stacking one log-probability per action.
        """
        log_probs = self.log_softmax()
        # `row` is graph-local, so add the graph's slice offset to index the
        # batch-wide log-probability tensor of that type
        return torch.stack(
            [
                log_probs[t][row + self.slice[t][i], col]
                for i, (t, row, col) in enumerate(actions)
            ]
        )
def generate_forward_trajectory(
    g: Graph, max_nodes: int = None
) -> List[Tuple[Graph, Action]]:
    """Sample (uniformly) a trajectory that generates `g`.

    Starting from a randomly chosen node of `g`, the graph is rebuilt one
    action at a time (add node, add edge, set node/edge attribute); the
    order is randomised by popping a random pending item at every step.

    Parameters
    ----------
    g: Graph
        The graph to generate. Every node must carry a ``"v"`` attribute,
        which is used as the value of the ADD_NODE action.
    max_nodes: int
        Currently unused; kept for interface compatibility.

    Returns
    -------
    List[Tuple[Graph, Action]]
        ``(state, action)`` pairs where ``state`` is a copy of the partial
        graph taken *before* ``action`` is applied. The last pair is the
        finished graph together with a STOP action. Nodes of the built
        graphs are relabelled in insertion order starting from 0. For an
        empty `g` the result is the single pair ``(empty graph, STOP)``.

    Notes
    -----
    New work is only discovered through the neighbours of already inserted
    nodes, so nodes that are not reachable from the chosen start node are
    never pushed onto the stack.
    """
    # TODO: should this be a method of GraphBuildingEnv? handle set_node_attr flags and so on?
    gn = Graph()
    if len(g.nodes) == 0:
        # There is no start node to pick (np.random.randint(0, 0) would
        # raise an opaque ValueError); the only valid trajectory is to stop.
        return [(gn, Action(ActionType.STOP))]
    # Choose an arbitrary starting point, add to the stack
    stack: List[Tuple[int, ...]] = [(np.random.randint(0, len(g.nodes)),)]
    traj = []
    # This map keeps track of node labels in gn, since we have to start from 0
    relabeling_map: Dict[int, int] = {}
    while len(stack):
        # We pop from the stack until all nodes and edges have been
        # generated and their attributes have been set. Un-inserted
        # nodes/edges will be added to the stack as the graph is
        # expanded from the starting point. Nodes/edges that have
        # attributes will be reinserted into the stack until those
        # attributes are "set".
        i = stack.pop(np.random.randint(len(stack)))

        gt = gn.copy()  # This is a shallow copy
        if len(i) > 1:  # i is an edge
            e = relabeling_map.get(i[0], None), relabeling_map.get(i[1], None)
            if e in gn.edges:
                # i exists in the new graph, that means some of its attributes need to be added
                attrs = [j for j in g.edges[i] if j not in gn.edges[e]]
                if len(attrs) == 0:
                    # If nodes are in cycles, edges leading to them get
                    # stacked multiple times; disregard the duplicates
                    continue
                attr = attrs[np.random.randint(len(attrs))]
                gn.edges[e][attr] = g.edges[i][attr]
                act = Action(
                    ActionType.SET_EDGE_ATTR,
                    source=e[0],
                    target=e[1],
                    attr=attr,
                    value=g.edges[i][attr],
                )
            else:
                # i doesn't exist, add the edge
                if e[1] not in gn.nodes:
                    # The endpoint of the edge is not in the graph, this is an AddNode action
                    assert e[1] is None  # normally we shouldn't have relabeled i[1] yet
                    relabeling_map[i[1]] = len(relabeling_map)
                    e = e[0], relabeling_map[i[1]]
                    gn.add_node(e[1], v=g.nodes[i[1]]["v"])
                    gn.add_edge(*e)
                    for j in g[i[1]]:  # stack unadded edges/neighbours
                        jp = relabeling_map.get(j, None)
                        if jp not in gn or (e[1], jp) not in gn.edges:
                            stack.append((i[1], j))
                    act = Action(
                        ActionType.ADD_NODE, source=e[0], value=g.nodes[i[1]]["v"]
                    )
                    if len(gn.nodes[e[1]]) < len(g.nodes[i[1]]):
                        # we still have attributes to add to node i[1]
                        stack.append((i[1],))
                else:
                    # The endpoint is in the graph, this is an AddEdge action
                    assert e[0] in gn.nodes
                    gn.add_edge(*e)
                    act = Action(ActionType.ADD_EDGE, source=e[0], target=e[1])

            if len(gn.edges[e]) < len(g.edges[i]):
                stack.append(i)  # we still have attributes to add to edge i
        else:  # i is a node, (u,)
            u = i[0]
            n = relabeling_map.get(u, None)
            if n not in gn.nodes:
                # u doesn't exist yet, this should only happen for the first node
                assert len(gn.nodes) == 0
                act = Action(ActionType.ADD_NODE, source=0, value=g.nodes[u]["v"])
                n = relabeling_map[u] = len(relabeling_map)
                gn.add_node(0, v=g.nodes[u]["v"])
                for j in g[u]:  # For every neighbour of node u
                    if relabeling_map.get(j, None) not in gn:
                        stack.append((u, j))  # push the (u,j) edge onto the stack
            else:
                # u exists, meaning we have attributes left to add
                attrs = [j for j in g.nodes[u] if j not in gn.nodes[n]]
                attr = attrs[np.random.randint(len(attrs))]
                gn.nodes[n][attr] = g.nodes[u][attr]
                act = Action(
                    ActionType.SET_NODE_ATTR,
                    source=n,
                    attr=attr,
                    value=g.nodes[u][attr],
                )
            if len(gn.nodes[n]) < len(g.nodes[u]):
                stack.append((u,))  # we still have attributes to add to node u
        traj.append((gt, act))
    traj.append((gn, Action(ActionType.STOP)))
    return traj
Cc1ccc(C2=C(C#N)C(=O)NC(c3cccc(Br)c3)(C(F)(F)F)C2)cc1 17 | CC1(COc2cccc(-c3ccc(F)cc3F)n2)CCn2cc([N+](=O)[O-])nc2O1 18 | Cc1nn(C)c2c1C(c1ccc(C(C)C)cc1)C(C#N)=C(N)O2 19 | Cc1[nH]c2ncnc(Nc3cccc(C#N)c3)c2c1C 20 | Nc1nccc(-c2ccc3c(NCc4ccccc4)n[nH]c3c2)n1 21 | CCOc1ccc(NC(=O)CSc2cn(CCNC(=O)c3ccccc3F)c3ccccc23)cc1 22 | CC(C)(C)NC(=O)C1CCN(Cc2cccc(NC(=O)C3CCOC3)c2)CC1 23 | CCCCC(=O)NC(Cc1cccc(C)c1)C(=O)NCC#N 24 | COc1ccc(-c2cc(-c3cccc(S(C)(=O)=O)c3)cnc2N)cn1 25 | CC(C)n1cc(NC(=O)c2cc(NC(=O)c3ccc(Cl)c(Cl)n3)cn2C)cc1C(=O)Nc1cc(C(=O)NCCCN(C)C)n(C)c1 26 | CC(C)(NC(=O)OCc1ccccc1)C(=O)OC(=O)C(C)(C)NC(=O)OCc1ccccc1 27 | CC(C)Cc1cc(C(=O)NCC2(C#N)CCN(CC3=Cc4ccccc4OC3(C)C)CC2)nn1-c1ccccc1 28 | COCCOc1cc(N)nc(C)c1CNC(=O)c1cnn(Cc2ccccc2)c1 29 | CC(=O)Nc1ccc2c(c1)[nH]c1cc(C(F)(F)F)ccc12 30 | COc1ccc(-c2nc3ccccc3c(=O)[nH]2)cc1Oc1ccc(Cl)cc1Cl 31 | O=C(COC(=O)c1cncc(Br)c1)NC1CCCCCC1 32 | CN1C2CCC1CC(=NOC(c1ccc(Cl)cc1)c1ccc(Cl)cc1)C2 33 | COc1ccc(C=C(NC(=O)c2ccc([N+](=O)[O-])cc2)C(=O)NCCN2CCOCC2)cc1 34 | Cc1ccc(S(=O)(=O)N(C)C)cc1NC(S)=NCc1ccco1 35 | CCOC(=O)c1cc2ccccn2c(=Nc2ccc(F)cc2)n1 36 | Cc1nccn1-c1cc(-c2cnc(N)c(OC(F)F)c2)nc(N2CC3CC2C3)n1 37 | O=C(NCc1ccccc1Br)c1cnc2sc(N3CCCCCC3)nn2c1=O 38 | CCCCCCNC(=O)CSc1nc2c([nH]c3ccccc32)c(=O)n1-c1ccccc1 39 | CCN(CC)S(=O)(=O)c1cccc(C(=O)Nc2ccc(-c3cccc(C(N)=O)c3)cc2C(=O)O)c1 40 | COc1cc(Cl)c(Cl)cc1NC(=O)NS(=O)(=O)c1ccc(C)cc1 41 | CS(=O)(=O)Oc1ccc(C(=NNc2ccc([N+](=O)[O-])cc2[N+](=O)[O-])c2ccc(OS(C)(=O)=O)cc2)cc1 42 | CC(C)=CCCC1(C)C=Cc2c(ccc(C(=O)C=Cc3ccc(C#N)cc3)c2O)O1 43 | O=C1CC(=C2c3ccccc3-c3ccccc32)C(=O)N1CCCCN1CCN(c2cccc(C(F)(F)F)c2)CC1 44 | CN(C)C(=O)CN1CC2CCN(C(=O)C3CCCO3)CCC2S1(=O)=O 45 | CN(C)C(=O)Cc1cn(-c2cccc(C(F)(F)F)c2)nc1-c1ccc2c(c1)CCc1ccccc1-2 46 | O=C(Nc1cc(Br)ccc1C(=O)O)c1ccc(SC(F)(F)F)cc1 47 | CCn1c(COc2ccccc2C)nnc1SCC(=O)CC(=O)Nc1ccccc1OC 48 | Cc1cccc(CN2CCN(CC(=O)O)C2=O)c1 49 | CCc1c(-c2nnc(C3(c4ccc(OC)cc4)CC3)o2)nc(-c2ccc(Cl)cc2Cl)n1-c1ccc(Br)cc1 50 | O=C1NC(=O)C(=Cc2cnn3c(NC4CC4)cc(NC(=O)OCc4ccccc4)nc23)N1 51 | 
Cc1ccccc1NS(=O)(=O)c1cc2c(cc1C)NC(=O)CO2 52 | CCCCCCCN(CC(=O)Nc1ccc(C)cc1C)Cc1ccc(OC(C)(C)C(=O)O)cc1 53 | Cc1cc(NC(=O)Nc2cccc(OC(F)(F)F)c2)c2ccccc2n1 54 | CC1=C(CCc2ccoc2)C2(C)CCCC(C)(C)C2C(O)C1=O 55 | COc1ccc(NC(=O)CCN2C(=O)NC3(CCCC3)C2=O)cc1Cl 56 | Cc1nc(SCC(=O)c2ccc(S(N)(=O)=O)c(Cl)c2)[nH]c(=O)c1Cc1ccccc1 57 | O=C(O)CCNc1nc(Cc2nnc(SCC(=O)NNC(=O)CCl)n2NC(=O)c2cccc([N+](=O)[O-])c2)cs1 58 | Cc1cccc(C)c1N1C(=O)c2nccnc2C1=O 59 | CCc1nc2ccc(C3CCN(S(C)(=O)=O)CC3)cn2c1N(C)c1nc2c(s1)Cc1ccc(F)cc1-2 60 | Cc1cc(C)cc(Oc2nccc(-c3c(C)nnn3C3CCN(C(=N)N)CC3)n2)c1 61 | CS(=O)(=O)c1ccc(-c2c(-c3ccccc3)cc(-c3ccc(F)cc3)oc2=O)cc1 62 | COc1cc(-c2nc(=O)c3c([nH]2)sc2cc(C(F)(F)F)ccc23)ccc1OCC(=O)O 63 | O=C(Nc1ccc(SC(F)(F)F)cc1)Nc1ccc(SC(F)(F)F)c(Cl)c1 64 | COCc1nn(CCn2cc(Br)cn2)c(=O)o1 65 | O=C(Cc1ccccc1)NCc1ccc(-c2nc(C(=O)N3CC=CCC3)co2)cc1 66 | COc1ccc(C2=C(c3ccc(SC)cc3)CC3CCCN3C2=O)cc1C 67 | CC[N+]1=C(C)C(C)(C)c2cc(C)ccc21 68 | COc1ccc(C)cc1NC(=O)CN(C)S(=O)(=O)c1ccc2[nH]c(=O)oc2c1 69 | Cc1cc2cc(C#N)ccc2c(C)c1Nc1nc(Nc2ccc(C#N)cc2)nc(OCCCN2CCOCC2)n1 70 | NCCC(N)C(=O)N1CCCCC1 71 | CN(C)c1ncccc1C(=O)N1CCCC1Cn1cccn1 72 | COc1ccc(NC(=O)CN(C)C(=O)c2cc(C3CC3)nn2-c2ccccc2)cc1 73 | NS(=O)(=O)c1ccc(NC(=O)c2cccc(C(=O)O)n2)c(F)c1 74 | CCN1C(=O)C(C)(C)Oc2cc(C)c(-c3cc(C(C)=CC(=O)O)ccc3OC(F)(F)F)cc21 75 | NC1CCCC(C(=O)Nc2ccc3[nH]nc(Nc4cccc5cccnc45)c3c2)C1 76 | CCCCc1ccc(CC(=O)Nc2cc(S(=O)(=O)N3CCOCC3)ccc2O)cc1 77 | O=C(c1nc(-c2ccc(Cl)cc2)c2cc(Cl)ccc2n1)N1CCCCC1 78 | NNc1ncnc2[nH]cnc12 79 | COc1ccc(C2C(Oc3ccc(Cl)cc3Cl)C(=O)N2c2sc3c(c2C#N)C=C(C=Cc2c(C)nn(-c4ccccc4)c2Cl)CC3(C)C)cc1 80 | CON=C1CN=C(C(C)C)N1c1ccc(C(O)(C(F)(F)F)C(F)(F)F)cc1 81 | O=C(CCC(NC(=O)c1cc(Cl)cc(Cl)c1)C(=O)N1CCC2(CCCC2)CC1)NC(Cc1ccc2ccccc2c1)C(=O)O 82 | O=C(NC1=C(N2CCOCC2)C(=O)c2ccccc2C1=O)c1ccccc1 83 | CCCc1cc(=O)[nH]c(SCC(=O)c2ccc(S(N)(=O)=O)c(Cl)c2)n1 84 | COc1cc2nc(N(C)c3nc(OC)c4ccccc4n3)nc(C)c2cc1OC 85 | O=C(NO)c1ccc[n+]([O-])c1 86 | CC(C)NC(=O)CSc1nc2sccc2c(=O)n1-c1ccccc1 87 | CSc1nc(C(=O)N2CCCC(C(=O)c3ccc(Cl)cc3)C2)cs1 88 | 
CCCc1nc(N)c2nc(-n3nccn3)n(C)c2n1 89 | O=C(O)Cn1c2c(c3cc(F)ccc31)CN(C(=O)CCc1cccc3ccccc13)CC2 90 | COc1ccc(C(=O)Nc2ccc(C(C)N(C)c3ncc4c(N)nc(N)nc4n3)cc2)cc1 91 | C=CCc1cc(CN2CCN(C(C)C)CC2)c(O)c(-c2ccc(O)c(CC=C)c2)c1 92 | NC1(C(=O)O)CC2CCC1C2 93 | O=C(NCCCCCn1c(C2CCNCC2)nc2cc(Cl)c(Cl)cc21)c1ccccc1 94 | CC(C)(NC(=O)c1ccc(C#Cc2ccccc2)cc1)C(=O)NO 95 | O=c1[nH]c2ccccc2n2c(=O)n(-c3ccc(Cl)cc3)nc12 96 | CCOC(=O)c1ccc(NC2CCCN(c3nnc(Cc4ccccc4)c4ccccc34)C2)nc1 97 | Cc1nc(C(O)CNC(C)(C)C)ccc1O 98 | COc1cc(CCC(=O)OCC(=O)Nc2ccc(C)cc2Cl)cc(OC)c1OC 99 | Cc1noc(C)c1CN(C)C(=O)NC(C)c1cccs1 100 | CCS(=O)(=O)c1cc(C#N)ccc1C1C2=C(CCC2=O)N(c2cccc(C(F)(F)F)c2)C(=O)N1C(=O)NC 101 | FC(F)(F)C1(c2nnc(-c3nn(-c4ccc(Cl)cc4Cl)c(-c4ccc(Br)cc4)c3Cn3cncn3)s2)CC1 102 | CC#CC(CC(=O)O)c1ccc(OCc2ccc(CN3CCC4(C=Cc5ccccc54)CC3)cc2)cc1F 103 | Cc1ccnc(NCCCCCCC(=O)NC(CC(=O)O)c2cccc([N+](=O)[O-])c2)c1 104 | N=C(N)NN=Cc1c(-c2ccc(Cl)c(Cl)c2Cl)nc2n1CCS2 105 | O=C(NCCC1CCN(Cc2ccccc2)CC1)c1ccc(Cl)cn1 106 | CN(C)Nc1nc(-c2ccc([N+](=O)[O-])o2)cs1 107 | CC(=O)N1C2CCC1c1c(n(C)c3cc(-n4ccc(OCc5ccccc5)cc4=O)ccc13)C2 108 | O=C1CSC(N2N=C(c3ccc(Br)cc3)CC2c2ccco2)=N1 109 | Fc1ccc2[nH]c3c(c2c1)CN(CCCCc1ccncc1)CC3 110 | O=C(O)c1ccc(C(=O)c2cc([N+](=O)[O-])cc([N+](=O)[O-])c2)cc1 111 | O=c1[nH]cnc2c1ncn2C1OC2COP(=O)(O)OC3C(O)C(COP(=O)(O)OC2C1O)OC3n1cnc2c(=O)[nH]cnc21 112 | CCOC(=O)N1CCN(S(=O)(=O)N2CCCC(C(=O)NCc3ccc(OC)cc3)C2)CC1 113 | NC1=CC(=O)c2ncccc2C1=O 114 | CNCc1cc(C(=O)NC)ccc1Oc1ccc(Cl)cc1C 115 | O=C(Cc1ccc(F)cc1)N1CCN=C1SCc1cccc([N+](=O)[O-])c1 116 | CC(Nc1cc(F)cc(F)c1)c1cc(C(=O)N(C)C)cc2c(=O)cc(N3CCOCC3C)oc12 117 | C=C(COC)C1CCC2(C(=O)O)CCC3(C)C(CCC4C5(C)CCC(OC(=O)n6ccnc6C)C(C)(C)C5CCC43C)C12 118 | O=C(OCCCc1cccnc1)C1CCCCN1S(=O)(=O)Cc1cccc([N+](=O)[O-])c1 119 | CCN(CC)c1ccc(C(=O)c2sc(NC(=O)C3(c4ccc5c(c4)OCO5)CC3)nc2-c2ccccc2)cc1 120 | FC1(F)CCNC(C2COC(c3ccccc3)(c3ccccc3)O2)C1 121 | Cc1ccc2c(c1Br)C(=NN1C(=O)c3ccccc3C1=O)C(=O)N2 122 | O=C(C1CC(=O)N(C2CCCCC2)C1)N1CCC2(CC1)OCCO2 123 | O=C(O)C(Cc1ccc(F)cc1)C1CS1 124 | 
O=S(=O)(c1ccccc1)N(CC(F)(F)F)c1ccc(C(O)(C(F)(F)F)C(F)(F)F)cc1 125 | CC1Cc2ccccc2N1C(=O)COC(=O)C=Cc1ccc(F)cc1 126 | O=C(O)C1C(c2ccccc2)C(C(=O)OCc2ccccc2)C1c1ccccc1 127 | OC1CCCCCCC1 128 | COCC#Cc1nn(C2OC(COS(N)(=O)=O)C(O)C2O)c2ncnc(N)c12 129 | CN(C1CCCCC1)S(=O)(=O)c1cccc2nsnc12 130 | CC(=O)N(C)Cc1cn2cc(NC(=O)c3ccc(-c4ccc(C(F)(F)F)cc4)cc3)ccc2n1 131 | COc1cccc2oc3c(N4CC5CC4CN5)nc(N)nc3c12 132 | COc1cc2c(cc1OC)CN(CCCCn1c3ccccc3c3ccccc31)CC2 133 | OC(COc1cccc2[nH]ccc12)CN1CCC(COc2ccccc2)CC1 134 | C=CC(=O)Nc1cccc(Br)c1 135 | O=C(NC1CCN(S(=O)(=O)c2ccc(OC3CCNCC3)c(Br)c2)C1)c1ccc(Cl)c(Cl)c1 136 | Cc1ccc(C)n1NC(=O)c1ccc(O)cc1 137 | Cc1cccc(CN2CC3CN(C(=O)NC(C)C)CC3C2=O)c1 138 | COc1cc(N2CCN(N3CCN(C)CC3)CC2)ccc1Nc1ncc2ccc(-c3ccccc3OC)n2n1 139 | Cc1ncc(CNC2CCN(CCn3c(=O)ccc4ncc(F)cc43)CC2F)cc1C#N 140 | O=C(C1Cc2c(sc3ccccc23)CN1)N1CCNCC1 141 | CCCCCCCCCCCCCCCCNC(=O)c1c[nH]c(-c2ccccc2)n1 142 | O=c1c2c(-c3ccccc3)c3ccccc3nc2cnn1-c1ccccc1 143 | O=C(NCc1ccc(F)cc1)C(=O)c1cn(CC(=O)N2CCOCC2)c2ccccc12 144 | Cc1cnn(C(C)C2CC2)c1NC(=O)c1ccco1 145 | Cc1[nH]n(-c2ccccc2)c(=O)c1N=Nc1ccc2c(c1)OCO2 146 | O=C(Nc1ccc2cn[nH]c2c1)NC1CCCCC1CN1CCCC(Cc2ccc(F)cc2)C1 147 | COc1ccc2[nH]cc3c2c1CC(N(C)C)C3 148 | CCC=CCCCCCCCCCCCCCCCCCC(=O)O 149 | CCC(N)Cc1cc(OC)c(SC)cc1OC 150 | COc1ccc(NC(=O)NCCN2CCOCC2)cn1 151 | O=C1NC(=O)N(c2ccccc2)C(=O)C1=CNc1cccc2c1C(=O)c1ccccc1C2=O 152 | CCOc1cc(=O)n(C)cc1-c1cc(NC(=O)CCc2ccc(OC)c(OC)c2OC)ccc1Oc1ccc(F)cc1F 153 | CS(=O)(=O)c1cc(-c2ccc(F)cc2)cc(S(=O)(=O)c2ccc(CN)s2)c1 154 | Cc1cc(Nc2ccccc2Cl)n2nc(-c3ccco3)nc2n1 155 | CC#Cc1cc(-c2ccc3c(c2)C2(N=C(C)C(N)=N2)C2(CCC(OC)CC2)C3)ccn1 156 | COc1cncc(-c2cc3ccc(OC)cc3cc2C)c1 157 | Cc1noc(C)c1CN(C)C(=O)c1nc(N2CCCC2)ncc1Cl 158 | CC(NP(=O)(OCC1OC(n2cnc3c(=O)[nH]c(N)nc32)C(C)(O)C1O)Oc1cccc2ccccc12)C(=O)OC1CCCC1 159 | COc1ccccc1CNC(=O)C1CCN(c2nc3ccc(C)cc3[nH]2)CC1 160 | c1coc(C2=Nc3ccccc3SC(c3ccc4c(c3)OCO4)C2)c1 161 | O=c1c(-c2ccccc2)cncn1C(CN1CCCC1)c1ccccc1 162 | COc1ccc(-c2nn(CCn3ccnc3)c(=O)c3ccccc23)cc1OC 163 | 
CN(C)CCCNC(=O)C(c1cccc(F)c1)N(C)C 164 | O=C(COC(=O)CNC(=O)c1ccc(Cl)cc1)Nc1ncc(C(F)(F)F)cc1Cl 165 | CCC(O)C1CC(Cc2ccccc2)CCN1CCCNC(=O)Nc1cccc(C(C)=O)c1 166 | Cc1ccc(CSc2nc(N)cc(NCCC(=O)N3CCCC3)n2)cc1 167 | O=C(CO)NC1C(O)CC(OCc2ccc(-c3ccccc3)cc2)(C(=O)O)OC1C(O)C(O)CNCc1ccc(-c2ccc(O)cc2)cc1 168 | O=C(NCCc1ccccc1)c1ccc2c(c1)S(=O)(=O)N=C1CCCCCN12 169 | CCC(=O)CCCCCC(NC(=O)CC1CC2(C1)CN(C)C2)c1ncc(-c2cc3ccccc3nc2OC)[nH]1 170 | CCN1CC2(COC(=O)c3ccccc3N)CCC(OC)C34C5CC6C(OC)CC(O)(C5C6OC)C(O)(C(OC)C23)C14 171 | COc1ccc2nc(Cl)c(C3CC(c4ccco4)=NN3C(=O)CCCC(=O)O)cc2c1 172 | O=Cc1ccc(-c2ccc(Cl)cc2F)o1 173 | O=C(Nc1ccc(C(F)(F)F)cc1)N1CCCC1c1ncnn1Cc1ccc(Cl)cc1 174 | COc1ccc2c(c1)-c1c(N3CCNCC3)nc3ccccc3c1C2=O 175 | O=C(c1ccco1)N(Cc1cccs1)c1ccc(O)cc1 176 | O=C1CCCC2C3CCC[N+]4([O-])CCCC(CN12)C34 177 | O=C1CCCN1CC#CC[S+]1CCCC1 178 | O=C(CCCOC(=O)c1ccc(O)cc1)c1ccccc1 179 | COc1ccc(NC(=O)c2cc(-c3ccc(F)cc3OC)[nH]n2)c(OC)c1 180 | CCN(Cc1ccc(Cl)nc1)C1=C([N+](=O)[O-])CN(Cc2ccc(OC)cc2)CN1C 181 | O=C(CCCN1CCN(C(=O)c2ccccc2)CC1)NC1c2ccccc2SCC2CCCCC21 182 | CCOc1ccc(S(=O)(=O)N2CCOCC2)cc1NC(=O)CSCc1ccc(C)cc1 183 | COC(=O)C(=Cc1ccc(OC)c(OC)c1)c1ccc(OC)c(OC)c1 184 | COc1ccc(-c2nc(NCc3cccc(C=CC(=O)NO)c3)cc3c2[nH]c2ccccc23)cc1 185 | CCN1C(=O)c2cccc3c(S(=O)(=O)NC4CCOC4)ccc1c23 186 | COc1ccccc1NC(=O)C(NC(=O)c1ccc(C)cc1)c1ccccc1 187 | Cc1nc2cc(S(=O)(=O)c3ccc4[nH]c(C)nc4c3)ccc2[nH]1 188 | CC1CN(C(=O)C2CN(C(C)(C)C)CC2c2ccc(F)cc2F)CC(C)C1(O)c1ccc(C(F)(F)F)cc1 189 | COc1ccc(N)cc1C1CN(c2nc(-c3ccncc3F)cc(=O)n2C)CCO1 190 | Cc1cccc(CN2CCN(c3n[nH]c(N)n3)CC2)c1 191 | Cn1cncc1C(O)(C#Cc1ccc(C#N)cc1)c1ccc(C#N)c(-c2cccc3ccccc23)c1 192 | CC(C)(O)c1ccn2c(-c3ccc(F)c(-c4ccccc4C#N)c3)cnc2c1F 193 | COc1ncccc1-c1csc2ncnc(N3C4CCC3CC(NC3CCC3)C4)c12 194 | CSC1CN(C(=O)c2ccc3cc(Oc4ccc(C(F)(F)F)cn4)ccc3n2)CC1O 195 | Cc1nn(C)cc1C(=O)NC1CCOCC1O 196 | Cc1nc2ncnn2c(N2CCN(C(=O)c3ccco3)CC2)c1Cc1ccccc1 197 | CN(C)C(=N)c1ccc(C(=O)Nc2ccc(Cl)cc2C(=O)Nc2ccc(Cl)cn2)c(N2CCCCC2C(=O)O)c1 198 | C=CCN1C(=O)C(=Cc2ccc(-c3ccccc3)cc2)SC1=S 199 | 
Cc1c(S(=O)(=O)NC2CCN(CCCc3noc4ccccc34)C2)sc2ccc(F)cc12 200 | O=C(CC1CCC(NC(=O)Nc2ccccc2F)C(CO)O1)NCc1ccc(-c2ccccc2)cc1 201 | CC(C)=CCCC(C)=CCc1c(O)ccc(C=O)c1O 202 | CC(=O)NC(C)C(=O)NCCCc1ccccc1 203 | COc1nc(NC2OC(COC(C)=O)C(OC(C)=O)C(OC(C)=O)C2OC(C)=O)c(N=C(C)C(C)=O)c(=O)n1C 204 | O=C(CCCN1C(=O)c2ccc(F)c(F)c2C1=O)Nc1ccc(-c2ccccc2)cn1 205 | CCOC(=O)c1c(C)nc(-c2cccc(C#N)c2)n1OCC(N)=O 206 | Cc1cc(C(=O)CN2C(=O)C(=O)N(C3CCCC3)C2=O)c(C)n1CCc1ccccc1 207 | CCn1c(CCC(O)CC(O)CC(=O)O)c(-c2ccc(F)cc2)c(C)c1C(=O)NCc1ccc(C(=O)OC)cc1 208 | CCNC(=O)Nc1ncnc2c1ncn2C1OC(CSCCCS(=O)(=O)O)C2OC(C=Cc3ccccc3)OC21 209 | CCCn1c(=O)c2[nH]c(-c3ccc(OCC(=O)Nc4ccc(C(C)=O)cc4)cc3)cc2n(CCC)c1=O 210 | COc1ccccc1C=CCN1CCN(CCOC(c2ccc(F)cc2)c2ccc(F)cc2)CC1 211 | C=C(C(=O)OCCC)C(O)c1ccccc1[N+](=O)[O-] 212 | CC(C)(C)[Si](C)(C)Oc1ccc2ccc(=S)oc2c1 213 | COC(=O)C(CCCCNC(=O)OC(C)(C)C)N(C=CCc1cccc(Oc2ccc(C(C)(C)C)cc2)c1)Cc1cccc(OCc2ccccc2)c1 214 | c1ccc(CN2C(c3ccccc3)=Nc3ccccc3C2c2ccccc2)cc1 215 | COc1ccc(C2c3nc(CCCO)ccc3C(c3ccc4c(c3)OCO4)C2C(=O)O)cc1 216 | O=c1[nH]ccc2c(Cc3nnc4ccc(-c5ccsc5)nn34)cccc12 217 | O=C(Cc1ccccc1)OCC1OC(=O)NC1CN1CCOCC1 218 | O=C(CN1CCOCC1)Nc1ccc(Br)cc1C(=O)c1ccccc1 219 | COc1ccc2nc(-c3cccnc3-c3cc(C(F)(F)F)ccc3Cl)[nH]c2c1 220 | O=C(CNC(=O)c1ccco1)Nc1ccc(NC(=O)N2Cc3ccccc3C2)cc1 221 | CCc1cccc(NC(=O)C2CCN(S(=O)(=O)c3cccs3)CC2)c1 222 | Cn1cc(NC(=O)c2cc(NC(=O)c3cc(NC(=O)CN=C(N)N)cn3C)cn2C)cc1C(=O)NCCC(=N)N 223 | COc1cn2c(c(C(C)C)c1=O)C(=O)c1ccnc(CC(C)C)c1-2 224 | CC1(C)C=Cc2c(cc3c(c2O)C(=O)CC(c2ccc(O)cc2O)O3)O1 225 | CCc1ccc2c(c1)NC(=O)C2(C)Cc1ccccc1OC 226 | CCC(=O)N1CCC2C(CC(Cn3cccn3)N2c2nccs2)C1 227 | CC(C)(C)OC(=O)NC(=NCC1CCCCC1)NCCCc1c[nH]cn1 228 | COc1ccc(-c2cn3nc(N4CCCC(C(=O)Nc5cc(Cl)ccc5OC)C4)sc3n2)cc1 229 | CC[Si]1(C#CC2(O)CCC3C4CCc5cc(O)ccc5C4CCC32C)CCCCC1 230 | CCCCC(CO)NC(=O)C(C)NC(=O)C(O)c1cc(F)cc(F)c1 231 | CSc1cccc(NC(=O)Nc2cccc3ccc(O)cc23)c1 232 | COc1c(O)c2c3c(c1OC)CCNC3Cc1ccccc1-2 233 | C=C(Br)CN1CCC2(c3ccccc3)CC1Cc1ccc(O)cc12 234 | 
COc1ccc(C2OC(O)C3C(c4cc(OC)c5c(c4)OCO5)OC(=O)C23)cc1OC 235 | Brc1cncc(C2CN3CCC2CC3)n1 236 | COc1cc(-c2cccc(C3CC3)c2)c(F)cc1-n1c(=O)ccc2cc(S(=O)(=O)Nc3ccon3)ccc21 237 | O=C(CCl)Nc1cccc(C(=O)N2CCC(c3cc4c(-c5cnn6ncccc56)ccnc4[nH]3)C2)c1 238 | CCNC(=O)Nc1nc2nc(N)ncc2cc1-c1c(Cl)cccc1Cl 239 | O=C(NCCc1c[nH]c2ccccc12)c1ccc[n+](Cc2ccccc2F)c1 240 | Cc1nn(CCOc2ccccc2)c(=O)n1-c1ccccc1 241 | O=c1nc(N2CC3CN(Cc4ccc(F)cc4)CC3C2)sc2c([N+](=O)[O-])cc(C(F)(F)F)cc12 242 | Cc1cncc(-c2cc3c(cn2)cnn3-c2cnc(C3CC3)c(N3CCCC(N)C3)n2)n1 243 | O=C(OCC(=O)C12CC3CC(CC(C3)C1)C2)c1ccc(CO)cc1 244 | COc1cc(OC)c(-c2cn3ccc(N4CCC(NCc5ccnc(Cl)c5)C4)cc3n2)cc1Cl 245 | COC(=O)C1=C(C(=O)OC)C2N(Cc3ccccc3)c3ccccc3C23CC(CO)NC3=N1 246 | CC(C)(C(=O)SSC(=O)C(C)(C)c1ccccc1)c1ccccc1 247 | CCc1nnc(NC(=O)CCN(c2ccc(C)cc2)S(=O)(=O)c2ccccc2)s1 248 | COc1ccc(COC(=O)C2=C(C)NC(=O)NC2c2cccc(OC)c2OC)cc1 249 | Cc1cc(C)cc(NC(=O)C2CCN(S(=O)(=O)c3c(C)noc3C=Cc3ccco3)CC2)c1 250 | c1ccc(CNCCNC2CC2)cc1 251 | O=S(=O)(NCCCCO)c1cccc(-c2ccc(C(F)(F)F)cc2)c1 252 | O=C(C(O)c1ccccc1Cl)N1CCCN(Cc2cccnc2)CC1 253 | COC(=O)c1ccc(CNc2nc(NCCCN(C)C)nc(NCc3ccc(C(=O)OC)cc3)n2)cc1 254 | CN(C)S(=O)(=O)N1CCN(S(=O)(=O)c2ccc3ccccc3c2)CC1 255 | CCCCN1C(=O)C(=Cc2ccc(OCC(=O)OC)cc2)SC1=Nc1cccc(C(=O)O)c1 256 | O=S1(=O)c2ccccc2-c2ccccc2N1CCCN1CC=C(c2ccccc2)CC1 257 | O=C(Cc1cccs1)N(Cc1ccco1)C1(C(=O)NC2CCCCC2)CCCCC1 258 | O=S1(=O)CCOCC(c2ccccc2)N1Cc1ccccc1F 259 | CS(=O)(=O)NC(CCc1ccccc1)C(=O)NC(C=O)Cc1ccccc1 260 | CC(C)Cc1nn2c(=O)cc(COC(=O)c3cccc(NS(C)(=O)=O)c3)nc2s1 261 | CC(Cl)(Cl)C(NC(=O)c1cccs1)NC(=NC#N)Nc1ccc(Cl)nc1 262 | O=C(NO)c1ccc(Cn2nnc(-c3cccc(S(F)(F)(F)(F)F)c3)n2)cc1 263 | CC(=O)N1CCC(c2nccnc2-c2ccc(F)cc2)CC1 264 | COc1cccc(-n2nc(NC(=O)C3CNC(=O)C3)cc2-c2cccc(OC(F)(F)F)c2)c1F 265 | CC(C)(C)CN1CCC2(CC1)CN(c1ccccc1Nc1ccc(-c3ccccc3)nn1)c1c(O)ccc(Cl)c12 266 | Cc1ccc(CNCC2(F)CCN(C(=O)c3ccco3)CC2)nc1 267 | CC(=O)OC1c2ccccc2-c2nc(N3CCOCC3)c3ccccc3c21 268 | COc1ccc2c(CNCCc3ccco3)c(C(=O)O)n(Cc3ccccc3)c2c1 269 | 
CC1CN(C(C)CO)C(=O)c2cc(NC(=O)Cc3cn(C)c4ccccc34)ccc2OC1CN(C)C(=O)Nc1ccc(C(F)(F)F)cc1 270 | CC(C)(C)CCN1C(=O)C(CC(=O)N2CCC(N3Cc4ccccc4NC3=O)CC2)SC1c1ccccc1N1CCNCC1 271 | O=C(O)c1ccc(-c2ccccc2C(=O)NCCCCCCCn2cc(-c3cccnc3)nn2)cc1 272 | COc1ccc(N2CCN(CCCNC(=O)c3ccc(CS(=O)(=O)c4ccc(OC)cc4)o3)CC2)cc1 273 | Cc1ccc(S(=O)(=O)N2CCc3ccccc3C2CC(=O)NCCc2ccc(C3=NCCN3)cc2)cc1 274 | NC(Cn1c(=O)c(-c2ccc(COC(=O)CO)cc2)cn(Cc2c(F)cccc2C(F)(F)F)c1=O)c1ccccc1 275 | COc1cc(-c2cnc(N)c3c(-c4ccc5ccccc5c4)csc23)cc(OC)c1OC 276 | NCCn1oc(=O)[nH]c1=O 277 | Cc1cc(C(=O)NN=Cc2cccc([N+](=O)[O-])c2)c(C)o1 278 | CNC(=O)c1c(-n2ccc(C)cc2=O)oc2cc(N(C)S(C)(=O)=O)c(-c3ccc4c(n3)-c3cc5c(F)cccc5n3CC4)cc12 279 | Cc1coc2cc3oc(=O)c(CC(=O)NC(CC(C)C)C(=O)O)c(C)c3cc12 280 | COc1ccc(C(=O)NCC(=O)N2CCN(C(=O)c3ccco3)CC2)cc1[N+](=O)[O-] 281 | CC(=O)Nc1cccc(C(=O)Nc2cccc(-c3ccc(-c4nc5cc(F)ccc5[nH]4)s3)c2)c1 282 | COC(=O)Nc1coc(C(=O)Nc2coc(C(N)=O)c2)c1 283 | O=C(NNC(=S)Nc1ccccc1)C12CC3CC(CC(C3)C1)C2 284 | C=C1CC2C3C=C(Br)C4=CC(=O)CCC4(C)C3CCC2(C)C1(OC(C)=O)C(C)=O 285 | CNc1ccc2c3c(n(CCCNC(=O)OC(C)(C)C)c(=O)c2c1)-c1ccccc1C3O 286 | CCC(N)(CC)CNS(=O)(=O)c1cccc(C(C)=O)c1 287 | COc1cccc(C(=O)NN=Cc2ccc(OC)c3ccccc23)c1 288 | Oc1c(O)c(Cl)c2c(c1Cl)CCN(C(=S)NCc1ccc(Cl)cc1)C2 289 | CC(Cn1cnc2c(N)ncnc21)OCP(=O)(O)NC(CCCNC(=N)N)C(=O)O 290 | Cc1ccccc1C=C1CCc2ccccc2C1=O 291 | CC(C)C1CN(C(=O)c2ccc(O)cn2)CC1N(C)C 292 | CC1(C)CC2CC(C)(CN2C(S)=Nc2ccccc2F)C1 293 | CCOC(=O)C1Oc2ccc(CNC(=O)C3SCCN3C(=O)CC(N)Cc3cc(F)c(F)cc3F)cc2O1 294 | Cc1ccccc1NC(=S)NN=C1CC2C(CCC3CC(O)CCC32C)C2CC3OC4(CCC(C)CO4)C(C)C3C12C 295 | COc1ccc2c3c([nH]c2c1)C(CO)N(C(C)=O)CC31CCN(Cc2ccccc2Cl)CC1 296 | NC(CC(O)CP(=O)(O)O)C(=O)O 297 | CCC(CC)CC1(C)CC(CC)C(CC(=O)OC)OO1 298 | N=C(NO)NN=Cc1c(Cl)cccc1[N+](=O)[O-] 299 | FC(F)(F)c1cccc(-c2nc3ccc(Nc4ccnc5ccccc45)cc3[nH]2)c1 300 | COc1ccc(-c2ccc(C(=O)NCCCCc3cccnc3)cc2OC)cc1OC 301 | CN1C(=O)C(N2CCc3[nH]c(Cc4ccccc4)nc3C2=O)COc2ccccc21 302 | O=C(Nc1ccc(Oc2cc(O)cc(O)c2)c(Cl)c1)c1cc(Cl)cc(Cl)c1O 303 | 
N#Cc1cn(-c2ccc(C(=O)O)cc2)cc1-c1ccccc1OCCCCO 304 | CC12Cc3cnn(-c4ccc(F)cc4)c3C=C1CCN(S(=O)(=O)c1ccc(C(C)(C)C)cc1)C2 305 | COc1ccc(-c2ccc3c(c2)C2CC(N(C)S(=O)(=O)c4ccccc4)C(C(C)O)C(=O)N2CC3)cc1 306 | O=C(CC1CCN(Cc2ccncc2)CC1)N1CCC(n2c(=O)[nH]c3ccccc32)CC1 307 | O=C(NCCCNC(=O)c1cc(-c2ccccc2)on1)c1cccnc1 308 | O=C(NCCCc1nnc2ccccn12)C1CCCN1 309 | CSC1=C(C(=O)O)N2C(=O)C(NC(=O)C(N)c3ccccc3)C2CC1 310 | COc1cc(O)cc2oc(-c3ccc(O)cc3)cc(=O)c12 311 | CC(Nc1nc(N)nc(N)c1Cl)c1nc2cccc(Cl)c2c(=O)n1-c1cc[nH]n1 312 | COCCNC(=O)NC1(c2nc(C)no2)CCCCC1 313 | CCCc1nc(SC)n2c(O)nnc2c1Cc1ccc(-c2ccccc2-c2nn[nH]n2)cc1 314 | CCCCCCCCCC1=C(O)C(=O)C(CCCCC)=C(O)C1=O 315 | Cc1cccc(C(=O)N2CC3OCCN(C(C)C)C3C2)n1 316 | CNC(Cc1ccccc1)C(=O)N1CCCC1C(=O)NC(C)(CCCNC(=N)N)C(=O)c1nc2ccccc2s1 317 | Cc1ccc2[nH]c3c(NCc4ccccc4)ncnc3c2c1 318 | O=C(COC(=O)CCc1ccc(S(=O)(=O)N2CCOCC2)cc1)Nc1ccccc1OC(F)F 319 | CCCCNC(=O)CCn1c(=O)c2ccccc2n(Cc2ccc(F)cc2)c1=O 320 | CC(C)(C)CC(N)CN 321 | CCCCCCCSc1cccc2c1CN(C1CCC(=O)NC1=O)C2=O 322 | O=C(NCCc1ccc(C(F)(F)F)cc1)C1CC(=O)N(c2ccc3c(c2)OCCO3)C1 323 | COC1CCCCC1Nc1cc2c(c3nsnc13)C(=O)c1ccccc1C2=O 324 | CCOC(=O)C1=C(Nc2ccc(C(F)(F)F)cc2)C(=O)N(c2ccc(C(F)(F)F)cc2)C1c1ccc(OC(F)(F)F)cc1 325 | CCOC(=O)C(Cc1ccccc1N)(C(=O)OCC)N1CCN(Cc2ccc(Cl)cc2)CC1 326 | COc1cc(OC)c(Cl)c(-c2ccc3c(NC(=O)c4ccc(N5CCN(C)CC5)cc4)n[nH]c3n2)c1Cl 327 | N#CC1(C(=O)NCC2CCC3(CCNCC3)O2)CC1 328 | O=C(CNC(=O)CNC(=O)c1ccccc1)NCC(=O)Nc1ccc(Oc2cccc(NC(=O)CNC(=O)CNC(=O)CNC(=O)c3ccccc3)c2)cc1 329 | COc1ccc2c(OC3CC4C(=O)NC5(C(=O)NS(=O)(=O)C6(C)CC6)CC5C=CCCCCCNC(=O)N4C3)cc(-c3nc(C(C)C)cs3)nc2c1C 330 | COCC(C)Oc1ccc2[nH]nc(-c3cc(N4CCOCC4)ncn3)c2c1 331 | Cc1ccc(OC(=O)C(Cc2ccccc2)NS(C)(=O)=O)cc1 332 | O=S(=O)(c1cccc(S(=O)(=O)N2CCC(n3nnc4ccccc43)CC2)c1)N1CCCCC1 333 | CN(CCCNc1c2ccccc2nc2cccc([N+](=O)[O-])c12)CCCNc1ccc([N+](=O)[O-])c2[nH]c3ccccc3c(=O)c12 334 | CCNc1nc(NC(C)C)nc(N(C#N)CC)n1 335 | CCc1cn(CS(C)(=O)=O)c(CC)c1Oc1ccc(C#N)cc1 336 | Cc1nnc2ccc(-c3ccc(OC(F)(F)F)cc3)cn12 337 | O=C(O)c1cn2c(ccc3cc([N+](=O)[O-])ccc32)n1 338 | 
CC(c1ccon1)N(C)C(=O)c1cccc(N)n1 339 | CN1CC2CC2(c2cc(Cl)c([N+](=O)[O-])cc2[N+](=O)[O-])C1 340 | CCc1c(C(=O)Nc2ccc(F)c(C(=O)NC)c2)cnn1CC(C)C 341 | COc1cc(C)ccc1OCCSc1nc2ccc(NC(=O)c3ccc(F)cc3)cc2s1 342 | COC1OC(CS(=O)(=O)CCC(C)(C)N(Cl)Cl)C(O)C(O)C1O 343 | O=C(OC1CN2CCC1CC2)c1ccc(Cl)cc1 344 | O=C(Cn1cc(-c2ccc(Cl)c(Cl)c2)nn1)NC12CC3CC(CC(C3)C1)C2 345 | Cc1ccc(NC(=O)Cn2cc(S(=O)(=O)NC(C)C)c(S(=O)(=O)NC(C)C)c2)cc1Cl 346 | O=C(NC1CCCC1OCc1ccccc1)c1cccnc1Oc1ccc(Nc2ccccn2)cc1 347 | Cc1nc(-c2ccc(N(Cc3ccccc3)C(=O)c3ccc(O)cc3O)cc2)c2ccccc2n1 348 | CN(C)CCCCCCNC(=O)c1ccccc1Nc1ccc2c(C=Cc3ccccn3)n[nH]c2c1 349 | COC(=O)C(Oc1cc(Cl)cc(Cl)c1)c1ccc(Oc2ccc(Cl)cc2)cc1 350 | C[N+](C)(C)CCOP(=O)(O)O 351 | O=C(CSc1nnc(COc2ccccc2)o1)N1CCCC1 352 | COC(=O)c1cc(NCc2cccc(Br)c2)ccc1N1CCOCC1 353 | CN(C)CCCOc1ccc(-c2cn3c(-c4ccc5ncccc5c4)cnc3cn2)cn1 354 | COc1cc(Cl)ccc1C1NC(=O)Nc2cc(OC)c(OC)c(OC)c21 355 | O=[N+]([O-])c1ccc(S(=O)(=O)N(CC(O)CN2CCCCC2)c2ccccc2)cc1 356 | O=C(CC1CCC2(CC1)OCCO2)NC1CCC(CCN2CCC(c3noc4cc(Cl)ccc34)CC2)CC1 357 | CC1=NC2(CCC3CN(S(=O)(=O)CC(C)C)CC32)C(=O)N1C 358 | Cn1c(=O)c2c(nc(N3CCCC(N)C3)n2Cc2cc(F)ccc2Cl)c2cc(C(=O)O)ccc21 359 | CCOc1ccccc1N1CCN(CC(=O)C(O)(c2ccccc2)C2CCC2)CC1 360 | CC1(CCNCc2ccc3c(c2)OCCO3)OCCc2sccc21 361 | CC(=NNC(=O)Cc1c(C)n[nH]c1O)c1ccc(Cl)c(Cl)c1 362 | CC(CS)C(=O)N(CC(=O)O)c1ccc(Oc2ccccc2)cc1 363 | O=c1[nH]c(-c2ccc(-c3nnn[nH]3)cc2)c(-c2ccc(-c3nnn[nH]3)cc2)cc1O 364 | Clc1ccc(-c2cc3nc(N4CCCC4)c4ccccc4c3nn2)cc1 365 | CC(C)N1CCc2c(sc(N)c2C#N)C1 366 | CC(=O)C1=C(O)C(=O)N(c2cc(C(C)(C)C)on2)C1c1ccc(C(C)C)cc1 367 | COc1cc(C=NNC(=O)C(OC)c2ccc3c(c2)OCCO3)ccc1F 368 | COc1ccccc1C(=O)c1c(N)nc2ccc(C(=O)c3c(F)cccc3F)cn12 369 | CC(C(=O)NCCNc1c2c(nc3ccccc13)CCCC2)c1ccc(-c2ccc(OCCCCCCO[N+](=O)[O-])cc2)c(F)c1 370 | COc1ccc(OCCOC(=O)C2CCCCN2C(=O)C(=O)C2(O)CCCCC2C)cc1OC 371 | O=C(Cn1nnc(-c2ccc(Cl)cc2)n1)NCc1ccco1 372 | COC(=O)C(c1ccc(Cl)cc1)C1CCCN1 373 | COc1cc(NS(=O)(=O)c2ccccc2)ccc1-c1cncc2ccccc12 374 | CCOC(=O)C1CCCN(c2ncnc3c2oc2ccccc23)C1 375 | 
CC(C)(C)NS(=O)(=O)c1ccccc1-c1ccc(-c2cnc(N)cn2)c(F)c1F 376 | CCOC(=O)N1CCN(C(=O)C(CCC(=O)O)NC(=O)c2cc(-c3cn[nH]c3)nc(-c3ccccc3)n2)CC1 377 | NC(=O)N1CC(Oc2cccc(C(F)(F)F)c2)C1 378 | CNC(=O)c1ccc(CNCc2ccc(SC)c(OC)c2)cc1 379 | CC(C)(C)Sc1c(CC(C)(C)C(=O)O)n(Cc2ccc(-c3cncnc3)cc2)c2ccc(OCc3ccccn3)cc12 380 | NCCC(=O)N1CCOc2ccccc2C1 381 | CCN(CC)CCCNCc1cc2c3ccccc3n(C)c2c(-c2cc(OC)c(OC)c(OC)c2)n1 382 | CC(=O)NCC1(c2cn3c4c(cccc24)CCC3)CCCCC1 383 | CC(CO)n1c(=O)n(C)c2cnc3ccc(-c4c[nH]c5nncc-5c4)nc3c21 384 | Cc1nn(-c2cccc(CN)c2)c2c(F)c(-c3ccc(N4CCCCC4=O)cc3)ccc12 385 | CCc1ccccc1N1CCN(C(=O)c2cc(OC)c(OC)c(OC)c2)CC1 386 | NC(=S)NN=C(c1ccccc1)c1ccccc1F 387 | COC(C)(C)C1CCCN1C1=C(C)C(=O)OC1 388 | COc1ccc2ccc(C(=O)Nc3ccccc3)c(OC(C)C)c2c1 389 | CCCCNC(=O)CCCCCCCCCCOCC1Cc2ccccc2CN1C(=O)c1ccc(OC)cc1 390 | CC(=O)NC1C(OCc2ccccc2)OC(CO)C(O)C1OC(C)C(=O)NC(C(=O)NC(CCC(=O)O)C(N)=O)C(C)C 391 | O=C(NCc1ccn[nH]1)c1[nH]c2ccc(Br)cc2c1S(=O)(=O)N1CCCC1 392 | COc1ncc(-c2cc3c(n2C(C)C)C(c2ccc(Cl)cc2)N(c2ccc(F)c(Cl)c2)C3=O)c(OC)n1 393 | Cc1c(Cl)cc(OCC(F)(F)F)c2nc(CCc3nc(N4CCCC4)nn3C)nn12 394 | CC(C)(C)NCc1c(C(=O)O)nn(-c2ccccc2)c1-c1ccc([N+](=O)[O-])cc1 395 | Cc1cc(C)c(Cn2c(C3CC(=O)N(c4ccc(F)cc4)C3)nc3ccccc32)c(C)c1 396 | Cc1ccc(C(=O)NCC(=O)NCCCc2ccccc2)cc1 397 | COC(=O)C(Cc1c[nH]c2ccccc12)NC(=O)c1ccc(NC(=O)C(N)CC(=O)O)c(OCCc2c[nH]c3ccccc23)c1 398 | CCN(CC)CCNC(=O)c1ccccc1NC(=O)C(=O)Nc1ccccc1C(=O)NCCN(CC)CC 399 | CC1(C)C(=O)C=CC2(C)C1CC(=O)C1(C)C2CCC2(C)C(c3ccoc3)C(=O)C3OC321 400 | CC(C)NC(=O)c1ccc(CC2CCN(C3CCN(C(=O)c4ccccc4Cl)CC3)CC2)cc1 401 | Cc1cc(C)cc(NC(=O)Nc2ccc3nc(N4CCN(C)CC4)cc(C)c3c2)c1 402 | CCNC(=O)Nc1nc2ccc(-c3cccc(C)c3)cc2[nH]1 403 | Cc1ccc2oc(C(=O)NCc3ccccc3)cc(=O)c2c1 404 | CC(CC(=O)NC1CCCC1)=NNC(=O)Cc1csc(N)n1 405 | C=CCN(Cc1ccc(-c2ccccc2-c2nn[nH]n2)cc1)c1nc(C)ncc1C(=O)O 406 | C=C1C(=O)NC(C)C1c1ccc(Br)cc1 407 | C=CCn1c2ccccc2c2c3c(ccc21)CN(Cc1ccc(C)cc1)CO3 408 | OCCN1CCN(C(=S)Nc2ccccc2)CC1 409 | CCCc1nnc2n1N=C(c1ccc(-c3cc(Cl)ccc3Cl)o1)CS2 410 | 
COc1ccc2c(c1)OC1(C)CC2C(C(=O)c2ccccc2OC)C(=O)N1 411 | COc1ccc(C=Cc2ncc([N+](=O)[O-])n2CCOC(=O)c2cccc3c2OCCO3)cc1 412 | Brc1cnc(NCc2cccnc2)nc1Nc1cc(C2CC2)[nH]n1 413 | CC(=O)OCC1OC(CC(=O)C=Cc2ccc(O)cc2)C(OC(C)=O)C(OC(C)=O)C1OC(C)=O 414 | Cc1ccc2nc(O)c(N3CCOCC3)c(-c3ccccc3)c2c1 415 | CCOC(=O)C(C(=O)OCC)=C1NC2(c3ccccc3Nc3ccccc32)C(C#N)S1 416 | CCN(CC(=O)NC)C(=O)c1cc(-c2cccc(Cl)c2)no1 417 | Cc1cc(C)c2c(c1)n(CC(=O)Nc1ccc3c(c1)CC1(C3)C(=O)NC(=O)N1C)c(=O)n2CC(=O)N(C)C 418 | CCOc1ccccc1C(=O)Nc1ccccc1C(=O)Nc1ccccn1 419 | CN(C)c1cc2c(cc1NC(=O)c1ccccc1Cl)n(C)c(=O)n2C 420 | NC(=S)NN=C1C(=O)Nc2ccccc21 421 | O=C(O)Cn1c(=O)c(=O)[nH]c2cc([N+](=O)[O-])c(-n3ccc(CNCCCc4ccccc4)c3)cc21 422 | COc1cc(OC)c(CN2CCCN(C)CC2)cc1Br 423 | NC(N)=NCCc1cccc2c(-c3ccc(C(F)(F)F)cc3)cccc12 424 | CC(C)(C)NC(=O)C(c1cccnc1)N(C(=O)c1ccco1)c1ccc(NS(=O)(=O)c2ccccc2)cc1 425 | CC(=NN)C(C)=NN=C(C)C(C)=NN 426 | CCCCCC(=O)N1CC(n2cc(C3CC3)nn2)C(O)CC1c1ccccc1 427 | COC(=O)N(NC(=O)c1c(CN2CCN(S(=O)(=O)c3ccccc3)CC2)c(-c2ccccc2)nc2ccccc12)c1ccccc1 428 | C=C1CC(C)CC(=O)O1 429 | Cn1ncc(NC(=O)c2nc(-c3c(F)cccc3F)sc2N)c1N1CCNCCC1=O 430 | COc1ccc(-n2cnc3cc(C(=O)N4CCC5(CC4)OCCO5)ccc32)cc1 431 | O=C(O)c1cccc(O)n1 432 | Cc1nc(C)c(-c2ccc3cc(-c4c(C5CCCCC5)c5ccc6cc5n4CC(=O)NCCC=CCS(=O)(=O)NC6=O)ccc3n2)s1 433 | CC1=C(C(=O)Nc2ccc(C)cn2)C(c2cnn(C)c2)C2=C(O)CCCC2=N1 434 | CC1CCN(C(=O)Cn2cc(SCc3ccccc3)c3ccccc32)CC1 435 | CCn1c(NC(=O)C(C)C)c(-c2ccccc2)c(=O)c2ccccc21 436 | CN(c1ccc(C(=O)COC(=O)C2CC2)cc1)S(C)(=O)=O 437 | CCOC(=O)C1CCN(C(=O)CNC(=O)Nc2ccccc2Cl)CC1 438 | CC(OC(=O)Cc1coc2ccc3ccccc3c12)C(=O)Nc1ncc(Cl)cc1Cl 439 | Clc1cc(Cl)c2c(c1)oc1c(Cl)c(Cl)c(Cl)cc12 440 | O=C(Nc1cnccn1)Nc1ccnc2ccc(C(F)(F)F)cc12 441 | CC(CS(=O)(=O)c1ccc(Oc2ccccc2)cc1)(NCc1ccc2ccccc2c1)C(=O)NO 442 | CCc1cc(S(=O)(=O)Nc2cscn2)c(F)cc1Oc1ccc(F)cc1-c1ccnn1C 443 | Clc1ccc2nc3ccccc3c(NCc3nc4ccccc4[nH]3)c2c1 444 | O=C(CCl)Nc1ccc(Cl)cc1NS(=O)(=O)c1cccc(F)c1 445 | O=C(c1ccc(F)cc1)n1nnc2ccccc21 446 | C=COc1c(OC)c(OC)cc2c1-c1ccc(OC)c(=O)cc1C(NC(C)=O)CC2 447 | 
O=S(=O)(O)c1ccc2c(S(=O)(=O)O)cc(S(=O)(=O)O)cc2c1 448 | COc1ccc(N(C=Nc2ccn(C3CSC(CO)O3)c(=O)n2)c2ccc(OC)cc2)cc1 449 | CSc1nc(-c2ccc(P(=O)(O)O)o2)c(CC(C)C)s1 450 | OCC(O)CN1C(CCc2ccc3c(c2)OCO3)CCCC1CCc1ccc2c(c1)OCO2 451 | COC(=O)C(Cc1ccccc1)NC(=O)C1CC(=O)N1C(Cc1ccccc1)C(=O)NCC(C)C 452 | CCOP(=O)(OCC)OC(=NN=C1C(=O)Nc2ccccc21)c1ccccc1P(=O)(OCC)OCC 453 | Cc1n[nH]c2nccc(-c3ccc(NC(=O)NCc4ccccn4)cc3)c12 454 | CCCCCCCCCCCCCCCCP(=O)(OC)OC 455 | Cc1ncn2c1Cn1cc(CO)nc1-c1cc(Br)ccc1-2 456 | O=c1[nH]c(N2CCOCC2)nc(NC2CCCNC2)c1-c1nc(-c2cccc(C(F)(F)F)c2)cs1 457 | CC1(C)OCC(=O)Nc2ccc(-c3cccc(F)c3)cc21 458 | CC(C)=CCc1cc2c(=O)c(-c3ccc(O)cc3O)coc2c(CC=C(C)C)c1O 459 | CCNCC(=O)Nc1c(Cc2nccc3ccccc23)ccc(OC)c1OC 460 | O=C(CCCN1CCC2(c3ccccc3)CC1Cc1ccccc12)c1ccc(F)cc1 461 | CCCCc1nc(C)c(CC(=O)OC2CCCCC2)c(=O)n1Cc1ccc(-c2ccccc2-c2noc(=O)[nH]2)cc1 462 | C=C1C(O)CC(=CC=C2CCCC3(C)C2CCC3C(C)C(=C)CCCC(C)(C)O)CC1O 463 | Cn1c(=O)c(Oc2ccc(F)cc2F)cc2cnc(N3CCCC(CO)C3)nc21 464 | O=C1c2ccccc2C(=O)N1CCn1cncn1 465 | C=CCC12CC(C(=O)OC)N3C1N(C(=N)C3(Cc1ccccc1)N1CCCC1)c1ccc(Br)cc12 466 | CC(C)N1CCc2ncn(C)c2C1C(=O)NCCN1CCCC1 467 | O=C(COC(=O)c1cccc2c(=O)c3ccccc3[nH]c12)Nc1ccc(C(=O)O)cc1 468 | COc1cccc(N2CCN(Cc3coc(-c4cccc5ccccc45)n3)CC2)c1 469 | COC(=O)c1cncc(N)n1 470 | CC1(C(=O)N2CCN(c3ccc([N+](=O)[O-])cc3)CC2)CC1(Cl)Cl 471 | O=C(O)COc1ccc(-c2nocc3c(C(=O)c4ccccc4F)ccc2-3)cc1 472 | COc1cc(NC(=S)NC(=O)c2ccccc2F)ccc1NC(=O)c1cccs1 473 | COc1ccc(C(=O)N2CCN(Cc3ccc(F)cc3)CC2)cc1 474 | C=C1C(C(C)C2CC=C(C)C(=O)O2)CCC2(C)C1CC1(O)C=C3C=CC(=O)OC(C)(C)C3C(OC(C)=O)CC12 475 | Cn1cc(NS(=O)(=O)CCOc2ccc(F)cc2)cn1 476 | COc1cc(=O)n(-c2ccc(Oc3ccnc(N)c3C#CCN(C)C)c(F)c2)cc1C(=O)NCc1ccc(F)cc1 477 | COC(=O)C(C)C1CCC(C)(CCC2=C(C)C(=O)CCC2(C)C)OO1 478 | CCCCNC(=O)N(O)C1N(N=Cc2ccccc2F)C(=S)SC1(C)C 479 | N#Cc1cccc(C=Cc2c(Cl)nc(N)nc2NC2CC(CO)C(O)C2O)c1 480 | COc1cc(-c2nc(C)nc3[nH]cc(F)c23)c(Cl)cc1Cl 481 | O=C(Nc1ccc2c(c1)OCO2)C1Cc2c(-c3ccc(Cl)cc3)ccnc2O1 482 | Cc1ccc(OCC(=O)Nc2nnc(S(=O)(=O)N3CCc4ccccc43)s2)cc1 483 | 
O=C1OC2(CCCc3ccccc32)CC(O)=C1Sc1ccccc1Cl 484 | Nc1nc(SC2CCCC2)c2ncn(C=C3CC3(CO)CO)c2n1 485 | Cc1cccn2c(=O)c(C(=O)N3CCN(c4ccc(Cl)cc4)CC3)cnc12 486 | COC(=O)Cc1ccc(C2C(CCCc3ccccc3)C(=O)N2c2ccc(F)cc2)cc1 487 | Cc1ccc(C(=O)N2CCN(c3ccccn3)CC2)cc1C 488 | NC(=O)N1CCC(Nc2ncnc3ccc(-c4cncs4)cc23)CC1 489 | COc1cc(C(=O)c2c[nH]c3cccc(OC)c23)c(N)c(OC)c1OC 490 | C=CCc1cc(C=C2CN(S(=O)(=O)CCC)CC(=Cc3ccc(O)c(CC=C)c3)C2=O)ccc1O 491 | Cc1cn(C2CC(n3nncc3-c3ccccc3)C(CO[Si](C)(C)C(C)(C)C)O2)c(=O)[nH]c1=O 492 | c1cc(-c2cc3cc(C4=NCCN4)ccc3[nH]2)ccc1C1=NCCN1 493 | Oc1cc2c(cc1O)-c1[nH]c3ccccc3c1C2 494 | Cc1csc(NN=C(C=Cc2ccc(F)c(C)c2)c2nc3ccccc3[nH]2)n1 495 | O=C(O)c1cn(Cc2ccc(-c3cncnc3)cc2)c2c(F)cccc2c1=O 496 | O=C(O)C1=C(C(=O)Nc2sc(C3CC3)cc2-c2nc(C3CC3)no2)CCC1 497 | CC(C)n1cnnc1C1CCCN(C(=O)COCc2ccccc2)C1 498 | COC(=O)c1ccc(CNC(=O)c2ccc(Cl)s2)c(NC(=O)c2nc3c(s2)CN(C)CC3)c1 499 | Cc1cc(C)c2c(c1)c1nnc(SCCN3CCCCC3)nc1n2C 500 | CCC(NC(=O)c1c(Br)c(-c2ccccc2)nc2ccccc12)c1ccccc1 501 | --------------------------------------------------------------------------------