├── .here
├── data
    ├── raw
    │   └── .gitkeep
    ├── external
    │   └── .gitkeep
    ├── interim
    │   └── .gitkeep
    └── processed
    │   └── .gitkeep
├── models
    └── .gitkeep
├── reports
    └── .gitkeep
├── tests
    ├── __init__.py
    ├── models
    │   ├── __init__.py
    │   └── test_rnn_generators.py
    ├── test_utils.py
    ├── data
    │   ├── test_smiles_voc.txt
    │   ├── test_selfies_voc.txt
    │   └── test.smi
    └── test_tokenizers.py
├── experiments
    ├── .gitkeep
    ├── seh_frag.py
    └── train_lstm_prior.py
├── references
    └── .gitkeep
├── rxitect
    ├── __init__.py
    ├── utils
    │   ├── __init__.py
    │   ├── transforms.py
    │   ├── multiprocessing_proxy.py
    │   └── metrics.py
    ├── trainers
    │   ├── __init__.py
    │   └── gfn_trainer.py
    ├── algorithms
    │   ├── __init__.py
    │   ├── gfn_algorithm.py
    │   └── trajectory_balance.py
    ├── tasks
    │   ├── __init__.py
    │   ├── gfn_task.py
    │   └── original_task.py
    ├── envs
    │   ├── __init__.py
    │   ├── contexts
    │   │   ├── __init__.py
    │   │   ├── fragment_env_context.py
    │   │   └── graph_env_context.py
    │   └── fragment_env.py
    ├── models
    │   ├── __init__.py
    │   ├── gflownet.py
    │   ├── transformers.py
    │   ├── lstm_generator.py
    │   └── bengio2021flow.py
    ├── data
    │   ├── __init__.py
    │   ├── datamodules.py
    │   ├── datasets.py
    │   └── iterators.py
    ├── utils.py
    └── tokenizers.py
├── docs
    ├── source
    │   ├── modules.rst
    │   ├── tests.models.rst
    │   ├── index.rst
    │   ├── rxitect.envs.contexts.rst
    │   ├── rxitect.envs.rst
    │   ├── rxitect.data.rst
    │   ├── rxitect.models.rst
    │   ├── tests.rst
    │   ├── rxitect.rst
    │   └── conf.py
    ├── Makefile
    └── make.bat
├── environment.yml
├── pyproject.toml
├── LICENSE
├── README.rst
└── .gitignore


/.here:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/raw/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/models/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/reports/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/external/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/interim/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/processed/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/experiments/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/experiments/seh_frag.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/references/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/rxitect/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/rxitect/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/models/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/rxitect/trainers/__init__.py:
--------------------------------------------------------------------------------
1 | from rxitect.trainers.gfn_trainer import GFNTrainer
2 | 


--------------------------------------------------------------------------------
/rxitect/algorithms/__init__.py:
--------------------------------------------------------------------------------
1 | from rxitect.algorithms.trajectory_balance import TrajectoryBalance
2 | 


--------------------------------------------------------------------------------
/rxitect/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | from rxitect.tasks.gfn_task import FlatRewards, GFNTask, ScalarReward
2 | 


--------------------------------------------------------------------------------
/docs/source/modules.rst:
--------------------------------------------------------------------------------
1 | final
2 | =====
3 | 
4 | .. toctree::
5 |    :maxdepth: 4
6 | 
7 |    rxitect
8 |    tests
9 | 


--------------------------------------------------------------------------------
/rxitect/envs/__init__.py:
--------------------------------------------------------------------------------
1 | from rxitect.envs.contexts.fragment_env_context import FragmentEnvContext
2 | from rxitect.envs.fragment_env import FragmentEnv
3 | 


--------------------------------------------------------------------------------
/rxitect/models/__init__.py:
--------------------------------------------------------------------------------
1 | """Sub-package containing implementations of various Molecular Generator models"""
2 | from rxitect.models.gflownet import FragmentBasedGFN
3 | from rxitect.models.lstm_generator import LSTMGenerator
4 | 


--------------------------------------------------------------------------------
/rxitect/data/__init__.py:
--------------------------------------------------------------------------------
1 | """Sub-package containing implementations of various datasets for each representation type"""
2 | from rxitect.data.datamodules import SmilesDataModule
3 | from rxitect.data.datasets import SelfiesDataset, SmilesDataset
4 | 


--------------------------------------------------------------------------------
/rxitect/envs/contexts/__init__.py:
--------------------------------------------------------------------------------
1 | from rxitect.envs.contexts.graph_env_context import (Action, ActionCategorical,
2 |                                                      ActionIndex, ActionType,
3 |                                                      Graph, GraphEnvContext,
4 |                                                      StateActionPair)
5 | 


--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from rxitect import utils
 4 | 
 5 | 
 6 | def test_that_is_valid_smiles_works_as_expected():
 7 |     valid_smiles = "O=C(Nc1cccc(F)c1)N1CCN(c2ccnc(Cl)n2)CC1"
 8 |     invalid_smiles = "O=C(Nc1cccc(F)c1)N1CCN(c2cDnc(Cl)n2)CC1"
 9 |     assert utils.is_valid_smiles(valid_smiles)
10 |     assert not utils.is_valid_smiles(invalid_smiles)
11 | 


--------------------------------------------------------------------------------
/docs/source/tests.models.rst:
--------------------------------------------------------------------------------
 1 | tests.models package
 2 | ====================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | tests.models.test\_rnn\_generators module
 8 | -----------------------------------------
 9 | 
10 | .. automodule:: tests.models.test_rnn_generators
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | Module contents
16 | ---------------
17 | 
18 | .. automodule:: tests.models
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 


--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
 1 | name: rx
 2 | 
 3 | channels:
 4 |   - pyg
 5 |   - pytorch
 6 |   - gpytorch
 7 |   - conda-forge
 8 |   - defaults
 9 | 
10 | dependencies:
11 |   - python=3.9
12 |   - pytorch=1.11
13 |   - botorch=0.7.2
14 |   - cvxopt=1.3.0
15 |   - torchmetrics=0.9.2
16 |   - pyg=2.0.4
17 |   - cudatoolkit=11.3
18 |   - rdkit=2022.03.04
19 |   - poetry>=1.1.4,<2.0
20 |   - pip>=20.0
21 |   - h5py=3.7.0
22 |   - pip:
23 |     - https://github.com/MolecularAI/aizynthfinder/archive/v3.4.0.tar.gz
24 | 


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | .. Rxitect documentation master file, created by
 2 |    sphinx-quickstart on Mon Oct  3 23:40:20 2022.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | Welcome to Rxitect's documentation!
 7 | ===================================
 8 | 
 9 | .. toctree::
10 |    :maxdepth: 2
11 |    :caption: Contents:
12 | 
 13 |    modules
14 | 
15 | Indices and tables
16 | ==================
17 | 
18 | * :ref:`genindex`
19 | * :ref:`modindex`
20 | * :ref:`search`
21 | 


--------------------------------------------------------------------------------
/docs/source/rxitect.envs.contexts.rst:
--------------------------------------------------------------------------------
 1 | rxitect.envs.contexts package
 2 | =============================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | rxitect.envs.contexts.graph\_env\_context module
 8 | ------------------------------------------------
 9 | 
10 | .. automodule:: rxitect.envs.contexts.graph_env_context
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | Module contents
16 | ---------------
17 | 
18 | .. automodule:: rxitect.envs.contexts
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 


--------------------------------------------------------------------------------
/docs/source/rxitect.envs.rst:
--------------------------------------------------------------------------------
 1 | rxitect.envs package
 2 | ====================
 3 | 
 4 | Subpackages
 5 | -----------
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 4
 9 | 
10 |    rxitect.envs.contexts
11 | 
12 | Submodules
13 | ----------
14 | 
15 | rxitect.envs.fragment\_env module
16 | ---------------------------------
17 | 
18 | .. automodule:: rxitect.envs.fragment_env
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
23 | Module contents
24 | ---------------
25 | 
26 | .. automodule:: rxitect.envs
27 |    :members:
28 |    :undoc-members:
29 |    :show-inheritance:
30 | 


--------------------------------------------------------------------------------
/docs/source/rxitect.data.rst:
--------------------------------------------------------------------------------
 1 | rxitect.data package
 2 | ====================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | rxitect.data.datamodules module
 8 | -------------------------------
 9 | 
10 | .. automodule:: rxitect.data.datamodules
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | rxitect.data.datasets module
16 | ----------------------------
17 | 
18 | .. automodule:: rxitect.data.datasets
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
23 | Module contents
24 | ---------------
25 | 
26 | .. automodule:: rxitect.data
27 |    :members:
28 |    :undoc-members:
29 |    :show-inheritance:
30 | 


--------------------------------------------------------------------------------
/docs/source/rxitect.models.rst:
--------------------------------------------------------------------------------
 1 | rxitect.models package
 2 | ======================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | rxitect.models.gflownet module
 8 | ------------------------------
 9 | 
10 | .. automodule:: rxitect.models.gflownet
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | rxitect.models.lstm\_generator module
16 | -------------------------------------
17 | 
18 | .. automodule:: rxitect.models.lstm_generator
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
23 | Module contents
24 | ---------------
25 | 
26 | .. automodule:: rxitect.models
27 |    :members:
28 |    :undoc-members:
29 |    :show-inheritance:
30 | 


--------------------------------------------------------------------------------
/docs/source/tests.rst:
--------------------------------------------------------------------------------
 1 | tests package
 2 | =============
 3 | 
 4 | Subpackages
 5 | -----------
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 4
 9 | 
10 |    tests.models
11 | 
12 | Submodules
13 | ----------
14 | 
15 | tests.test\_tokenizers module
16 | -----------------------------
17 | 
18 | .. automodule:: tests.test_tokenizers
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
23 | tests.test\_utils module
24 | ------------------------
25 | 
26 | .. automodule:: tests.test_utils
27 |    :members:
28 |    :undoc-members:
29 |    :show-inheritance:
30 | 
31 | Module contents
32 | ---------------
33 | 
34 | .. automodule:: tests
35 |    :members:
36 |    :undoc-members:
37 |    :show-inheritance:
38 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/source/rxitect.rst:
--------------------------------------------------------------------------------
 1 | rxitect package
 2 | ===============
 3 | 
 4 | Subpackages
 5 | -----------
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 4
 9 | 
10 |    rxitect.data
11 |    rxitect.envs
12 |    rxitect.models
13 | 
14 | Submodules
15 | ----------
16 | 
17 | rxitect.tokenizers module
18 | -------------------------
19 | 
20 | .. automodule:: rxitect.tokenizers
21 |    :members:
22 |    :undoc-members:
23 |    :show-inheritance:
24 | 
25 | rxitect.utils module
26 | --------------------
27 | 
28 | .. automodule:: rxitect.utils
29 |    :members:
30 |    :undoc-members:
31 |    :show-inheritance:
32 | 
33 | Module contents
34 | ---------------
35 | 
36 | .. automodule:: rxitect
37 |    :members:
38 |    :undoc-members:
39 |    :show-inheritance:
40 | 


--------------------------------------------------------------------------------
/tests/data/test_smiles_voc.txt:
--------------------------------------------------------------------------------
 1 | (
 2 | )
 3 | [OH+]
 4 | 4
 5 | [I+]
 6 | O
 7 | [IH]
 8 | [BH2-]
 9 | [As+]
10 | [CH]
11 | L
12 | [te+]
13 | F
14 | 6
15 | [O]
16 | [te]
17 | [S+]
18 | [se+]
19 | [S-]
20 | [C+]
21 | I
22 | 1
23 | [TeH]
24 | 7
25 | [SeH]
26 | -
27 | b
28 | [c-]
29 | 0
30 | [BH3-]
31 | [Se]
32 | s
33 | [B-]
34 | [nH+]
35 | [c+]
36 | [SiH2]
37 | =
38 | B
39 | N
40 | [PH]
41 | [BH-]
42 | 9
43 | [CH2-]
44 | c
45 | p
46 | [o+]
47 | [SH+]
48 | 2
49 | [CH2]
50 | #
51 | [NH+]
52 | [s+]
53 | o
54 | [O-]
55 | 8
56 | n
57 | [N+]
58 | [Te]
59 | [SH2]
60 | [n-]
61 | [P+]
62 | [As]
63 | 3
64 | [NH2+]
65 | [N-]
66 | [Si]
67 | [SiH]
68 | [C-]
69 | C
70 | 5
71 | [cH-]
72 | P
73 | %
74 | [O+]
75 | [SH]
76 | [NH-]
77 | S
78 | [Se+]
79 | [b-]
80 | R
81 | [se]
82 | [nH]
83 | [n+]


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "rxitect"
 3 | version = "0.1.0"
 4 | description = "Code for Julius Cathalina's M.Sc. Thesis"
 5 | authors = ["Julius Cathalina <j.e.j.cathalina@vuw.leidenuniv.nl>"]
 6 | 
 7 | [tool.poetry.dependencies]
 8 | python = "^3.9"
 9 | seaborn = "^0.11.2"
10 | hydra-core = "^1.2.0"
11 | hydra-colorlog = "^1.2.0"
12 | dvc = "^2.17.0"
13 | wandb = "^0.13.1"
14 | graphviz = "^0.20.1"
15 | pyprojroot = "^0.2.0"
16 | selfies = "^2.1.1"
17 | jsonargparse = {extras = ["signatures"], version = "^4.14.0"}
18 | 
19 | [tool.poetry.dev-dependencies]
20 | pytest = "^7.1.2"
21 | jupyterlab = "^3.4.4"
22 | ipywidgets = "^7.7.1"
23 | black = "^22.6.0"
24 | isort = "^5.10.1"
25 | Sphinx = "^5.1.1"
26 | 
27 | [build-system]
28 | requires = ["poetry-core>=1.0.0"]
29 | build-backend = "poetry.core.masonry.api"
30 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | 	echo.
16 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | 	echo.installed, then set the SPHINXBUILD environment variable to point
18 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | 	echo.may add the Sphinx directory to PATH.
20 | 	echo.
21 | 	echo.If you don't have Sphinx installed, grab it from
22 | 	echo.https://www.sphinx-doc.org/
23 | 	exit /b 1
24 | )
25 | 
26 | if "%1" == "" goto help
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/tests/data/test_selfies_voc.txt:
--------------------------------------------------------------------------------
 1 | [F]
 2 | [Te]
 3 | [P+1]
 4 | [=Ring1]
 5 | [CH2]
 6 | [=NH1+1]
 7 | [NH1-1]
 8 | [=N+1]
 9 | [=NH2+1]
10 | [Ring1]
11 | [SH1]
12 | [=Branch2]
13 | [=Ring2]
14 | [=Te+1]
15 | [C]
16 | [CH1-1]
17 | [SiH2]
18 | [=O]
19 | [=B-1]
20 | [S-1]
21 | [=N]
22 | [SH2]
23 | [B-1]
24 | [Branch2]
25 | [=Se+1]
26 | [O]
27 | [Ring2]
28 | [#C]
29 | [OH0]
30 | [=S]
31 | [#N]
32 | [#N+1]
33 | [BH3-1]
34 | [P]
35 | [As]
36 | [Cl]
37 | [BH2-1]
38 | [=C]
39 | [Se]
40 | [PH1]
41 | [S+1]
42 | [C-1]
43 | [=B]
44 | [Se+1]
45 | [=Se]
46 | [SiH1]
47 | [=N-1]
48 | [N+1]
49 | [Branch1]
50 | [I+1]
51 | [N-1]
52 | [SH1+1]
53 | [CH1]
54 | [=Branch1]
55 | [#S]
56 | [O-1]
57 | [O+1]
58 | [OH1+1]
59 | [#Branch1]
60 | [#C-1]
61 | [N]
62 | [NH1]
63 | [=As]
64 | [#Branch2]
65 | [=O+1]
66 | [S]
67 | [B]
68 | [NH1+1]
69 | [C+1]
70 | [CH2-1]
71 | [BH1-1]
72 | [=PH1]
73 | [SeH1]
74 | [Br]
75 | [NH2+1]
76 | [=SH1]
77 | [=P]
78 | [=S+1]
79 | [=C-1]
80 | [I]
81 | [TeH1]
82 | [As+1]
83 | [Si]
84 | 


--------------------------------------------------------------------------------
/rxitect/utils/transforms.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch import Tensor
 3 | 
 4 | 
 5 | def thermometer(
 6 |     v: Tensor, n_bins: int = 50, vmin: float = 0.0, vmax: float = 1.0
 7 | ) -> Tensor:
 8 |     """Thermometer encoding of a scalar quantity.
 9 |     Parameters
10 |     ----------
11 |     v: Tensor
12 |         Value(s) to encode. Can be any shape
13 |     n_bins: int
14 |         The number of dimensions to encode the values into
15 |     vmin: float
16 |         The smallest value, below which the encoding is equal to torch.zeros(n_bins)
17 |     vmax: float
18 |         The largest value, beyond which the encoding is equal to torch.ones(n_bins)
19 |     Returns
20 |     -------
21 |     encoding: Tensor
22 |         The encoded values, shape: `v.shape + (n_bins,)`
23 |     """
24 |     bins = torch.linspace(vmin, vmax, n_bins)
25 |     gap = bins[1] - bins[0]
26 |     return (v[..., None] - bins.reshape((1,) * v.ndim + (-1,))).clamp(
27 |         0, gap.item()
28 |     ) / gap
29 | 


--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # For the full list of built-in configuration values, see the documentation:
 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 5 | 
 6 | # -- Project information -----------------------------------------------------
 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 8 | 
 9 | project = "Rxitect"
10 | copyright = "2022, Julius Cathalina"
11 | author = "Julius Cathalina"
12 | 
13 | # -- General configuration ---------------------------------------------------
14 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
15 | 
16 | extensions = ["sphinx.ext.napoleon"]
17 | 
18 | templates_path = ["_templates"]
19 | exclude_patterns = []
20 | 
21 | 
22 | # -- Options for HTML output -------------------------------------------------
23 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
24 | 
25 | html_theme = "alabaster"
26 | html_static_path = ["_static"]
27 | 


--------------------------------------------------------------------------------
/experiments/train_lstm_prior.py:
--------------------------------------------------------------------------------
 1 | import pytorch_lightning as pl
 2 | from pyprojroot import here
 3 | from pytorch_lightning.profilers import AdvancedProfiler
 4 | 
 5 | from rxitect.data import SmilesDataModule
 6 | from rxitect.models import LSTMGenerator
 7 | 
 8 | # from pytorch_lightning.cli import LightningCLI
 9 | 
10 | 
if __name__ == "__main__":
    # Script entry point: build the LSTM generator and its SMILES datamodule
    # from files under <project root>/data/processed, then fit with a
    # profiler attached.
    # cli = LightningCLI(LSTMGenerator)
    lr = 1e-3  # NOTE(review): never passed to the model or trainer below — confirm intent
    epochs = 5

    vocabulary_path = here() / "data/processed/chembl_v30_smi_voc.txt"
    net = LSTMGenerator(vocabulary_filepath=vocabulary_path)

    # The datamodule reuses the tokenizer owned by the model.
    dataset_path = here() / "data/processed/chembl_v30_clean.smi"
    dm = SmilesDataModule(
        dataset_filepath=dataset_path,
        tokenizer=net.tokenizer,
        num_workers=4,
    )

    # Profiler output goes to <project root>/logs under the given filename.
    log_dir = here() / "logs"
    profiler = AdvancedProfiler(dirpath=log_dir, filename="perf_logs_lstm")

    trainer_options = dict(
        accelerator="gpu",
        devices=1,
        max_epochs=epochs,
        profiler=profiler,
        check_val_every_n_epoch=1,
    )
    trainer = pl.Trainer(**trainer_options)
    trainer.fit(net, datamodule=dm)
35 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Recursion Pharmaceuticals
 4 | Copyright (c) 2022 Julius Cathalina
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | 


--------------------------------------------------------------------------------
/tests/models/test_rnn_generators.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import torch
 3 | import torch.nn as nn
 4 | from pyprojroot import here
 5 | from torch.utils.data import DataLoader
 6 | 
 7 | from rxitect.data import SelfiesDataset, SmilesDataset
 8 | from rxitect.models import LSTMGenerator
 9 | from rxitect.tokenizers import SelfiesTokenizer, SmilesTokenizer, get_tokenizer
10 | 
11 | 
@pytest.fixture()
def smiles_tokenizer() -> SmilesTokenizer:
    """Provide a SMILES tokenizer built from the vocabulary file in tests/data."""
    # NOTE(review): the meaning of the third positional argument (100) is not
    # visible here — presumably a maximum length; confirm against get_tokenizer.
    return get_tokenizer("smiles", here() / "tests/data/test_smiles_voc.txt", 100)
17 | 
18 | 
@pytest.fixture()
def smiles_dataloader(smiles_tokenizer: SmilesTokenizer) -> DataLoader:
    """Provide a shuffled ``DataLoader`` over the SMILES file in tests/data."""
    smiles_dataset = SmilesDataset(
        dataset_filepath=here() / "tests/data/test.smi",
        tokenizer=smiles_tokenizer,
    )
    loader_options = dict(
        batch_size=128,
        num_workers=1,
        shuffle=True,
        pin_memory=True,
        # Batches are assembled by the dataset class's own collate function.
        collate_fn=SmilesDataset.collate_fn,
    )
    return DataLoader(dataset=smiles_dataset, **loader_options)
34 | 
35 | 
def test_dataloader_loads_dataset_in_properly(smiles_dataloader: DataLoader):
    """The dataset behind the fixture's loader must contain exactly 500 entries."""
    n_entries = len(smiles_dataloader.dataset)
    assert n_entries == 500
39 | 


--------------------------------------------------------------------------------
/tests/test_tokenizers.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from pyprojroot import here
 3 | 
 4 | from rxitect.tokenizers import SelfiesTokenizer, SmilesTokenizer, get_tokenizer
 5 | 
 6 | 
 7 | @pytest.fixture
 8 | def smiles_tokenizer() -> SmilesTokenizer:
 9 |     test_vocabulary_filepath = here() / "tests/data/test_smiles_voc.txt"
10 |     smiles_tokenizer = get_tokenizer("smiles", test_vocabulary_filepath, 100)
11 |     return smiles_tokenizer
12 | 
13 | 
@pytest.fixture
def selfies_tokenizer() -> SelfiesTokenizer:
    """Provide a SELFIES tokenizer built from the vocabulary file in tests/data."""
    # NOTE(review): the meaning of the third positional argument (100) is not
    # visible here — presumably a maximum length; confirm against get_tokenizer.
    vocabulary_path = here() / "tests/data/test_selfies_voc.txt"
    return get_tokenizer("selfies", vocabulary_path, 100)
19 | 
20 | 
def test_decoding_encoded_smiles_reconstructs_smiles_correctly(smiles_tokenizer):
    """Encoding a SMILES string and decoding the result must return the input."""
    original = "CCBr[nH]"
    round_tripped = smiles_tokenizer.decode(smiles_tokenizer.encode(original))

    assert round_tripped == original
28 | 
29 | 
def test_decoding_encoded_selfies_reconstructs_selfies_correctly(selfies_tokenizer):
    """Encoding a SELFIES string and decoding the result must return the input."""
    original = "[C][C][Br][NH1]"
    round_tripped = selfies_tokenizer.decode(selfies_tokenizer.encode(original))

    assert round_tripped == original
37 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | =======
 2 | Rxitect
 3 | =======
 4 | ------------------------------------------------------------------------------------
 5 | A de-novo drug design library for creating retrosynthesis-aware reward-driven models
 6 | ------------------------------------------------------------------------------------
 7 | 
 8 | Introduction
 9 | ============
10 | 
11 | This library was made for my M.Sc. thesis research with the aim of understanding
12 | how computational chemists can incorporate synthesis planning into de-novo drug design
13 | systems. Many molecule generators propose interesting but impractical molecules, which is why we need
14 | to design them with synthesizability in mind. Modern Computer-Assisted Synthesis Planning (CASP) tools are quite powerful
15 | but are of limited use in algorithms that need to call said tools many times (e.g., more than 100,000 calls)
16 | due to the time it takes to solve a single molecule on average. This research aims to
17 | create a useful proxy that is cheap to call yet robust. Using a variety of techniques
18 | that are known to be effective in searching the vast molecular search space, such as Reinforcement Learning (RL)
19 | and Generative Flow Networks (GFlowNets), we can then experimentally test whether these proxies help to propose more
20 | practical and synthesizable molecules.
21 | 
22 | Quickstart
23 | ----------
24 | Run the following commands to get up and running::
25 | 
26 |     conda env create -f environment.yml
27 |     # alternatively you can use mamba, which I recommend
28 |     conda activate rx
29 |     poetry install
30 | 
31 | 
32 | Examples
33 | --------
34 | Coming Soon!
35 | 


--------------------------------------------------------------------------------
/rxitect/tasks/gfn_task.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | from abc import ABC, abstractmethod
 4 | from typing import Any, Dict, List, NewType, Tuple, Union
 5 | 
 6 | import torch
 7 | from rdkit.Chem.rdchem import Mol
 8 | from torch import nn
 9 | 
10 | # This type represents an unprocessed list of reward signals/conditioning information
11 | FlatRewards = NewType("FlatRewards", torch.Tensor)  # type: ignore
12 | 
13 | # This type represents the outcome for a multi-objective task of
14 | # converting FlatRewards to a scalar, e.g. (sum R_i omega_i) ** beta
15 | ScalarReward = NewType("ScalarReward", torch.Tensor)  # type: ignore
16 | 
17 | 
class GFNTask(ABC):
    """Abstract interface for a task; every method below must be overridden.

    The class holds no state and no logic of its own: it only fixes the method
    names and signatures that concrete tasks have to provide.
    """

    @abstractmethod
    def flat_reward_transform(self, y: Union[float, torch.Tensor]) -> FlatRewards:
        """Turn a raw value ``y`` (float or tensor) into ``FlatRewards``."""

    @abstractmethod
    def inverse_flat_reward_transform(
        self, rp: FlatRewards
    ) -> Union[float, torch.Tensor]:
        """Turn ``FlatRewards`` back into a float or tensor.

        NOTE(review): presumably the inverse of ``flat_reward_transform`` —
        confirm against the concrete tasks.
        """

    @abstractmethod
    def _load_task_models(self) -> Dict[str, nn.Module]:
        """Return the task's models as a mapping from name to ``nn.Module``."""

    @abstractmethod
    def sample_conditional_information(self, n: int) -> Dict[str, Any]:
        """Sample conditional information.

        Parameters
        ----------
        n: int
            Size of the random sample.

        Returns
        -------
        cond_info: Dict[str, Any]
            Dictionary containing conditional information.
        """

    @abstractmethod
    def cond_info_to_reward(
        self, cond_info: Dict[str, torch.Tensor], flat_rewards: FlatRewards
    ) -> ScalarReward:
        """Combine a minibatch of reward signal vectors and conditional information into a scalar reward.

        Parameters
        ----------
        cond_info: Dict[str, Tensor]
            A dictionary with various conditional information (e.g. temperature)
        flat_rewards: FlatRewards
            A 2d tensor where each row represents a series of flat rewards.

        Returns
        -------
        reward: ScalarReward
            A 1d tensor, a scalar reward for each minibatch entry.
        """

    @abstractmethod
    def compute_flat_rewards(self, mols: List[Mol]) -> Tuple[FlatRewards, torch.Tensor]:
        """Compute the flat rewards of ``mols`` according to the task's proxies.

        Parameters
        ----------
        mols: List[Mol]
            A list of RDKit molecules.

        Returns
        -------
        reward: FlatRewards
            A 2d tensor, a vector of scalar rewards for each valid molecule.
        is_valid: Tensor
            A 1d tensor, a boolean indicating whether the molecule is valid.
        """
79 | 


--------------------------------------------------------------------------------
/rxitect/data/datamodules.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | from typing import Optional, Tuple
 3 | 
 4 | import torch
 5 | from pytorch_lightning import LightningDataModule
 6 | from pytorch_lightning.utilities.types import (EVAL_DATALOADERS,
 7 |                                                TRAIN_DATALOADERS)
 8 | from torch.utils.data import DataLoader, random_split
 9 | 
10 | from rxitect.data.datasets import SmilesDataset
11 | from rxitect.tokenizers import SmilesTokenizer
12 | 
13 | logger = logging.getLogger(__name__)
14 | 
15 | 
class SmilesDataModule(LightningDataModule):
    """Lightning data module that serves a ``SmilesDataset`` as train/val/test splits.

    Parameters
    ----------
    dataset_filepath: str
        Path handed to ``SmilesDataset``.
    tokenizer
        Tokenizer handed to ``SmilesDataset``.
        NOTE(review): the default is the ``SmilesTokenizer`` class itself, not
        an instance -- confirm that ``SmilesDataset`` can work with that.
    train_val_test_split: Tuple[int, int, int]
        Lengths of the three splits, passed to ``random_split``.
    batch_size: int
        Batch size of every dataloader.
    num_workers: int
        Number of dataloader worker processes.
    num_partitions: Optional[int]
        Stored on the instance; no method of this class reads it.
    pin_memory: bool
        Forwarded to every dataloader.
    random_state: int
        Seed of the generator that drives the split.
    """

    def __init__(
        self,
        dataset_filepath: str,
        tokenizer=SmilesTokenizer,
        train_val_test_split: Tuple[int, int, int] = (1_500_000, 185_000, 186_227),
        batch_size: int = 128,
        num_workers: int = 0,
        num_partitions: Optional[int] = None,
        pin_memory: bool = False,
        random_state: int = 42,
    ) -> None:
        super().__init__()

        # Has to be called from within __init__ itself.
        self.save_hyperparameters()

        # Data source and how to tokenize it
        self.dataset_filepath = dataset_filepath
        self.tokenizer = tokenizer

        # Splitting
        self.train_val_test_split = train_val_test_split
        self.random_state = random_state
        self.num_partitions = num_partitions

        # Dataloader settings
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.pin_memory = pin_memory

    def prepare_data(self) -> None:
        # TODO: Download the tokenized ChEMBL file here
        # saves us the params if we init the vocab internally as well.
        return None

    def setup(self, stage: Optional[str] = None) -> None:
        """Load the dataset and split it reproducibly into train/val/test."""
        # TODO: Make ChEMBL v30 a downloadable dataset like MNIST from torch and simplify
        full_dataset = SmilesDataset(self.dataset_filepath, self.tokenizer)
        seeded_generator = torch.Generator().manual_seed(self.random_state)
        splits = random_split(
            dataset=full_dataset,
            lengths=self.train_val_test_split,
            generator=seeded_generator,
        )
        self.train_data, self.val_data, self.test_data = splits

    def _loader(self, split, shuffle: bool) -> DataLoader:
        """Build a dataloader over ``split`` with the module-wide settings."""
        return DataLoader(
            dataset=split,
            batch_size=self.batch_size,
            pin_memory=self.pin_memory,
            num_workers=self.num_workers,
            collate_fn=SmilesDataset.collate_fn,
            shuffle=shuffle,
        )

    def train_dataloader(self) -> TRAIN_DATALOADERS:
        """Shuffled dataloader over the training split."""
        return self._loader(self.train_data, shuffle=True)

    def val_dataloader(self) -> EVAL_DATALOADERS:
        """Unshuffled dataloader over the validation split."""
        return self._loader(self.val_data, shuffle=False)

    def test_dataloader(self) -> EVAL_DATALOADERS:
        """Unshuffled dataloader over the test split."""
        return self._loader(self.test_data, shuffle=False)
84 | 


--------------------------------------------------------------------------------
/rxitect/models/gflownet.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | from typing import TYPE_CHECKING
  4 | 
  5 | import torch
  6 | from torch import nn
  7 | from torch_geometric.data import Batch
  8 | 
  9 | from rxitect.envs.contexts import ActionCategorical
 10 | from rxitect.models.transformers import GraphTransformer, create_mlp
 11 | 
 12 | if TYPE_CHECKING:
 13 |     from rxitect.envs import FragmentEnvContext
 14 | 
 15 | 
class FragmentBasedGFN(nn.Module):
    """GraphTransformer class for a GFlowNet which outputs a GraphActionCategorical. Meant for
    fragment-wise generation.
    Outputs logits for the following actions
    - STOP
    - ADD_NODE
    - SET_EDGE_ATTR
    """

    def __init__(
        self,
        ctx: FragmentEnvContext,
        num_emb: int = 64,
        num_layers: int = 3,
        num_heads: int = 2,
    ):
        """
        Parameters
        ----------
        ctx: FragmentEnvContext
            Environment context. Supplies the input dimensions
            (`num_node_dim`, `num_edge_dim`, `num_cond_dim`), the output sizes
            (`num_new_node_values`, `num_edge_attr_logits`) and
            `action_type_order`.
        num_emb: int
            Passed to `GraphTransformer` as `num_emb`; every head's input
            width is a multiple of it.
        num_layers: int
            Forwarded to `GraphTransformer`.
        num_heads: int
            Forwarded to `GraphTransformer`.
        """
        super().__init__()
        self.transformer = GraphTransformer(
            x_dim=ctx.num_node_dim,
            e_dim=ctx.num_edge_dim,
            g_dim=ctx.num_cond_dim,
            num_emb=num_emb,
            num_layers=num_layers,
            num_heads=num_heads,
        )
        # NOTE(review): the head input widths below assume node embeddings are
        # 2 * num_emb wide and graph embeddings 3 * num_emb wide -- confirm
        # against GraphTransformer.
        num_final = num_emb * 2
        num_mlp_layers = 0
        self.emb2add_node = create_mlp(
            num_final, num_emb, ctx.num_new_node_values, num_mlp_layers
        )
        # Edge attr logits are "sided", so we will compute both sides independently
        self.emb2set_edge_attr = create_mlp(
            num_emb + num_final, num_emb, ctx.num_edge_attr_logits // 2, num_mlp_layers
        )
        self.emb2stop = create_mlp(num_emb * 3, num_emb, 1, num_mlp_layers)
        self.emb2reward = create_mlp(num_emb * 3, num_emb, 1, num_mlp_layers)
        self.edge2emb = create_mlp(num_final, num_emb, num_emb, num_mlp_layers)
        # Maps conditional information to a single value; not used by forward().
        self.log_z = create_mlp(ctx.num_cond_dim, num_emb * 2, 1, 2)
        self.action_type_order = ctx.action_type_order

    def forward(self, g: Batch, cond: torch.Tensor):
        """See `GraphTransformer` for argument values

        `g` must additionally expose `edge_index`, `add_node_mask` and
        `set_edge_attr_mask`.

        Returns
        -------
        cat: ActionCategorical
            Built from the stop logits, the masked add-node logits and the
            masked (source side, destination side) edge-attribute logits.
        reward_pred: Tensor
            Output of the `emb2reward` head applied to the graph embeddings.
        """
        node_embeddings, graph_embeddings = self.transformer(g, cond)
        # On `::2`, edges are duplicated to make graphs undirected, only take the even ones
        e_row, e_col = g.edge_index[:, ::2]
        # An edge is embedded from the sum of its two endpoint embeddings.
        edge_emb = self.edge2emb(node_embeddings[e_row] + node_embeddings[e_col])
        # The same head scores both sides; only the endpoint appended to the
        # edge embedding differs.
        src_anchor_logits = self.emb2set_edge_attr(
            torch.cat([edge_emb, node_embeddings[e_row]], 1)
        )
        dst_anchor_logits = self.emb2set_edge_attr(
            torch.cat([edge_emb, node_embeddings[e_col]], 1)
        )

        def _mask(x, m):
            # mask logit vector x with binary mask m, -1000 is a tiny log-value
            return x * m + -1000 * (1 - m)

        cat = ActionCategorical(
            g,
            logits=[
                self.emb2stop(graph_embeddings),
                _mask(self.emb2add_node(node_embeddings), g.add_node_mask),
                _mask(
                    torch.cat([src_anchor_logits, dst_anchor_logits], 1),
                    g.set_edge_attr_mask,
                ),
            ],
            keys=[None, "x", "edge_index"],
            types=self.action_type_order,
        )
        return cat, self.emb2reward(graph_embeddings)
 99 | 
100 | 
if __name__ == "__main__":
    # FragmentEnvContext is imported at module level only under TYPE_CHECKING,
    # so it is not defined at runtime; import it for real before instantiating.
    from rxitect.envs import FragmentEnvContext

    env = FragmentEnvContext()
    gfn = FragmentBasedGFN(ctx=env)
105 | 


--------------------------------------------------------------------------------
/rxitect/data/datasets.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | 
  3 | from rxitect.tokenizers import SelfiesTokenizer, SmilesTokenizer
  4 | 
  5 | import tarfile
  6 | 
  7 | import numpy as np
  8 | import pandas as pd
  9 | import rdkit.Chem as Chem
 10 | from torch.utils.data import Dataset
 11 | 
 12 | 
 13 | class SmilesDataset(Dataset):
 14 |     def __init__(self, dataset_filepath: str, tokenizer: SmilesTokenizer) -> None:
 15 |         self.tokenizer = tokenizer
 16 |         self.padding_value = tokenizer.tk2ix_[tokenizer.pad_token]
 17 |         with open(dataset_filepath, "r") as f:
 18 |             self.smiles = [line.split()[0] for line in f]
 19 | 
 20 |     def __getitem__(self, index: int) -> torch.Tensor:
 21 |         smiles = self.smiles[index]
 22 |         return self.tokenizer.encode(smiles)
 23 | 
 24 |     def __len__(self):
 25 |         return len(self.smiles)
 26 | 
 27 |     def __str__(self) -> str:
 28 |         return f"SMILES Dataset containing {len(self)} structures"
 29 | 
 30 |     @classmethod
 31 |     def collate_fn(cls, arr: torch.Tensor) -> torch.Tensor:
 32 |         """Function to take a list of encoded sequences and turn them into a batch"""
 33 |         max_len = max([seq.size(0) for seq in arr])
 34 |         collated_arr = torch.zeros(len(arr), max_len, dtype=torch.long)
 35 |         for i, seq in enumerate(arr):
 36 |             collated_arr[i, : seq.size(0)] = seq
 37 |         return collated_arr
 38 | 
 39 | 
 40 | class SelfiesDataset(Dataset):
 41 |     def __init__(self, dataset_filepath: str, tokenizer: SelfiesTokenizer) -> None:
 42 |         self.tokenizer = tokenizer
 43 |         with open(dataset_filepath, "r") as f:
 44 |             self.selfies = [line.split()[0] for line in f]
 45 | 
 46 |     def __getitem__(self, index: int) -> torch.Tensor:
 47 |         selfies = self.selfies[index]
 48 |         return self.tokenizer.encode(selfies)
 49 | 
 50 |     def __len__(self):
 51 |         return len(self.selfies)
 52 | 
 53 |     def __str__(self) -> str:
 54 |         return f"SELFIES Dataset containing {len(self)} structures"
 55 | 
 56 |     @classmethod
 57 |     def collate_fn(cls, arr: torch.Tensor) -> torch.Tensor:
 58 |         """Function to take a list of encoded sequences and turn them into a batch"""
 59 |         max_len = max([seq.size(0) for seq in arr])
 60 |         collated_arr = torch.zeros(len(arr), max_len)
 61 |         for i, seq in enumerate(arr):
 62 |             collated_arr[i, : seq.size(0)] = seq
 63 |         return collated_arr
 64 | 
 65 | 
 66 | class QM9Dataset(Dataset):
 67 |     def __init__(self, h5_file=None, xyz_file=None, train=True, target='gap', split_seed=142857, ratio=0.9):
 68 |         if h5_file is not None:
 69 |             self.df = pd.HDFStore(h5_file, 'r')['df']
 70 |         elif xyz_file is not None:
 71 |             self.load_tar()
 72 |         rng = np.random.default_rng(split_seed)
 73 |         idcs = np.arange(len(self.df))  # TODO: error if there is no h5_file provided. Should h5 be required
 74 |         rng.shuffle(idcs)
 75 |         self.target = target
 76 |         if train:
 77 |             self.idcs = idcs[:int(np.floor(ratio * len(self.df)))]
 78 |         else:
 79 |             self.idcs = idcs[int(np.floor(ratio * len(self.df))):]
 80 | 
 81 |     def get_stats(self, percentile=0.95):
 82 |         y = self.df[self.target]
 83 |         return y.min(), y.max(), np.sort(y)[int(y.shape[0] * percentile)]
 84 | 
 85 |     def load_tar(self, xyz_file):
 86 |         f = tarfile.TarFile(xyz_file, 'r')
 87 |         labels = ['rA', 'rB', 'rC', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']
 88 |         all_mols = []
 89 |         for pt in f:
 90 |             pt = f.extractfile(pt)
 91 |             data = pt.read().decode().splitlines()
 92 |             all_mols.append(data[-2].split()[:1] + list(map(float, data[1].split()[2:])))
 93 |         self.df = pd.DataFrame(all_mols, columns=['SMILES'] + labels)
 94 | 
 95 |     def __len__(self):
 96 |         return len(self.idcs)
 97 | 
 98 |     def __getitem__(self, idx):
 99 |         return Chem.MolFromSmiles(self.df['SMILES'][self.idcs[idx]]), self.df[self.target][self.idcs[idx]]
100 | 


--------------------------------------------------------------------------------
/rxitect/algorithms/gfn_algorithm.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | from abc import ABC, abstractmethod
  4 | from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
  5 | 
  6 | import torch
  7 | from torch import nn
  8 | from torch_geometric.data import Batch
  9 | 
 10 | from rxitect.envs.contexts.graph_env_context import (
 11 |     StateActionPair, generate_forward_trajectory)
 12 | 
 13 | if TYPE_CHECKING:
 14 |     from rxitect.envs.contexts import (Action, ActionCategorical, ActionIndex,
 15 |                                        Graph)
 16 | 
 17 | 
 18 | class SamplingModel(nn.Module):
 19 |     def forward(self, batch: Batch) -> Tuple[ActionCategorical, torch.Tensor]:
 20 |         raise NotImplementedError()
 21 | 
 22 |     def log_z(self, cond_info: torch.Tensor) -> torch.Tensor:
 23 |         raise NotImplementedError()
 24 | 
 25 | 
 26 | class GFNAlgorithm(ABC):
 27 |     @abstractmethod
 28 |     def create_training_data_from_own_samples(
 29 |         self, model: SamplingModel, n: int, cond_info: torch.Tensor
 30 |     ) -> List[Dict]:
 31 |         """Generate trajectories by sampling a model
 32 |         Parameters
 33 |         ----------
 34 |         model: SamplingModel
 35 |            The model being sampled
 36 |         n: int
 37 |             Number of samples
 38 |         cond_info: torch.Tensor
 39 |             Conditional information, shape (N, n_info)
 40 |         Returns
 41 |         -------
 42 |         data: List[Dict]
 43 |            A list of trajectories. Each trajectory is a dict with keys
 44 |            - trajs: List[Tuple[Graph, GraphAction]]
 45 |            - reward_pred: float, -100 if an illegal action is taken, predicted R(x) if bootstrapping, None otherwise
 46 |            - fwd_logprob: log Z + sum logprobs P_F
 47 |            - bck_logprob: sum logprobs P_B
 48 |            - logZ: predicted log Z
 49 |            - loss: predicted loss (if bootstrapping)
 50 |            - is_valid: is the generated graph valid according to the env & ctx
 51 |         """
 52 |         pass
 53 | 
 54 |     @staticmethod
 55 |     def create_training_data_from_graphs(graphs: List[Graph]) -> List[Trajectory]:
 56 |         """Generate trajectories from known endpoints
 57 |         Parameters
 58 |         ----------
 59 |         graphs: List[Graph]
 60 |             List of Graph endpoints
 61 |         Returns
 62 |         -------
 63 |         trajs: List[Dict{'traj': List[tuple[Graph, GraphAction]]}]
 64 |            A list of trajectories.
 65 |         """
 66 |         return [{"traj": generate_forward_trajectory(i)} for i in graphs]
 67 | 
 68 |     def construct_batch(self, trajs, cond_info, rewards) -> Batch:
 69 |         """Construct a batch from a list of trajectories and their information
 70 |         Parameters
 71 |         ----------
 72 |         trajs: List[List[tuple[Graph, GraphAction]]]
 73 |             A list of N trajectories.
 74 |         cond_info: Tensor
 75 |             The conditional info that is considered for each trajectory. Shape (N, n_info)
 76 |         rewards: Tensor
 77 |             The transformed reward (e.g. R(x) ** beta) for each trajectory. Shape (N,)
 78 |         Returns
 79 |         -------
 80 |         batch: Batch
 81 |              A (CPU) Batch object with relevant attributes added
 82 |         """
 83 |         pass
 84 | 
 85 |     @abstractmethod
 86 |     def compute_batch_losses(
 87 |         self, model: nn.Module, batch: Batch, num_bootstrap: Optional[int] = 0
 88 |     ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
 89 |         """Computes the loss for a batch of data, and proves logging information
 90 |         Parameters
 91 |         ----------
 92 |         model: nn.Module
 93 |             The model being trained or evaluated
 94 |         batch: gd.Batch
 95 |             A batch of graphs
 96 |         num_bootstrap: Optional[int]
 97 |             The number of trajectories with reward targets in the batch (if applicable).
 98 |         Returns
 99 |         -------
100 |         loss: Tensor
101 |             The loss for that batch
102 |         info: Dict[str, Tensor]
103 |             Logged information about model predictions.
104 |         """
105 |         pass
106 | 
107 | 
# A trajectory record: maps a string key (e.g. "traj") to a list of state/action pairs.
# It is defined after GFNAlgorithm, which names it in an annotation; that works
# because `from __future__ import annotations` keeps annotations unevaluated.
Trajectory = Dict[str, List[StateActionPair]]
109 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # Visual Studio Code
156 | .vscode/
157 | 
158 | # PyCharm
159 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
160 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
161 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
162 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
163 | .idea/
164 | 
165 | # Project specific directories
166 | aizynthfinder/*
167 | !aizynthfinder/.gitkeep
168 | 
169 | # Data files (delete after DVC integration)
170 | data/*
171 | !data/raw
172 | !data/interim
173 | !data/processed
174 | !data/external
175 | data/raw/*
176 | data/interim/*
177 | data/processed/*
178 | data/external/*
179 | !data/raw/.gitkeep
180 | !data/interim/.gitkeep
181 | !data/processed/.gitkeep
182 | !data/external/.gitkeep
183 | 
184 | # Model files
185 | models/*
186 | !models/.gitkeep
187 | 
188 | # Scratchpad notebooks
189 | notebooks/Untitled*.ipynb
190 | 
191 | # Logs
192 | logs/*
193 | !logs/.gitkeep
194 | lightning_logs/
195 | 
196 | # Active Learning Dump
197 | *_iter_*


--------------------------------------------------------------------------------
/rxitect/utils/multiprocessing_proxy.py:
--------------------------------------------------------------------------------
  1 | import queue
  2 | import threading
  3 | from typing import Tuple
  4 | 
  5 | import torch
  6 | import torch.multiprocessing as mp
  7 | from torch import nn
  8 | from torch.utils.data import get_worker_info
  9 | 
 10 | 
 11 | class MPModelPlaceholder:
 12 |     """This class can be used as a Model in a worker process, and
 13 |     translates calls to queries to the main process"""
 14 | 
 15 |     def __init__(self, in_queues, out_queues):
 16 |         self.qs = in_queues, out_queues
 17 |         self.device = torch.device("cpu")
 18 |         self._is_init = False
 19 | 
 20 |     def _check_init(self):
 21 |         if self._is_init:
 22 |             return
 23 |         info = get_worker_info()
 24 |         self.in_queue = self.qs[0][info.id]
 25 |         self.out_queue = self.qs[1][info.id]
 26 |         self._is_init = True
 27 | 
 28 |     # TODO: make a generic method for this based on __getattr__
 29 |     def log_z(self, *a):
 30 |         self._check_init()
 31 |         self.in_queue.put(("log_z", *a))
 32 |         return self.out_queue.get()
 33 | 
 34 |     def __call__(self, *a):
 35 |         self._check_init()
 36 |         self.in_queue.put(("__call__", *a))
 37 |         return self.out_queue.get()
 38 | 
 39 | 
 40 | class MPModelProxy:
 41 |     """This class maintains a reference to an in-cuda-memory model, and
 42 |     creates a `placeholder` attribute which can be safely passed to
 43 |     multiprocessing DataLoader workers.
 44 |     This placeholder model sends messages across multiprocessing
 45 |     queues, which are received by this proxy instance, which calls the
 46 |     model and sends the return value back to the worker.
 47 |     Starts its own (daemon) thread. Always passes CPU tensors between
 48 |     processes.
 49 |     """
 50 | 
 51 |     def __init__(self, model: nn.Module, num_workers: int, cast_types: Tuple):
 52 |         """Construct a multiprocessing model proxy for torch DataLoaders.
 53 |         Parameters
 54 |         ----------
 55 |         model: torch.nn.Module
 56 |             A torch model which lives in the main process to which method calls are passed
 57 |         num_workers: int
 58 |             Number of DataLoader workers
 59 |         cast_types: tuple
 60 |             Types that will be cast to cuda when received as arguments of method calls.
 61 |             torch.Tensor is cast by default.
 62 |         """
 63 |         self.in_queues = [mp.Queue() for _ in range(num_workers)]
 64 |         self.out_queues = [mp.Queue() for _ in range(num_workers)]
 65 |         self.placeholder = MPModelPlaceholder(self.in_queues, self.out_queues)
 66 |         self.model = model
 67 |         self.device = next(model.parameters()).device
 68 |         self.cuda_types = (torch.Tensor,) + cast_types
 69 |         self.stop = threading.Event()
 70 |         self.thread = threading.Thread(target=self.run, daemon=True)
 71 |         self.thread.start()
 72 | 
 73 |     def __del__(self):
 74 |         self.stop.set()
 75 | 
 76 |     def run(self) -> None:
 77 |         while not self.stop.is_set():
 78 |             for qi, q in enumerate(self.in_queues):
 79 |                 try:
 80 |                     r = q.get(True, 1e-5)
 81 |                 except queue.Empty:
 82 |                     continue
 83 |                 except ConnectionError:
 84 |                     break
 85 |                 attr, *args = r
 86 |                 f = getattr(self.model, attr)
 87 |                 args = [
 88 |                     i.to(self.device) if isinstance(i, self.cuda_types) else i
 89 |                     for i in args
 90 |                 ]
 91 |                 result = f(*args)
 92 |                 if isinstance(result, (list, tuple)):
 93 |                     msg = [
 94 |                         i.detach().to(torch.device("cpu"))
 95 |                         if isinstance(i, self.cuda_types)
 96 |                         else i
 97 |                         for i in result
 98 |                     ]
 99 |                     self.out_queues[qi].put(msg)
100 |                 else:
101 |                     msg = (
102 |                         result.detach().to(torch.device("cpu"))
103 |                         if isinstance(result, self.cuda_types)
104 |                         else result
105 |                     )
106 |                     self.out_queues[qi].put(msg)
107 | 
108 | 
def wrap_model_mp(model: nn.Module, num_workers: int, cast_types: Tuple) -> MPModelPlaceholder:
    """Wrap `model` in an `MPModelProxy` and return the proxy's placeholder.

    The intent is that only one process ends up making cuda calls and holding
    cuda tensors in memory, while DataLoader workers talk to the placeholder.

    Parameters
    ----------
    model: nn.Module
        A torch model which lives in the main process to which method calls are passed
    num_workers: int
        Number of DataLoader workers
    cast_types: tuple
        Types that are moved to the model's device when received as arguments
        of method calls. torch.Tensor is always included.

    Returns
    -------
    placeholder: MPModelPlaceholder
        A placeholder model whose method calls route arguments to the main process
    """
    # The proxy itself is not returned; only its placeholder is handed out.
    proxy = MPModelProxy(model, num_workers, cast_types)
    return proxy.placeholder
128 | 


--------------------------------------------------------------------------------
/rxitect/utils.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | from typing import TYPE_CHECKING, List, Optional, Tuple
  4 | 
  5 | import numpy as np
  6 | import torch
  7 | from numpy.typing import ArrayLike
  8 | from rdkit import Chem
  9 | from rdkit.Chem import AllChem
 10 | from torch_geometric.data import Batch, Data
 11 | 
 12 | if TYPE_CHECKING:
 13 |     from rxitect.data.composable_molecule import ComposableMolecule
 14 |     from rxitect.mdp import MarkovDecisionProcess
 15 | 
 16 | 
 17 | def is_valid_smiles(smiles: str) -> bool:
 18 |     return Chem.MolFromSmiles(smiles) is not None
 19 | 
 20 | 
 21 | def filter_duplicate_tensors(x: torch.Tensor) -> torch.Tensor:
 22 |     return x.unique_consecutive(dim=0)
 23 | 
 24 | 
 25 | def mol_from_fragments(
 26 |     jbonds: ArrayLike,
 27 |     frags: Optional[List[Chem.rdchem.Mol]] = None,
 28 |     frag_smiles: Optional[List[str]] = None,
 29 |     optimize: bool = False,
 30 | ) -> Tuple[Chem.rdchem.Mol, List[int]]:
 31 |     """Joins 2 or more fragments into a single molecule
 32 | 
 33 |     Args:
 34 |         jbonds (ArrayLike): An array-like (e.g., a list) object containing junction bonds
 35 |         frags (Optional[List[Mol]]): A list of RDKit Mol objects to be combined. Should be given if frag_smiles is not present
 36 |         frag_smiles (Optional[List[Mol]]): A list of SMILES strings to be made into RDKit Mol objects and combined. Must be present if frags is not
 37 |         optimize (bool): If the molecule's 3D structure should be optimized. Defaults to False
 38 | 
 39 |     Returns:
 40 |         Tuple[Mol, List[int]]: A tuple containing the combined molecule as an RDKit Mol object, and a list containing the bonds
 41 |     """
 42 |     jbonds = np.asarray(jbonds)
 43 | 
 44 |     if frags is not None:
 45 |         pass
 46 |     elif frags is None and frag_smiles is not None:
 47 |         frags = [Chem.MolFromSmiles(smi) for smi in frag_smiles]
 48 |     else:
 49 |         raise ValueError("At least one of `frags` or `frag_smiles` should be given.")
 50 | 
 51 |     if len(frags) == 0:
 52 |         return None, None
 53 | 
 54 |     num_frags = len(frags)
 55 |     # combine fragments into a single molecule
 56 |     mol = frags[0]
 57 |     for i in np.arange(start=1, stop=num_frags):
 58 |         mol = Chem.CombineMols(mol, frags[i])
 59 |     # add junction bonds between fragments
 60 |     frag_start_idx = np.concatenate(
 61 |         [[0], np.cumsum([frag.GetNumAtoms() for frag in frags])], 0
 62 |     )[:-1]
 63 | 
 64 |     if jbonds.size == 0:
 65 |         mol_bonds = []
 66 |     else:
 67 |         mol_bonds = frag_start_idx[jbonds[:, 0:2]] + jbonds[:, 2:4]
 68 | 
 69 |     rw_mol = Chem.EditableMol(mol)
 70 | 
 71 |     [
 72 |         rw_mol.AddBond(int(bond[0]), int(bond[1]), Chem.BondType.SINGLE)
 73 |         for bond in mol_bonds
 74 |     ]
 75 |     mol = rw_mol.GetMol()
 76 |     atoms = list(mol.GetAtoms())
 77 | 
 78 |     def _pop_H(atom):
 79 |         num_h = atom.GetNumExplicitHs()
 80 |         if num_h > 0:
 81 |             atom.SetNumExplicitHs(num_h - 1)
 82 | 
 83 |     [(_pop_H(atoms[bond[0]]), _pop_H(atoms[bond[1]])) for bond in mol_bonds]
 84 |     Chem.SanitizeMol(mol)
 85 | 
 86 |     # create and optimize 3D structure
 87 |     if optimize:
 88 |         assert not "h" in set(
 89 |             [atom.GetSymbol().lower() for atom in mol.GetAtoms()]
 90 |         ), "can't optimize molecule with h"
 91 |         Chem.AddHs(mol)
 92 |         AllChem.EmbedMolecule(mol)
 93 |         AllChem.MMFFOptimizeMolecule(mol)
 94 |         Chem.RemoveHs(mol)
 95 |     return mol, mol_bonds
 96 | 
 97 | 
 98 | def mol2graph(cmol: ComposableMolecule, mdp: MarkovDecisionProcess) -> Data:
 99 |     """
100 |     TODO
101 |     """
102 |     long_tensor = lambda x: torch.tensor(x, dtype=torch.long, device=mdp.device)
103 |     if len(cmol.block_idxs) == 0:
104 |         data = Data(  # There's an extra block embedding for the empty molecule
105 |             x=long_tensor([mdp.num_true_blocks]),
106 |             edge_index=long_tensor([[], []]),
107 |             edge_attr=long_tensor([]).reshape((0, 2)),
108 |             stems=long_tensor([(0, 0)]),
109 |             stem_types=long_tensor([mdp.num_stem_types]),
110 |         )  # also extra stem type embedding
111 |         return data
112 |     edges = [(i[0], i[1]) for i in cmol.jbonds]
113 |     # edge_attrs = [mdp.bond_type_offset[i[2]] +  i[3] for i in mol.jbonds]
114 |     t = mdp.true_block_idx
115 |     if 0:
116 |         edge_attrs = [
117 |             (
118 |                 (mdp.stem_type_offset[t[cmol.block_idxs[i[0]]]] + i[2])
119 |                 * mdp.num_stem_types
120 |                 + (mdp.stem_type_offset[t[cmol.block_idxs[i[1]]]] + i[3])
121 |             )
122 |             for i in cmol.jbonds
123 |         ]
124 |     else:
125 |         edge_attrs = [
126 |             (
127 |                 mdp.stem_type_offset[t[cmol.block_idxs[i[0]]]] + i[2],
128 |                 mdp.stem_type_offset[t[cmol.block_idxs[i[1]]]] + i[3],
129 |             )
130 |             for i in cmol.jbonds
131 |         ]
132 |     """
133 |     Here stem_type_offset is a list of offsets to know which
134 |     embedding to use for a particular stem. Each (blockidx, atom)
135 |     pair has its own embedding.
136 |     """
137 |     stem_types = [
138 |         mdp.stem_type_offset[t[cmol.block_idxs[i[0]]]] + i[1] for i in cmol.stems
139 |     ]
140 | 
141 |     data = Data(
142 |         x=long_tensor([t[i] for i in cmol.block_idxs]),
143 |         edge_index=long_tensor(edges).T if len(edges) else long_tensor([[], []]),
144 |         edge_attr=long_tensor(edge_attrs)
145 |         if len(edges)
146 |         else long_tensor([]).reshape((0, 2)),
147 |         stems=long_tensor(cmol.stems) if len(cmol.stems) else long_tensor([(0, 0)]),
148 |         stem_types=long_tensor(stem_types)
149 |         if len(cmol.stems)
150 |         else long_tensor([mdp.num_stem_types]),
151 |     )
152 |     data.to(mdp.device)
153 |     return data
154 | 
155 | 
def mols2batch(mols: List[Data], mdp: MarkovDecisionProcess) -> Batch:
    """Collate per-molecule graphs into a single ``Batch`` on ``mdp.device``.

    Args:
        mols (List[Data]): The graphs to collate.
        mdp (MarkovDecisionProcess): Only its ``device`` attribute is read.

    Returns:
        Batch: The collated batch. ``"stems"`` is passed as ``follow_batch``
        so that torch_geometric also tracks which graph each stem belongs to.
    """
    collated = Batch.from_data_list(mols, follow_batch=["stems"])
    collated.to(mdp.device)
    return collated
163 | 


--------------------------------------------------------------------------------
/rxitect/envs/fragment_env.py:
--------------------------------------------------------------------------------
  1 | from typing import List
  2 | 
  3 | import networkx as nx
  4 | 
  5 | from rxitect.envs.contexts import Action, ActionType, Graph, StateActionPair
  6 | 
  7 | 
  8 | class FragmentEnv:
  9 |     """
 10 |     A Graph building environment which induces a DAG state space, compatible with GFlowNet.
 11 |     Supports forward and backward actions, with a `parents` function that list parents of
 12 |     forward actions.
 13 |     Edges and nodes can have attributes added to them in a key:value style.
 14 |     Edges and nodes are created with _implicit_ default attribute
 15 |     values (e.g. chirality, single/double bondness) so that:
 16 |         - an agent gets to do an extra action to set that attribute, but only
 17 |           if it is still default-valued (DAG property preserved)
 18 |         - we can generate a legal action for any attribute that isn't a default one.
 19 |     """
 20 | 
 21 |     def __init__(
 22 |         self,
 23 |         allow_add_edge: bool = True,
 24 |         allow_node_attr: bool = True,
 25 |         allow_edge_attr: bool = True,
 26 |     ):
 27 |         """A graph building environment instance
 28 |         Parameters
 29 |         ----------
 30 |         allow_add_edge: bool
 31 |             if True, allows this action and computes AddEdge parents (i.e. if False, this
 32 |             env only allows for tree generation)
 33 |         allow_node_attr: bool
 34 |             if True, allows this action and computes SetNodeAttr parents
 35 |         allow_edge_attr: bool
 36 |             if True, allows this action and computes SetEdgeAttr parents
 37 |         """
 38 |         self.allow_add_edge = allow_add_edge
 39 |         self.allow_node_attr = allow_node_attr
 40 |         self.allow_edge_attr = allow_edge_attr
 41 | 
 42 |     @staticmethod
 43 |     def new():
 44 |         return Graph()
 45 | 
 46 |     def step(self, g: Graph, action: Action) -> Graph:
 47 |         """Step forward the given graph state with an action
 48 |         Parameters
 49 |         ----------
 50 |         g: Graph
 51 |             the graph to be modified
 52 |         action: GraphAction
 53 |             the action taken on the graph, indices must match
 54 |         Returns
 55 |         -------
 56 |         gp: Graph
 57 |             the new graph
 58 |         """
 59 |         gp = g.copy()
 60 |         if action.act_type is ActionType.ADD_EDGE:
 61 |             a, b = action.source, action.target
 62 |             assert self.allow_add_edge
 63 |             assert a in g and b in g
 64 |             if a > b:
 65 |                 a, b = b, a
 66 |             assert a != b
 67 |             assert not g.has_edge(a, b)
 68 |             # Ideally the FA underlying this must only be able to send
 69 |             # create_edge actions which respect this a<b property (or
 70 |             # its inverse!) , otherwise symmetry will be broken
 71 |             # because of the way the parents method is written
 72 |             gp.add_edge(a, b)
 73 | 
 74 |         elif action.act_type is ActionType.ADD_NODE:
 75 |             if len(g) == 0:
 76 |                 assert action.source == 0  # TODO: this may not be useful
 77 |                 gp.add_node(0, v=action.value)
 78 |             else:
 79 |                 assert action.source in g.nodes
 80 |                 e = [action.source, max(g.nodes) + 1]
 81 |                 assert not g.has_edge(*e)
 82 |                 gp.add_node(e[1], v=action.value)
 83 |                 gp.add_edge(*e)
 84 | 
 85 |         elif action.act_type is ActionType.SET_NODE_ATTR:
 86 |             assert self.allow_node_attr
 87 |             assert action.source in gp.nodes
 88 |             assert action.attr not in gp.nodes[action.source]
 89 |             gp.nodes[action.source][action.attr] = action.value
 90 | 
 91 |         elif action.act_type is ActionType.SET_EDGE_ATTR:
 92 |             assert self.allow_edge_attr
 93 |             assert g.has_edge(action.source, action.target)
 94 |             assert action.attr not in gp.edges[(action.source, action.target)]
 95 |             gp.edges[(action.source, action.target)][action.attr] = action.value
 96 |         else:
 97 |             # TODO: backward actions if we want to support MCMC-GFN style algorithms
 98 |             raise ValueError(f"Unknown action type {action.act_type}", action.act_type)
 99 | 
100 |         return gp
101 | 
102 |     def parents(self, g: Graph) -> List[StateActionPair]:
103 |         """List possible parents of graph `g`
104 |         Parameters
105 |         ----------
106 |         g: Graph
107 |             graph
108 |         Returns
109 |         -------
110 |         parents: List[Pair(GraphAction, Graph)]
111 |             The list of parent-action pairs that lead to `g`.
112 |         """
113 |         raise NotImplementedError()
114 | 
115 |     @staticmethod
116 |     def count_backward_transitions(g: Graph):
117 |         """Counts the number of parents of g without checking for isomorphisms"""
118 |         c = 0
119 |         deg = [g.degree[i] for i in range(len(g.nodes))]
120 |         for a, b in g.edges:
121 |             if deg[a] > 1 and deg[b] > 1 and len(g.edges[(a, b)]) == 0:
122 |                 # Can only remove edges connected to non-leaves and without
123 |                 # attributes (the agent has to remove the attrs, then remove
124 |                 # the edge). Removal cannot disconnect the graph.
125 |                 new_g = graph_without_edge(g, (a, b))
126 |                 if nx.algorithms.is_connected(new_g):
127 |                     c += 1
128 |             c += len(g.edges[(a, b)])  # One action per edge attr
129 |         for i in g.nodes:
130 |             if (
131 |                 deg[i] == 1
132 |                 and len(g.nodes[i]) == 1
133 |                 and len(g.edges[list(g.edges(i))[0]]) == 0
134 |             ):
135 |                 c += 1
136 |             c += len(g.nodes[i]) - 1  # One action per node attr, except 'v'
137 |             if len(g.nodes) == 1 and len(g.nodes[i]) == 1:
138 |                 # special case if last node in graph
139 |                 c += 1
140 |         return c
141 | 
142 | 
143 | # TODO: Move these to a graph utils file
def graph_without_edge(g, e):
    """Return a copy of `g` with edge `e` removed; `g` itself is left untouched."""
    pruned = g.copy()
    pruned.remove_edge(*e)
    return pruned
148 | 


--------------------------------------------------------------------------------
/rxitect/models/transformers.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | from typing import TYPE_CHECKING, Callable
  4 | 
  5 | import torch
  6 | import torch_geometric.nn as gnn
  7 | from torch import nn
  8 | from torch_geometric.data import Batch
  9 | from torch_geometric.utils import add_self_loops
 10 | 
 11 | 
 12 | class GraphTransformer(nn.Module):
 13 |     """An agnostic GraphTransformer class, and the main model used by other model classes
 14 |     This graph model takes in node features, edge features, and graph features (referred to as
 15 |     conditional information, since they condition the output). The graph features are projected to
 16 |     virtual nodes (one per graph), which are fully connected.
 17 | 
 18 |     The per node outputs are the concatenation of the final (post graph-convolution) node embeddings
 19 |     and of the final virtual node embedding of the graph each node corresponds to.
 20 |     The per graph outputs are the concatenation of a global mean pooling operation, of the final
 21 |     virtual node embeddings, and of the conditional information embedding.
 22 |     """
 23 | 
 24 |     def __init__(
 25 |         self,
 26 |         x_dim: int,
 27 |         e_dim: int,
 28 |         g_dim: int,
 29 |         num_emb: int = 64,
 30 |         num_layers: int = 3,
 31 |         num_heads: int = 2,
 32 |     ):
 33 |         """
 34 |         Parameters
 35 |         ----------
 36 |         x_dim: int
 37 |             The number of node features
 38 |         e_dim: int
 39 |             The number of edge features
 40 |         g_dim: int
 41 |             The number of graph-level features
 42 |         num_emb: int
 43 |             The number of hidden dimensions, i.e. embedding size. Default 64.
 44 |         num_layers: int
 45 |             The number of Transformer layers.
 46 |         num_heads: int
 47 |             The number of Transformer heads per layer.
 48 |         """
 49 |         super().__init__()
 50 |         self.num_layers = num_layers
 51 | 
 52 |         self.x2h = create_mlp(x_dim, num_emb, num_emb, 2)
 53 |         self.e2h = create_mlp(e_dim, num_emb, num_emb, 2)
 54 |         self.c2h = create_mlp(g_dim, num_emb, num_emb, 2)
 55 |         self.graph2emb = nn.ModuleList(
 56 |             sum(
 57 |                 [
 58 |                     [
 59 |                         gnn.GENConv(
 60 |                             num_emb, num_emb, num_layers=1, aggr="add", norm=None
 61 |                         ),
 62 |                         gnn.TransformerConv(
 63 |                             num_emb * 2, num_emb, edge_dim=num_emb, heads=num_heads
 64 |                         ),
 65 |                         nn.Linear(num_heads * num_emb, num_emb),
 66 |                         gnn.LayerNorm(num_emb, affine=False),
 67 |                         create_mlp(num_emb, num_emb * 4, num_emb, 1),
 68 |                         gnn.LayerNorm(num_emb, affine=False),
 69 |                     ]
 70 |                     for i in range(self.num_layers)
 71 |                 ],
 72 |                 [],
 73 |             )
 74 |         )
 75 | 
 76 |     def forward(self, g: Batch, cond: torch.Tensor):
 77 |         """Forward pass
 78 |         Parameters
 79 |         ----------
 80 |         g: Batch
 81 |             A standard torch_geometric Batch object. Expects `edge_attr` to be set.
 82 |         cond: torch.Tensor
 83 |             The per-graph conditioning information. Shape: (g.num_graphs, self.g_dim).
 84 |         Returns
 85 |         -------
 86 |         node_embeddings: torch.Tensor
 87 |             Per node embeddings. Shape: (g.num_nodes, self.num_emb * 2).
 88 |         graph_embeddings: torch.Tensor
 89 |             Per graph embeddings. Shape: (g.num_graphs, self.num_emb * 3).
 90 |         """
 91 |         o = self.x2h(g.x)
 92 |         e = self.e2h(g.edge_attr)
 93 |         c = self.c2h(cond)
 94 |         num_total_nodes = g.x.shape[0]
 95 |         # Augment the edges with a new edge to the conditioning
 96 |         # information node. This new node is connected to every node
 97 |         # within its graph.
 98 |         u, v = torch.arange(num_total_nodes, device=o.device), g.batch + num_total_nodes
 99 |         aug_edge_index = torch.cat(
100 |             [g.edge_index, torch.stack([u, v]), torch.stack([v, u])], 1
101 |         )
102 |         e_p = torch.zeros((num_total_nodes * 2, e.shape[1]), device=g.x.device)
103 |         e_p[:, 0] = 1  # Manually create a bias term
104 |         aug_e = torch.cat([e, e_p], 0)
105 |         aug_edge_index, aug_e = add_self_loops(aug_edge_index, aug_e, "mean")
106 |         aug_batch = torch.cat([g.batch, torch.arange(c.shape[0], device=o.device)], 0)
107 | 
108 |         # Append the conditioning information node embedding to o
109 |         o = torch.cat([o, c], 0)
110 |         for i in range(self.num_layers):
111 |             # Run the graph transformer forward
112 |             gen, trans, linear, norm1, ff, norm2 = self.graph2emb[i * 6 : (i + 1) * 6]
113 |             agg = gen(o, aug_edge_index, aug_e)
114 |             o = norm1(
115 |                 o + linear(trans(torch.cat([o, agg], 1), aug_edge_index, aug_e)),
116 |                 aug_batch,
117 |             )
118 |             o = norm2(o + ff(o), aug_batch)
119 | 
120 |         glob = torch.cat(
121 |             [gnn.global_mean_pool(o[: -c.shape[0]], g.batch), o[-c.shape[0] :], c], 1
122 |         )
123 |         o_final = torch.cat([o[: -c.shape[0]], c[g.batch]], 1)
124 |         return o_final, glob
125 | 
126 | 
def create_mlp(
    n_in: int,
    n_hid: int,
    n_out: int,
    n_layer: int,
    activation_fn: Callable = nn.LeakyReLU,
):
    """Helper function that creates a fully-connected network with no activation after the last layer.
    If `n_layer` is 0 the result is an `nn.Sequential` holding a single `nn.Linear(n_in, n_out)`.

    Parameters
    ----------
    n_in: int
        Size of the input features
    n_hid: int
        Size of every hidden layer
    n_out: int
        Size of the output features
    n_layer: int
        Number of hidden layers; the network holds `n_layer + 1` linear layers
    activation_fn: Callable
        Zero-argument factory for the activation module placed between consecutive
        linear layers. Default `nn.LeakyReLU`.

    Returns
    -------
    nn.Sequential
        The linear layers interleaved with activations.
    """
    sizes = [n_in] + [n_hid] * n_layer + [n_out]
    layers = []
    for i in range(n_layer + 1):
        if i:
            # Activations go between linear layers only, never after the last.
            layers.append(activation_fn())
        layers.append(nn.Linear(sizes[i], sizes[i + 1]))
    return nn.Sequential(*layers)
157 | 


--------------------------------------------------------------------------------
/rxitect/models/lstm_generator.py:
--------------------------------------------------------------------------------
  1 | from typing import Tuple
  2 | 
  3 | import pytorch_lightning as pl
  4 | import torch
  5 | import torch.nn as nn
  6 | 
  7 | from rxitect import utils
  8 | from rxitect.tokenizers import get_tokenizer
  9 | 
 10 | 
 11 | class LSTMGenerator(pl.LightningModule):
 12 |     """
 13 |     A molecule generator that uses an LSTM to learn how to build valid molecular representations
 14 |     through BPTT.
 15 | 
 16 |     Attributes
 17 |     ----------
 18 |     tokenizer : Tokenizer
 19 |         A tokenizer to handle a given molecular representation (e.g., SMILES or SELFIES).
 20 |     embedding_size : int
 21 |         TODO
 22 |     hidden_size : int
 23 |         TODO
 24 |     embedding_layer : torch.nn.Embedding
 25 |         TODO
 26 |     lstm : torch.nn.LSTM
 27 |         TODO
 28 |     output_layer : torch.nn.Linear
 29 |         TODO
 30 |     """
 31 | 
 32 |     def __init__(
 33 |         self,
 34 |         vocabulary_filepath: str,
 35 |         molecule_repr: str = "smiles",
 36 |         embedding_size: int = 128,
 37 |         hidden_size: int = 512,
 38 |         num_layers: int = 3,
 39 |         lr: float = 1e-3,
 40 |         weight_decay: float = 0,
 41 |     ) -> None:
 42 |         """
 43 |         Parameters
 44 |         ----------
 45 |         vocabulary_filepath : str
 46 |             TODO
 47 |         molecule_repr : str, optional
 48 |             The type of molecular (string) representation to use (default is "smiles")
 49 |         embedding_size : int, optional
 50 |             The size of the embedding layer (default is 128)
 51 |         hidden_size : int, optional
 52 |             The size of the hidden layer (default is 512)
 53 |         num_layers : int
 54 |             TODO
 55 |         lr: float
 56 |             The learning rate for the LSTM generator (default is 1e-3)
 57 |         weight_decay: float
 58 |             TODO
 59 |         """
 60 |         super().__init__()
 61 |         self.save_hyperparameters()
 62 |         self.tokenizer = get_tokenizer(
 63 |             molecule_repr,
 64 |             vocabulary_filepath=vocabulary_filepath,
 65 |         )
 66 |         self.embedding_size = embedding_size
 67 |         self.hidden_size = hidden_size
 68 |         self.embedding_layer = nn.Embedding(
 69 |             num_embeddings=self.tokenizer.vocabulary_size_, embedding_dim=embedding_size
 70 |         )
 71 |         self.num_layers = num_layers
 72 |         self.lstm = nn.LSTM(
 73 |             embedding_size, hidden_size, num_layers=num_layers, batch_first=True
 74 |         )
 75 |         self.output_layer = nn.Linear(hidden_size, self.tokenizer.vocabulary_size_)
 76 |         self.lr = lr
 77 |         self.weight_decay = weight_decay
 78 | 
 79 |     def forward(
 80 |         self, x: torch.Tensor, h: torch.Tensor
 81 |     ) -> Tuple[torch.Tensor, torch.Tensor]:
 82 |         x = self.embedding_layer(x.unsqueeze(dim=-1))
 83 |         x, h_out = self.lstm(x, h)
 84 |         x = self.output_layer(x).squeeze(dim=1)
 85 |         return x, h_out
 86 | 
 87 |     def training_step(self, batch: torch.Tensor, batch_idx: int) -> torch.Tensor:
 88 |         loss = self.likelihood(batch)
 89 |         loss = -loss.mean()
 90 |         self.log(
 91 |             "train/loss", loss, on_step=False, on_epoch=True, prog_bar=True, logger=True
 92 |         )
 93 |         return loss
 94 | 
 95 |     def validation_step(
 96 |         self, batch: torch.Tensor, batch_idx: torch.Tensor
 97 |     ) -> torch.Tensor:
 98 |         loss = self.likelihood(batch)
 99 |         loss = -loss.mean()
100 |         self.log(
101 |             "val/loss", loss, on_step=False, on_epoch=True, prog_bar=True, logger=True
102 |         )
103 |         return loss
104 | 
105 |     def on_validation_epoch_end(self) -> None:
106 |         sequences = self.sample(1024)
107 |         sequences = utils.filter_duplicate_tensors(sequences)
108 |         valid_arr = [
109 |             utils.is_valid_smiles(smi) for smi in self.tokenizer.batch_decode(sequences)
110 |         ]
111 |         frac_valid = sum(valid_arr) / len(valid_arr)
112 |         frac_unique = sum(valid_arr) / 1024
113 |         self.log("frac_valid_smiles", frac_valid)
114 |         self.log("frac_unique_smiles", frac_unique)
115 | 
116 |     def configure_optimizers(self):
117 |         optimizer = torch.optim.Adam(
118 |             self.parameters(), lr=self.lr, weight_decay=self.weight_decay
119 |         )
120 |         return optimizer
121 | 
122 |     def init_hidden(self, batch_size: int) -> Tuple[torch.Tensor, torch.Tensor]:
123 |         h = torch.rand(
124 |             self.num_layers, batch_size, self.hidden_size, device=self.device
125 |         )
126 |         c = torch.rand(
127 |             self.num_layers, batch_size, self.hidden_size, device=self.device
128 |         )
129 |         return h, c
130 | 
131 |     def likelihood(self, target: torch.Tensor) -> torch.Tensor:
132 |         batch_size, seq_len = target.size()
133 |         x = torch.tensor(
134 |             [self.tokenizer.tk2ix_[self.tokenizer.start_token]] * batch_size,
135 |             device=self.device,
136 |             dtype=torch.long,
137 |         )
138 |         h = self.init_hidden(batch_size)
139 |         scores = torch.zeros(batch_size, seq_len, device=self.device)
140 |         for step in range(seq_len):
141 |             logits, h = self(x, h)
142 |             logits = logits.log_softmax(dim=-1)
143 |             score = logits.gather(1, target[:, step : step + 1]).squeeze()
144 |             scores[:, step] = score
145 |             x = target[:, step]
146 |         return scores
147 | 
148 |     def sample(self, batch_size: int, max_len: int = 140):
149 |         x = torch.tensor(
150 |             [self.tokenizer.tk2ix_[self.tokenizer.start_token]] * batch_size,
151 |             dtype=torch.long,
152 |             device=self.device,
153 |         )
154 |         h = self.init_hidden(batch_size)
155 |         sequences = torch.zeros(
156 |             batch_size, max_len, dtype=torch.long, device=self.device
157 |         )
158 |         is_end = torch.zeros(batch_size, dtype=torch.bool, device=self.device)
159 | 
160 |         for step in range(max_len):
161 |             logit, h = self(x, h)
162 |             proba = logit.softmax(dim=-1)
163 |             x = torch.multinomial(proba, 1).view(-1)
164 |             x[is_end] = self.tokenizer.tk2ix_[self.tokenizer.stop_token]
165 |             sequences[:, step] = x
166 | 
167 |             end_token = x == self.tokenizer.tk2ix_[self.tokenizer.stop_token]
168 |             is_end = torch.ge(is_end + end_token, 1)
169 |             if (is_end == 1).all():
170 |                 break
171 |         return sequences
172 | 
173 | 
class GRUGenerator(nn.Module):
    """Placeholder for a GRU-based generator; it adds nothing on top of ``nn.Module`` yet."""
176 | 


--------------------------------------------------------------------------------
/rxitect/tokenizers.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from abc import ABC, abstractmethod
  3 | from typing import Dict, List
  4 | 
  5 | import selfies as sf
  6 | import torch
  7 | 
  8 | 
  9 | class Tokenizer(ABC):
 10 |     start_token: str
 11 |     stop_token: str
 12 |     pad_token: str
 13 | 
 14 |     # Inferred attrs
 15 |     vocabulary_size_: int
 16 |     tk2ix_: Dict[str, int]
 17 |     ix2tk_: Dict[int, str]
 18 | 
 19 |     @abstractmethod
 20 |     def encode(self, molecules: List[str]) -> torch.Tensor:
 21 |         pass
 22 | 
 23 |     @abstractmethod
 24 |     def decode(self, encoded_molecules: torch.Tensor) -> List[str]:
 25 |         pass
 26 | 
 27 |     def _get_vocabulary_from_file(self, vocabulary_filepath: str) -> List[str]:
 28 |         with open(vocabulary_filepath, "r") as f:
 29 |             vocabulary = f.read().splitlines()
 30 | 
 31 |         return sorted(vocabulary)
 32 | 
 33 | 
 34 | class SmilesTokenizer(Tokenizer):
 35 |     def __init__(self, vocabulary_filepath: str) -> None:
 36 |         self.pad_token = "<pad>"
 37 |         self.start_token = "GO"
 38 |         self.stop_token = "EOS"
 39 |         SENTINEL_TOKENS = [self.pad_token, self.start_token, self.stop_token]
 40 |         self.vocabulary = SENTINEL_TOKENS + self._get_vocabulary_from_file(
 41 |             vocabulary_filepath
 42 |         )
 43 |         self.vocabulary_size_ = len(self.vocabulary)
 44 |         self.tk2ix_ = dict(zip(self.vocabulary, range(self.vocabulary_size_)))
 45 |         self.ix2tk_ = {ix: tk for tk, ix in self.tk2ix_.items()}
 46 | 
 47 |     def encode(self, molecule: str) -> torch.Tensor:
 48 |         tokenized_smiles = self._tokenize(molecule)
 49 |         encoded_smiles = torch.zeros(len(tokenized_smiles), dtype=torch.long)
 50 |         for i, token in enumerate(tokenized_smiles):
 51 |             encoded_smiles[i] = self.tk2ix_[token]
 52 |         return encoded_smiles
 53 | 
 54 |     def batch_encode(self, molecules: List[str]) -> torch.Tensor:
 55 |         max_len = max([len(mol) for mol in molecules])
 56 |         encoded_smiles = torch.zeros(len(molecules), max_len, dtype=torch.long)
 57 |         for i, smi in enumerate(molecules):
 58 |             tokenized_smi = self._tokenize(smi)
 59 |             for j, token in enumerate(tokenized_smi):
 60 |                 encoded_smiles[i, j] = self.tk2ix_[token]
 61 |         return encoded_smiles
 62 | 
 63 |     def decode(self, encoded_molecule: torch.Tensor) -> List[str]:
 64 |         encoded_molecule = encoded_molecule.cpu().detach().numpy()
 65 |         chars = []
 66 |         for i in encoded_molecule:
 67 |             if i == self.tk2ix_[self.stop_token]:
 68 |                 break
 69 |             chars.append(self.ix2tk_[i])
 70 |         smiles = "".join(chars)
 71 |         smiles = smiles.replace("L", "Cl").replace("R", "Br")
 72 |         return smiles
 73 | 
 74 |     def batch_decode(self, encoded_molecules: torch.Tensor) -> List[str]:
 75 |         decoded_smiles = []
 76 |         encoded_molecules = encoded_molecules.cpu().detach().numpy()
 77 |         for enc_smiles in encoded_molecules:
 78 |             chars = []
 79 |             for i in enc_smiles:
 80 |                 if i == self.tk2ix_[self.stop_token]:
 81 |                     break
 82 |                 chars.append(self.ix2tk_[i])
 83 |             smiles = "".join(chars)
 84 |             smiles = smiles.replace("L", "Cl").replace("R", "Br")
 85 |             decoded_smiles.append(smiles)
 86 |         return decoded_smiles
 87 | 
 88 |     def _tokenize(self, smiles: str) -> List[str]:
 89 |         """
 90 |         Takes a SMILES string and returns a list containing the tokens its composed of.
 91 |         SOURCE: https://github.com/MarcusOlivecrona/REINVENT/
 92 | 
 93 |         Parameters
 94 |         ----------
 95 |         smiles: A SMILES string representing a molecule
 96 |         """
 97 |         regex = "(\[[^\[\]]{1,6}\])"
 98 |         smiles = self._replace_halogen(smiles)
 99 |         char_list = re.split(regex, smiles)
100 |         tokenized = []
101 |         for char in char_list:
102 |             if char.startswith("["):
103 |                 tokenized.append(char)
104 |             else:
105 |                 chars = [unit for unit in char]
106 |                 [tokenized.append(unit) for unit in chars]
107 |         tokenized.append(self.stop_token)
108 |         return tokenized
109 | 
110 |     def _replace_halogen(self, smiles: str) -> str:
111 |         """Regex to replace Br and Cl with single letters"""
112 |         br = re.compile("Br")
113 |         cl = re.compile("Cl")
114 |         smiles = br.sub("R", smiles)
115 |         smiles = cl.sub("L", smiles)
116 | 
117 |         return smiles
118 | 
119 | 
class SelfiesTokenizer(Tokenizer):
    """Tokenizer for SELFIES strings with fixed-length encodings.

    Index 0 is the pad token ``[nop]``, followed by the start and stop tokens
    and then the sorted vocabulary read from file.
    """

    def __init__(self, vocabulary_filepath: str, max_len: int) -> None:
        """
        Parameters
        ----------
        vocabulary_filepath : str
            Path of a text file holding one vocabulary token per line.
        max_len : int
            Length of every encoded molecule, stop token included.
        """
        self.start_token = "[GO]"
        self.stop_token = "[EOS]"
        self.pad_token = "[nop]"
        sentinel_tokens = [self.pad_token, self.start_token, self.stop_token]
        self.vocabulary = sentinel_tokens + self._get_vocabulary_from_file(
            vocabulary_filepath
        )
        self.vocabulary_size_ = len(self.vocabulary)
        self.max_len = max_len
        self.tk2ix_ = dict(zip(self.vocabulary, range(self.vocabulary_size_)))
        self.ix2tk_ = {ix: tk for tk, ix in self.tk2ix_.items()}

    def encode(self, molecule: str) -> torch.Tensor:
        """Encode one SELFIES string as a long tensor of length `max_len`.

        Positions after the stop token are left at 0. A molecule with more
        than `max_len` tokens (stop token included) raises an IndexError.
        """
        encoded = torch.zeros(self.max_len, dtype=torch.long)
        for i, token in enumerate(self._tokenize(molecule)):
            encoded[i] = self.tk2ix_[token]
        return encoded

    def batch_encode(self, molecules: List[str]) -> torch.Tensor:
        """Encode several SELFIES strings as a (len(molecules), max_len) long tensor."""
        encoded = torch.zeros(len(molecules), self.max_len, dtype=torch.long)
        for row, molecule in enumerate(molecules):
            encoded[row] = self.encode(molecule)
        return encoded

    def decode(self, encoded_molecule: torch.Tensor) -> str:
        """Decode one 1-D tensor of token indices into a SELFIES string.

        Decoding stops at (and excludes) the first stop token.
        """
        stop_ix = self.tk2ix_[self.stop_token]
        chars = []
        for ix in encoded_molecule.cpu().detach().numpy():
            if ix == stop_ix:
                break
            chars.append(self.ix2tk_[ix])
        return "".join(chars)

    def batch_decode(self, encoded_molecules: torch.Tensor) -> List[str]:
        """Decode a 2-D tensor of token indices into one SELFIES string per row."""
        return [self.decode(row) for row in encoded_molecules]

    def _tokenize(self, selfies: str) -> List[str]:
        """
        Takes a SELFIES string and returns a list containing the tokens its composed of.
        The stop token is appended at the end.

        Parameters
        ----------
        selfies: A SELFIES string representing a molecule
        """
        tokenized_selfies = list(sf.split_selfies(selfies))
        tokenized_selfies.append(self.stop_token)
        return tokenized_selfies
164 | 
165 | 
def get_tokenizer(
    molecule_repr: str, vocabulary_filepath: str, max_len: int = 140
) -> Tokenizer:
    """Build the tokenizer matching a molecular string representation.

    Parameters
    ----------
    molecule_repr : str
        Either "smiles" or "selfies".
    vocabulary_filepath : str
        Path of the vocabulary file handed to the tokenizer.
    max_len : int, optional
        Encoded length used by the SELFIES tokenizer, which cannot be built
        without it; ignored for SMILES. The default of 140 mirrors the default
        `max_len` of `LSTMGenerator.sample`.

    Returns
    -------
    Tokenizer
        A `SmilesTokenizer` or a `SelfiesTokenizer`.

    Raises
    ------
    ValueError
        If `molecule_repr` is neither "smiles" nor "selfies".
    """
    if molecule_repr == "smiles":
        return SmilesTokenizer(vocabulary_filepath=vocabulary_filepath)
    if molecule_repr == "selfies":
        return SelfiesTokenizer(
            vocabulary_filepath=vocabulary_filepath, max_len=max_len
        )
    raise ValueError(
        f"Unknown molecule representation {molecule_repr!r}; "
        "expected 'smiles' or 'selfies'."
    )
173 | 


--------------------------------------------------------------------------------
/rxitect/trainers/gfn_trainer.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | from abc import ABC, abstractmethod
  4 | from pathlib import Path
  5 | from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple
  6 | 
  7 | import torch
  8 | from rdkit.Chem.rdchem import Mol
  9 | from torch import nn
 10 | from torch.types import Device
 11 | from torch.utils.data import DataLoader, Dataset
 12 | from torch.utils.tensorboard.writer import SummaryWriter
 13 | from torch_geometric.data import Batch
 14 | 
 15 | from rxitect.data.iterators import SamplingIterator
 16 | from rxitect.envs.contexts import ActionCategorical
 17 | from rxitect.utils.multiprocessing_proxy import wrap_model_mp
 18 | 
 19 | if TYPE_CHECKING:
 20 |     from rxitect.algorithms.gfn_algorithm import GFNAlgorithm
 21 |     from rxitect.envs.contexts import GraphEnvContext
 22 |     from rxitect.tasks.gfn_task import GFNTask
 23 | 
 24 | 
 25 | class GFNTrainer:
 26 |     def __init__(self, hps: Dict[str, Any], device: Device):
 27 |         """A GFlowNet trainer. Contains the main training loop in `run` and should be subclassed.
 28 |         Parameters
 29 |         ----------
 30 |         hps: Dict[str, Any]
 31 |             A dictionary of hyperparameters. These override default values obtained by the `default_hps` method.
 32 |         device: Device
 33 |             The torch device of the main worker.
 34 |         """
 35 |         # self.setup should at least set these up:
 36 |         self.training_data: Dataset = None
 37 |         self.test_data: Dataset = None
 38 |         self.model: nn.Module = None
 39 |         # `sampling_model` is used by the data workers to sample new objects from the model. Can be
 40 |         # the same as `model`.
 41 |         self.sampling_model: nn.Module = None
 42 |         self.mb_size: int = None
 43 |         self.ctx: GraphEnvContext = None
 44 |         self.task: GFNTask = None
 45 |         self.algo: GFNAlgorithm = None
 46 | 
 47 |         # Override default hyperparameters with the constructor arguments
 48 |         self.hps = {**self.default_hps(), **hps}
 49 |         self.device = device
 50 |         # The number of processes spawned to sample object and do CPU work
 51 |         self.num_workers: int = self.hps.get("num_data_loader_workers", 0)
 52 |         # The offline_ratio of samples drawn from `self.training_data` during training. The rest is drawn from
 53 |         # `self.sampling_model`.
 54 |         self.offline_ratio: float = 0.5
 55 |         # idem, but from `self.test_data` during validation.
 56 |         self.valid_offline_ratio: float = 1
 57 |         # If True, print messages during training
 58 |         self.verbose: bool = False
 59 |         # These hooks allow us to compute extra quantities when sampling data
 60 |         self.sampling_hooks: List[Callable] = []
 61 | 
 62 |         self.setup()
 63 | 
 64 |     def default_hps(self) -> Dict[str, Any]:
 65 |         raise NotImplementedError()
 66 | 
 67 |     def setup(self):
 68 |         raise NotImplementedError()
 69 | 
 70 |     def step(self, loss: torch.Tensor):
 71 |         raise NotImplementedError()
 72 | 
 73 |     def _wrap_model_mp(self, model):
 74 |         """Wraps a nn.Module instance so that it can be shared to `DataLoader` workers."""
 75 |         model.to(self.device)
 76 |         if self.num_workers > 0:
 77 |             placeholder = wrap_model_mp(
 78 |                 model, self.num_workers, cast_types=(Batch, ActionCategorical)
 79 |             )
 80 |             return placeholder, torch.device("cpu")
 81 |         return model, self.device
 82 | 
 83 |     def build_training_data_loader(self) -> DataLoader:
 84 |         model, dev = self._wrap_model_mp(self.sampling_model)
 85 |         iterator = SamplingIterator(
 86 |             self.training_data,
 87 |             model,
 88 |             self.mb_size * 2,
 89 |             self.ctx,
 90 |             self.algo,
 91 |             self.task,
 92 |             dev,
 93 |             offline_ratio=self.offline_ratio,
 94 |             log_dir=self.hps["log_dir"],
 95 |         )
 96 |         for hook in self.sampling_hooks:
 97 |             iterator.add_log_hook(hook)
 98 |         return torch.utils.data.DataLoader(
 99 |             iterator,
100 |             batch_size=None,
101 |             num_workers=self.num_workers,
102 |             persistent_workers=self.num_workers > 0,
103 |         )
104 | 
105 |     def build_validation_data_loader(self) -> DataLoader:
106 |         model, dev = self._wrap_model_mp(self.model)
107 |         iterator = SamplingIterator(
108 |             self.test_data,
109 |             model,
110 |             self.mb_size,
111 |             self.ctx,
112 |             self.algo,
113 |             self.task,
114 |             dev,
115 |             offline_ratio=self.valid_offline_ratio,
116 |             stream=False,
117 |         )
118 |         return torch.utils.data.DataLoader(
119 |             iterator,
120 |             batch_size=None,
121 |             num_workers=self.num_workers,
122 |             persistent_workers=self.num_workers > 0,
123 |         )
124 | 
125 |     def train_batch(
126 |         self, batch: Batch, epoch_idx: int, batch_idx: int
127 |     ) -> Dict[str, Any]:
128 |         loss, info = self.algo.compute_batch_losses(
129 |             self.model, batch, num_bootstrap=self.mb_size
130 |         )
131 |         self.step(loss)
132 |         if hasattr(batch, "extra_info"):
133 |             info.update(batch.extra_info)
134 |         return {k: v.item() if hasattr(v, "item") else v for k, v in info.items()}
135 | 
136 |     def evaluate_batch(
137 |         self, batch: Batch, epoch_idx: int = 0, batch_idx: int = 0
138 |     ) -> Dict[str, Any]:
139 |         loss, info = self.algo.compute_batch_losses(
140 |             self.model, batch, num_bootstrap=batch.num_offline
141 |         )
142 |         return {k: v.item() if hasattr(v, "item") else v for k, v in info.items()}
143 | 
144 |     def run(self):
145 |         """Trains the GFN for `num_training_steps` minibatches, performing
146 |         validation every `validate_every` minibatches.
147 |         """
148 |         self.model.to(self.device)
149 |         self.sampling_model.to(self.device)
150 |         epoch_length = max(len(self.training_data), 1)
151 |         train_dl = self.build_training_data_loader()
152 |         valid_dl = self.build_validation_data_loader()
153 |         for it, batch in zip(range(1, 1 + self.hps["num_training_steps"]), train_dl):
154 |             epoch_idx = it // epoch_length
155 |             batch_idx = it % epoch_length
156 |             info = self.train_batch(batch.to(self.device), epoch_idx, batch_idx)
157 |             if self.verbose:
158 |                 print(it, " ".join(f"{k}:{v:.2f}" for k, v in info.items()))
159 |             self.log(info, it, "train")
160 | 
161 |             if it % self.hps["validate_every"] == 0:
162 |                 for val_batch in valid_dl:
163 |                     info = self.evaluate_batch(
164 |                         val_batch.to(self.device), epoch_idx, batch_idx
165 |                     )
166 |                     self.log(info, it, "valid")
167 |                 torch.save(
168 |                     {
169 |                         "models_state_dict": [self.model.state_dict()],
170 |                         "hps": self.hps,
171 |                     },
172 |                     open(Path(self.hps["log_dir"]) / "model_state.pt", "wb"),
173 |                 )
174 | 
175 |     def log(self, info, index, key):
176 |         if not hasattr(self, "_summary_writer"):
177 |             self._summary_writer = SummaryWriter(self.hps["log_dir"])
178 |         for k, v in info.items():
179 |             self._summary_writer.add_scalar(f"{key}_{k}", v, index)
180 | 


--------------------------------------------------------------------------------
/rxitect/tasks/original_task.py:
--------------------------------------------------------------------------------
  1 | import ast
  2 | import copy
  3 | 
  4 | import numpy as np
  5 | from rdkit import RDLogger
  6 | from rdkit.Chem.rdchem import Mol
  7 | from scipy.stats import stats
  8 | from torch.types import Device
  9 | 
 10 | from rxitect.algorithms import TrajectoryBalance
 11 | from rxitect.envs import FragmentEnv, FragmentEnvContext
 12 | from rxitect.models import bengio2021flow, FragmentBasedGFN
 13 | from rxitect.tasks import GFNTask, FlatRewards, ScalarReward
 14 | import torch
 15 | from torch_geometric.data import Batch
 16 | from torch.utils.data import DataLoader, Dataset
 17 | from torch import nn
 18 | from typing import TYPE_CHECKING, Tuple, Union, List, Callable, Dict, Any
 19 | 
 20 | from rxitect.trainers import GFNTrainer
 21 | from rxitect.utils.transforms import thermometer
 22 | 
 23 | 
 24 | class SEHTask(GFNTask):
 25 |     """Sets up a task where the reward is computed using a proxy for the binding energy of a molecule to
 26 |     Soluble Epoxide Hydrolases.
 27 |     The proxy is pretrained, and obtained from the original GFlowNet paper, see `gflownet.models.bengio2021flow`.
 28 |     This setup essentially reproduces the results of the Trajectory Balance paper when using the TB
 29 |     objective, or of the original paper when using Flow Matching (TODO: port to this repo).
 30 |     """
 31 |     def __init__(self, dataset: Dataset, temperature_distribution: str, temperature_parameters: Tuple[float],
 32 |                  wrap_model: Callable[[nn.Module], nn.Module] = None):
 33 |         self._wrap_model = wrap_model
 34 |         self.models = self._load_task_models()
 35 |         self.dataset = dataset
 36 |         self.temperature_sample_dist = temperature_distribution
 37 |         self.temperature_dist_params = temperature_parameters
 38 | 
 39 |     def flat_reward_transform(self, y: Union[float, torch.Tensor]) -> FlatRewards:
 40 |         return FlatRewards(torch.as_tensor(y) / 8)
 41 | 
 42 |     def inverse_flat_reward_transform(self, rp):
 43 |         return rp * 8
 44 | 
 45 |     def _load_task_models(self):
 46 |         model = bengio2021flow.load_original_model()
 47 |         model, self.device = self._wrap_model(model)
 48 |         return {'seh': model}
 49 | 
 50 |     def sample_conditional_information(self, n):
 51 |         beta = None
 52 |         if self.temperature_sample_dist == 'gamma':
 53 |             loc, scale = self.temperature_dist_params
 54 |             beta = self.rng.gamma(loc, scale, n).astype(np.float32)
 55 |             upper_bound = stats.gamma.ppf(0.95, loc, scale=scale)
 56 |         elif self.temperature_sample_dist == 'uniform':
 57 |             beta = self.rng.uniform(*self.temperature_dist_params, n).astype(np.float32)
 58 |             upper_bound = self.temperature_dist_params[1]
 59 |         elif self.temperature_sample_dist == 'beta':
 60 |             beta = self.rng.beta(*self.temperature_dist_params, n).astype(np.float32)
 61 |             upper_bound = 1
 62 |         else:
 63 |             raise ValueError()
 64 |         beta_enc = thermometer(torch.tensor(beta), 32, 0, upper_bound)  # TODO: hyperparameters
 65 |         return {'beta': torch.tensor(beta), 'encoding': beta_enc}
 66 | 
 67 |     def cond_info_to_reward(self, cond_info: Dict[str, torch.Tensor], flat_reward: FlatRewards) -> ScalarReward:
 68 |         if isinstance(flat_reward, list):
 69 |             flat_reward = torch.tensor(flat_reward)
 70 |         return flat_reward**cond_info['beta']
 71 | 
 72 |     def compute_flat_rewards(self, mols: List[Mol]) -> Tuple[FlatRewards, torch.Tensor]:
 73 |         graphs = [bengio2021flow.mol2graph(i) for i in mols]
 74 |         is_valid = torch.tensor([i is not None for i in graphs]).bool()
 75 |         if not is_valid.any():
 76 |             return FlatRewards(torch.zeros((0,))), is_valid
 77 |         batch = Batch.from_data_list([i for i in graphs if i is not None])
 78 |         batch.to(self.device)
 79 |         preds = self.models['seh'](batch).reshape((-1,)).data.cpu()
 80 |         preds[preds.isnan()] = 0
 81 |         preds = self.flat_reward_transform(preds).clip(1e-4, 100).reshape((-1, 1))
 82 |         return FlatRewards(preds), is_valid
 83 | 
 84 | 
 85 | class SEHFragTrainer(GFNTrainer):
 86 |     def __init__(self, hps: Dict[str, Any], device: Device):
 87 |         super().__init__(hps, device)
 88 | 
 89 |     def default_hps(self) -> Dict[str, Any]:
 90 |         return {
 91 |             'bootstrap_own_reward': False,
 92 |             'learning_rate': 1e-4,
 93 |             'global_batch_size': 64,
 94 |             'num_emb': 128,
 95 |             'num_layers': 4,
 96 |             'tb_epsilon': None,
 97 |             'illegal_action_logreward': -75,
 98 |             'reward_loss_multiplier': 1,
 99 |             'temperature_sample_dist': 'uniform',
100 |             'temperature_dist_params': '(.5, 32)',
101 |             'weight_decay': 1e-8,
102 |             'num_data_loader_workers': 1,
103 |             'momentum': 0.9,
104 |             'adam_eps': 1e-8,
105 |             'lr_decay': 20_000,
106 |             'Z_lr_decay': 20_000,
107 |             'clip_grad_type': 'norm',
108 |             'clip_grad_param': 10,
109 |             'random_action_prob': 0.,
110 |             'sampling_tau': 0.,
111 |             'num_cond_dim': 32,
112 |         }
113 | 
114 |     def setup(self):
115 |         hps = self.hps
116 |         RDLogger.DisableLog('rdApp.*')
117 |         self.rng = np.random.default_rng(142857)
118 |         self.env = FragmentEnv()
119 |         self.ctx = FragmentEnvContext(max_frags=9, num_cond_dim=hps['num_cond_dim'])
120 |         self.training_data = []
121 |         self.test_data = []
122 |         self.offline_ratio = 0
123 |         self.valid_offline_ratio = 0
124 | 
125 |         model = FragmentBasedGFN(self.ctx, num_emb=hps['num_emb'], num_layers=hps['num_layers'])
126 |         self.model = model
127 |         # Separate Z parameters from non-Z to allow for LR decay on the former
128 |         Z_params = list(model.log_z.parameters())
129 |         non_Z_params = [i for i in self.model.parameters() if all(id(i) != id(j) for j in Z_params)]
130 |         self.opt = torch.optim.Adam(non_Z_params, hps['learning_rate'], (hps['momentum'], 0.999),
131 |                                     weight_decay=hps['weight_decay'], eps=hps['adam_eps'])
132 |         self.opt_Z = torch.optim.Adam(Z_params, hps['learning_rate'], (0.9, 0.999))
133 |         self.lr_sched = torch.optim.lr_scheduler.LambdaLR(self.opt, lambda steps: 2**(-steps / hps['lr_decay']))
134 |         self.lr_sched_Z = torch.optim.lr_scheduler.LambdaLR(self.opt_Z, lambda steps: 2**(-steps / hps['Z_lr_decay']))
135 | 
136 |         self.sampling_tau = hps['sampling_tau']
137 |         if self.sampling_tau > 0:
138 |             self.sampling_model = copy.deepcopy(model)
139 |         else:
140 |             self.sampling_model = self.model
141 |         eps = hps['tb_epsilon']
142 |         hps['tb_epsilon'] = ast.literal_eval(eps) if isinstance(eps, str) else eps
143 |         self.algo = TrajectoryBalance(self.env, self.ctx, self.rng, hps, max_nodes=9)
144 | 
145 |         self.task = SEHTask(self.training_data, hps['temperature_sample_dist'],
146 |                             ast.literal_eval(hps['temperature_dist_params']), wrap_model=self._wrap_model_mp)
147 |         self.mb_size = hps['global_batch_size']
148 |         self.clip_grad_param = hps['clip_grad_param']
149 |         self.clip_grad_callback = {
150 |             'value': (lambda params: torch.nn.utils.clip_grad_value_(params, self.clip_grad_param)),
151 |             'norm': (lambda params: torch.nn.utils.clip_grad_norm_(params, self.clip_grad_param)),
152 |             'none': (lambda x: None)
153 |         }[hps['clip_grad_type']]
154 | 
155 |     def step(self, loss: torch.Tensor):
156 |         loss.backward()
157 |         for i in self.model.parameters():
158 |             self.clip_grad_callback(i)
159 |         self.opt.step()
160 |         self.opt.zero_grad()
161 |         self.opt_Z.step()
162 |         self.opt_Z.zero_grad()
163 |         self.lr_sched.step()
164 |         self.lr_sched_Z.step()
165 |         if self.sampling_tau > 0:
166 |             for a, b in zip(self.model.parameters(), self.sampling_model.parameters()):
167 |                 b.data.mul_(self.sampling_tau).add_(a.data * (1 - self.sampling_tau))
168 | 
169 | 
def main():
    """Example of how this model can be run outside Determined.

    Runs a short CPU training job (10 steps, validation every 5) that logs under the project root.
    """
    from pyprojroot import here

    run_dir = here() / 'scratch/logs/seh_frag/run_0/'
    overrides = dict(
        lr_decay=10,
        qm9_h5_path='data/chem/qm9/qm9.h5',
        log_dir=str(run_dir),
        num_training_steps=10,
        validate_every=5,
        sampling_tau=0.99,
        temperature_dist_params='(0, 64)',
    )
    trainer = SEHFragTrainer(overrides, torch.device('cpu'))
    trainer.verbose = True
    print(f"params: {trainer.hps}")
    trainer.run()
187 | 
188 | 
# Run the example training job only when this file is executed as a script.
if __name__ == "__main__":
    main()
191 | 


--------------------------------------------------------------------------------
/rxitect/models/bengio2021flow.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This is code adapted from Bengio et al. (2021), 'Flow Network based
  3 | Generative Models for Non-Iterative Diverse Candidate Generation',
  4 | from
  5 |    https://github.com/GFNOrg/gflownet
  6 | In particular, this model class allows us to compare to the same
  7 | target proxy used in that paper (sEH binding affinity prediction).
  8 | """
  9 | import gzip
 10 | import os
 11 | import pickle  # nosec
 12 | 
 13 | import numpy as np
 14 | import requests  # type: ignore
 15 | import torch
 16 | import torch.nn as nn
 17 | import torch.nn.functional as F
 18 | from rdkit import RDConfig
 19 | from rdkit.Chem import ChemicalFeatures
 20 | from rdkit.Chem.rdchem import BondType as BT
 21 | from rdkit.Chem.rdchem import HybridizationType
 22 | from torch_geometric.data import Batch, Data
 23 | from torch_geometric.nn import NNConv, Set2Set
 24 | from torch_sparse import coalesce
 25 | 
NUM_ATOMIC_NUMBERS = 56  # Width of the per-atom one-hot atomic-number encoding (atomic numbers 1..56, i.e. up to Ba)
 27 | 
 28 | 
 29 | class MPNNet(nn.Module):
 30 |     def __init__(
 31 |         self,
 32 |         num_feat=14,
 33 |         num_vec=3,
 34 |         dim=64,
 35 |         num_out_per_mol=1,
 36 |         num_out_per_stem=105,
 37 |         num_out_per_bond=1,
 38 |         num_conv_steps=12,
 39 |     ):
 40 |         super().__init__()
 41 |         self.lin0 = nn.Linear(num_feat + num_vec, dim)
 42 |         self.num_ops = num_out_per_stem
 43 |         self.num_opm = num_out_per_mol
 44 |         self.num_conv_steps = num_conv_steps
 45 |         self.dropout_rate = 0
 46 | 
 47 |         self.act = nn.LeakyReLU()
 48 | 
 49 |         net = nn.Sequential(nn.Linear(4, 128), self.act, nn.Linear(128, dim * dim))
 50 |         self.conv = NNConv(dim, dim, net, aggr="mean")
 51 |         self.gru = nn.GRU(dim, dim)
 52 | 
 53 |         self.set2set = Set2Set(dim, processing_steps=3)
 54 |         self.lin3 = nn.Linear(dim * 2, num_out_per_mol)
 55 |         self.bond2out = nn.Sequential(
 56 |             nn.Linear(dim * 2, dim),
 57 |             self.act,
 58 |             nn.Linear(dim, dim),
 59 |             self.act,
 60 |             nn.Linear(dim, num_out_per_bond),
 61 |         )
 62 | 
 63 |     def forward(self, data, do_dropout=False):
 64 |         out = self.act(self.lin0(data.x))
 65 |         h = out.unsqueeze(0)
 66 |         h = F.dropout(h, training=do_dropout, p=self.dropout_rate)
 67 | 
 68 |         for i in range(self.num_conv_steps):
 69 |             m = self.act(self.conv(out, data.edge_index, data.edge_attr))
 70 |             m = F.dropout(m, training=do_dropout, p=self.dropout_rate)
 71 |             out, h = self.gru(m.unsqueeze(0).contiguous(), h.contiguous())
 72 |             h = F.dropout(h, training=do_dropout, p=self.dropout_rate)
 73 |             out = out.squeeze(0)
 74 | 
 75 |         global_out = self.set2set(out, data.batch)
 76 |         global_out = F.dropout(global_out, training=do_dropout, p=self.dropout_rate)
 77 |         per_mol_out = self.lin3(global_out)  # per mol scalar outputs
 78 |         return per_mol_out
 79 | 
 80 | 
 81 | def load_original_model():
 82 |     num_feat = 14 + 1 + NUM_ATOMIC_NUMBERS
 83 |     mpnn = MPNNet(
 84 |         num_feat=num_feat,
 85 |         num_vec=0,
 86 |         dim=64,
 87 |         num_out_per_mol=1,
 88 |         num_out_per_stem=105,
 89 |         num_conv_steps=12,
 90 |     )
 91 |     f = requests.get(
 92 |         "https://github.com/GFNOrg/gflownet/raw/master/mols/data/pretrained_proxy/best_params.pkl.gz",
 93 |         stream=True,
 94 |     )
 95 |     params = pickle.load(gzip.open(f.raw))  # nosec
 96 |     param_map = {
 97 |         "lin0.weight": params[0],
 98 |         "lin0.bias": params[1],
 99 |         "conv.bias": params[3],
100 |         "conv.nn.0.weight": params[4],
101 |         "conv.nn.0.bias": params[5],
102 |         "conv.nn.2.weight": params[6],
103 |         "conv.nn.2.bias": params[7],
104 |         "conv.lin.weight": params[2],
105 |         "gru.weight_ih_l0": params[8],
106 |         "gru.weight_hh_l0": params[9],
107 |         "gru.bias_ih_l0": params[10],
108 |         "gru.bias_hh_l0": params[11],
109 |         "set2set.lstm.weight_ih_l0": params[16],
110 |         "set2set.lstm.weight_hh_l0": params[17],
111 |         "set2set.lstm.bias_ih_l0": params[18],
112 |         "set2set.lstm.bias_hh_l0": params[19],
113 |         "lin3.weight": params[20],
114 |         "lin3.bias": params[21],
115 |     }
116 |     for k, v in param_map.items():
117 |         mpnn.get_parameter(k).data = torch.tensor(v)
118 |     return mpnn
119 | 
120 | 
# One-slot cache for the RDKit feature factory that `mpnn_feat` builds on first use with `donor_features=True`.
_mpnn_feat_cache = [None]
122 | 
123 | 
def mpnn_feat(
    mol, ifcoord=True, panda_fmt=False, one_hot_atom=False, donor_features=False
):
    """Featurize an RDKit molecule into atom features, coordinates, bond indices and bond features.

    Atom feature columns are: one-hot atom type (H, C, N, O, F, other), atomic number, acceptor,
    donor, aromatic, sp, sp2, sp3, number of hydrogens and, when `one_hot_atom` is set, a one-hot
    of the atomic number (in which case the scalar atomic-number column stays zero).

    Returns
    -------
    `(atmfeat, coord, bond, bondfeat)`; `coord` is None unless `ifcoord` is set. `panda_fmt` is
    accepted but not used.
    """
    atomtypes = {"H": 0, "C": 1, "N": 2, "O": 3, "F": 4}
    bondtypes = {
        BT.SINGLE: 0,
        BT.DOUBLE: 1,
        BT.TRIPLE: 2,
        BT.AROMATIC: 3,
        BT.UNSPECIFIED: 0,
    }

    natm = len(mol.GetAtoms())
    ntypes = len(atomtypes)

    nfeat = ntypes + 1 + 8
    if one_hot_atom:
        nfeat += NUM_ATOMIC_NUMBERS
    atmfeat = np.zeros((natm, nfeat))

    # Column of each hybridization flag.
    hybridization_cols = (
        (ntypes + 5, HybridizationType.SP),
        (ntypes + 6, HybridizationType.SP2),
        (ntypes + 7, HybridizationType.SP3),
    )

    for row, atom in enumerate(mol.GetAtoms()):
        # Symbols outside `atomtypes` share the sixth "other" column.
        atmfeat[row, atomtypes.get(atom.GetSymbol(), 5)] = 1
        atomic_num = atom.GetAtomicNum()
        if one_hot_atom:
            atmfeat[row, ntypes + 9 + atomic_num - 1] = 1
        else:
            atmfeat[row, ntypes + 1] = (atomic_num % 16) / 2.0
        atmfeat[row, ntypes + 4] = atom.GetIsAromatic()
        hybridization = atom.GetHybridization()
        for col, kind in hybridization_cols:
            atmfeat[row, col] = hybridization == kind
        atmfeat[row, ntypes + 8] = atom.GetTotalNumHs(includeNeighbors=True)

    if donor_features:
        # Build the feature factory once and reuse it across calls.
        factory = _mpnn_feat_cache[0]
        if factory is None:
            fdef_name = os.path.join(RDConfig.RDDataDir, "BaseFeatures.fdef")
            factory = ChemicalFeatures.BuildFeatureFactory(fdef_name)
            _mpnn_feat_cache[0] = factory
        family_cols = {"Donor": ntypes + 3, "Acceptor": ntypes + 2}
        for feature in factory.GetFeaturesForMol(mol):
            col = family_cols.get(feature.GetFamily())
            if col is None:
                continue
            for atom_id in feature.GetAtomIds():
                atmfeat[atom_id, col] = 1

    coord = (
        np.asarray([mol.GetConformer(0).GetAtomPosition(j) for j in range(natm)])
        if ifcoord
        else None
    )

    # Bond end points and one-hot bond types (UNSPECIFIED shares the SINGLE class).
    bond = np.asarray(
        [[b.GetBeginAtomIdx(), b.GetEndAtomIdx()] for b in mol.GetBonds()]
    )
    bond_classes = [bondtypes[b.GetBondType()] for b in mol.GetBonds()]
    bondfeat = onehot(bond_classes, num_classes=len(bondtypes) - 1)

    return atmfeat, coord, bond, bondfeat
195 | 
196 | 
def mol_to_graph_backend(atmfeat, coord, bond, bondfeat, props=None, data_cls=Data):
    """Convert featurized atoms and bonds into a PyTorch Geometric data object.

    Parameters
    ----------
    atmfeat: array of per-atom features, one row per atom
    coord: per-atom coordinates, or None to omit the `pos` field
    bond: array of bond end points, one `[begin, end]` row per bond (may be empty)
    bondfeat: array of per-bond features, one row per bond
    props: optional mapping of extra keyword arguments forwarded to `data_cls`
    data_cls: class (or callable) used to build the result, `Data` by default

    Returns
    -------
    `data_cls(x=..., edge_index=..., edge_attr=..., [pos=...], **props)` with float32 features and
    int64 edge indices. Every bond is emitted in both directions.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    props = {} if props is None else props
    natm = atmfeat.shape[0]
    # transform to torch_geometric bond format; send edges both ways; sort bonds
    atmfeat = torch.tensor(atmfeat, dtype=torch.float32)
    if bond.shape[0] > 0:
        edge_index = torch.tensor(
            np.concatenate([bond.T, np.flipud(bond.T)], axis=1), dtype=torch.int64
        )
        edge_attr = torch.tensor(
            np.concatenate([bondfeat, bondfeat], axis=0), dtype=torch.float32
        )
        edge_index, edge_attr = coalesce(edge_index, edge_attr, natm, natm)
    else:
        # NOTE: the (0, 2) shape is relied upon by `mol2graph`, which detects edge-less graphs
        # through `edge_index.shape[0] == 0`.
        edge_index = torch.zeros((0, 2), dtype=torch.int64)
        edge_attr = torch.tensor(bondfeat, dtype=torch.float32)

    # make torch data
    if coord is not None:
        coord = torch.tensor(coord, dtype=torch.float32)
        data = data_cls(
            x=atmfeat, pos=coord, edge_index=edge_index, edge_attr=edge_attr, **props
        )
    else:
        data = data_cls(x=atmfeat, edge_index=edge_index, edge_attr=edge_attr, **props)
    return data
223 | 
224 | 
def onehot(arr, num_classes, dtype=int):
    """One-hot encode a 1-D sequence of class indices.

    Parameters
    ----------
    arr: 1-D sequence of integer class indices
    num_classes: number of columns of the result
    dtype: dtype of the returned array (defaults to the platform integer)

    Returns
    -------
    An array of shape `(len(arr), num_classes)` with a 1 at `[i, arr[i]]` and 0 elsewhere.
    """
    # `np.int` was only an alias of the builtin `int` and has been removed from NumPy (1.24+);
    # using `int` keeps the exact same dtype while letting the module import again.
    arr = np.asarray(arr, dtype=int)
    assert len(arr.shape) == 1, "dims other than 1 not implemented"
    onehot_arr = np.zeros(arr.shape + (num_classes,), dtype=dtype)
    onehot_arr[np.arange(arr.shape[0]), arr] = 1
    return onehot_arr
231 | 
232 | 
def mol2graph(mol, floatX=torch.float, bonds=False, nblocks=False):
    """Turn an RDKit molecule into the graph format expected by the proxy model.

    A `None` molecule becomes a single all-zero node. A zero "stem mask" column is appended to
    the node features, and a graph without edges receives one placeholder edge (index 0 -> 0)
    with zero attributes. `bonds` and `nblocks` are accepted but not used.
    """
    if mol is not None:
        atmfeat, _, bond, bondfeat = mpnn_feat(
            mol, ifcoord=False, one_hot_atom=True, donor_features=False
        )
        graph = mol_to_graph_backend(atmfeat, None, bond, bondfeat)
    else:
        graph = Data(
            x=torch.zeros((1, 14 + NUM_ATOMIC_NUMBERS)),
            edge_attr=torch.zeros((0, 4)),
            edge_index=torch.zeros((0, 2)).long(),
        )

    num_nodes = graph.x.shape[0]
    graph.x = torch.cat([graph.x, torch.zeros((num_nodes, 1))], 1).to(floatX)
    graph.edge_attr = graph.edge_attr.to(floatX)

    # Edge-less graphs carry a (0, 2) edge_index, so a leading dimension of 0 marks them.
    has_no_edges = graph.edge_index.shape[0] == 0
    if has_no_edges:
        num_edge_feats = graph.edge_attr.shape[1]
        graph.edge_index = torch.zeros((2, 1)).long()
        graph.edge_attr = torch.zeros((1, num_edge_feats)).to(floatX)
    return graph
253 | 
254 | 
def mols2batch(mols):
    """Collate a list of graph data objects into a single `Batch` via `Batch.from_data_list`."""
    return Batch.from_data_list(mols)
258 | 


--------------------------------------------------------------------------------
/rxitect/envs/contexts/fragment_env_context.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | from pathlib import Path
  4 | from typing import TYPE_CHECKING, List, Union
  5 | 
  6 | import numpy as np
  7 | import rdkit.Chem as Chem
  8 | import torch
  9 | from pyprojroot import here
 10 | from rdkit.Chem import Atom
 11 | from torch_geometric.data import Batch, Data
 12 | 
 13 | from rxitect.envs.contexts import Action, ActionType, Graph, GraphEnvContext
 14 | 
 15 | if TYPE_CHECKING:
 16 |     from rdkit.Chem.rdchem import Mol
 17 | 
 18 |     from rxitect.envs.contexts import ActionIndex
 19 | 
 20 | 
 21 | class FragmentEnvContext(GraphEnvContext):
 22 |     """A specification of what is being generated for a GraphBuildingEnv
 23 |     This context specifies how to create molecules fragment by fragment as encoded by a junction tree.
 24 |     Fragments are obtained from the original GFlowNet paper, Bengio et al., 2021.
 25 |     """
 26 | 
 27 |     def __init__(
 28 |         self,
 29 |         max_frags: int = 8,
 30 |         num_cond_dim: int = 0,
 31 |         frags_filepath: Union[str, Path] = here()
 32 |         / "data/processed/bengio_2021_fragments.txt",
 33 |         device: str = "cpu",
 34 |     ):
 35 |         """Construct a fragment environment
 36 |         Parameters
 37 |         ----------
 38 |         max_frags: int
 39 |             The maximum number of fragments the agent is allowed to insert.
 40 |         num_cond_dim: int
 41 |             The dimensionality of the observations' conditional information vector (if >0)
 42 |         frags_filepath: str
 43 |             The file containing the fragments available to the agent to construct molecules with. Defaults to Bengio
 44 |             et al.'s (2021) original GFlowNet paper's fragments
 45 |         device: str
 46 |             The device to process the data on, can be either 'cpu' or 'cuda'. Defaults to 'cpu'
 47 |         """
 48 |         self.max_frags = max_frags
 49 |         self.frags_smi = open(frags_filepath, "r").read().splitlines()
 50 |         self.frags_mol = [Chem.MolFromSmiles(i) for i in self.frags_smi]
 51 |         self.frags_stems = [
 52 |             [
 53 |                 atom_idx
 54 |                 for atom_idx in range(m.GetNumAtoms())
 55 |                 if m.GetAtomWithIdx(atom_idx).GetTotalNumHs() > 0
 56 |             ]
 57 |             for m in self.frags_mol
 58 |         ]
 59 |         self.frags_num_atoms = [m.GetNumAtoms() for m in self.frags_mol]
 60 |         self.num_stem_acts = most_stems = max(map(len, self.frags_stems))
 61 |         self.action_map = [
 62 |             (frag_idx, stem_idx)
 63 |             for frag_idx in range(len(self.frags_stems))
 64 |             for stem_idx in range(len(self.frags_stems[frag_idx]))
 65 |         ]
 66 |         self.num_actions = len(self.action_map)
 67 | 
 68 |         # These values are used by Models to know how many inputs/logits to produce
 69 |         self.num_new_node_values = len(self.frags_smi)
 70 |         self.num_node_attr_logits = 0
 71 |         self.num_node_dim = len(self.frags_smi) + 1
 72 |         self.num_edge_attr_logits = most_stems * 2
 73 |         self.num_edge_dim = most_stems * 2
 74 |         self.num_cond_dim = num_cond_dim
 75 |         self.num_stop_logits = 1
 76 | 
 77 |         # Order in which models have to output logits
 78 |         self.action_type_order = [
 79 |             ActionType.STOP,
 80 |             ActionType.ADD_NODE,
 81 |             ActionType.SET_EDGE_ATTR,
 82 |         ]
 83 |         self.device = torch.device(device)
 84 | 
 85 |     def idx_to_action(self, g: Data, action_idx: ActionIndex):
 86 |         """Translate an action index (e.g. from a GraphActionCategorical) to a GraphAction
 87 |         Parameters
 88 |         ----------
 89 |         g: Data
 90 |             The graph object on which this action would be applied.
 91 |         action_idx: ActionIndex
 92 |              A triple describing the type of action, and the corresponding row and column index for
 93 |              the corresponding Categorical matrix.
 94 |         Returns
 95 |         -------
 96 |         action: Action
 97 |             A graph action whose type is one of STOP, ADD_NODE, or SET_EDGE_ATTR.
 98 |         """
 99 |         act_type, act_row, act_col = [int(i) for i in action_idx]
100 |         t = self.action_type_order[act_type]
101 |         if t is ActionType.STOP:
102 |             return Action(t)
103 |         elif t is ActionType.ADD_NODE:
104 |             return Action(t, source=act_row, value=act_col)
105 |         elif t is ActionType.SET_EDGE_ATTR:
106 |             a, b = g.edge_index[
107 |                 :, act_row * 2
108 |             ]  # Edges are duplicated to get undirected GNN, deduplicated for logits
109 |             if act_col < self.num_stem_acts:
110 |                 attr = f"{int(a)}_attach"
111 |                 val = act_col
112 |             else:
113 |                 attr = f"{int(b)}_attach"
114 |                 val = act_col - self.num_stem_acts
115 |             return Action(t, source=a.item(), target=b.item(), attr=attr, value=val)
116 | 
117 |     def action_to_idx(self, g: Data, action: Action) -> ActionIndex:
118 |         """Translate a GraphAction to an index tuple
119 |         Parameters
120 |         ----------
121 |         g: Data
122 |             The graph object on which this action would be applied.
123 |         action: Action
124 |             A graph action whose type is one of Stop, AddNode, or SetEdgeAttr.
125 |         Returns
126 |         -------
127 |         action_idx: ActionIndex
128 |              A triple describing the type of action, and the corresponding row and column index for
129 |              the corresponding Categorical matrix.
130 |         """
131 |         if action.act_type is ActionType.STOP:
132 |             row = col = 0
133 |         elif action.act_type is ActionType.ADD_NODE:
134 |             row = action.source
135 |             col = action.value
136 |         elif action.act_type is ActionType.SET_EDGE_ATTR:
137 |             # Here the edges are duplicated, both (i,j) and (j,i) are in edge_index
138 |             # so no need for a double check.
139 |             row = (
140 |                 (g.edge_index.T == torch.tensor([(action.source, action.target)]))
141 |                 .prod(1)
142 |                 .argmax()
143 |             )
144 |             # Because edges are duplicated but logits aren't, divide by two
145 |             row = row.div(2, rounding_mode="floor")  # type: ignore
146 |             if action.attr == f"{int(action.source)}_attach":
147 |                 col = action.value
148 |             else:
149 |                 col = action.value + self.num_stem_acts
150 |         else:
151 |             raise ValueError(f"Action type '{action.act_type}' is unsupported.")
152 |         type_idx = self.action_type_order.index(action.act_type)
153 |         return type_idx, int(row), int(col)
154 | 
155 |     def graph_to_data(self, g: Graph) -> Data:
156 |         """Convert a networkx Graph to a torch geometric Data instance
157 |         Parameters
158 |         ----------
159 |         g: Graph
160 |             A Graph object representing a fragment junction tree
161 |         Returns
162 |         -------
163 |         data: Data
164 |             The corresponding torch_geometric object.
165 |         """
166 |         x = torch.zeros((max(1, len(g.nodes)), self.num_node_dim))
167 |         x[0, -1] = len(g.nodes) == 0
168 |         for i, n in enumerate(g.nodes):
169 |             x[i, g.nodes[n]["v"]] = 1
170 |         edge_attr = torch.zeros((len(g.edges) * 2, self.num_edge_dim))
171 |         set_edge_attr_mask = torch.zeros((len(g.edges), self.num_edge_attr_logits))
172 |         for i, e in enumerate(g.edges):
173 |             ad = g.edges[e]
174 |             for n, offset in zip(e, [0, self.num_stem_acts]):
175 |                 idx = ad.get(f"{int(n)}_attach", 0) + offset
176 |                 edge_attr[i * 2, idx] = 1
177 |                 edge_attr[i * 2 + 1, idx] = 1
178 |                 if f"{int(n)}_attach" not in ad:
179 |                     set_edge_attr_mask[
180 |                         i, offset : offset + len(self.frags_stems[g.nodes[n]["v"]])
181 |                     ] = 1
182 |         edge_index = (
183 |             torch.tensor(
184 |                 [e for i, j in g.edges for e in [(i, j), (j, i)]], dtype=torch.long
185 |             )
186 |             .reshape((-1, 2))
187 |             .T
188 |         )
189 |         if x.shape[0] == self.max_frags:
190 |             add_node_mask = torch.zeros((x.shape[0], 1))
191 |         else:
192 |             add_node_mask = torch.ones((x.shape[0], 1))
193 | 
194 |         return Data(
195 |             x,
196 |             edge_index,
197 |             edge_attr,
198 |             add_node_mask=add_node_mask,
199 |             set_edge_attr_mask=set_edge_attr_mask,
200 |         )
201 | 
202 |     def collate_fn(self, graphs: List[Data]) -> Batch:
203 |         """Batch Data instances
204 |         Parameters
205 |         ----------
206 |         graphs: List[gd.Data]
207 |             A list of gd.Data objects (e.g. given by graph_to_Data).
208 |         Returns
209 |         -------
210 |         batch: gd.Batch
211 |             A torch_geometric Batch object
212 |         """
213 |         return Batch.from_data_list(graphs, follow_batch=["edge_index"])
214 | 
215 |     def mol_to_graph(self, mol) -> Graph:
216 |         """Convert an RDMol to a Graph"""
217 |         raise NotImplementedError()
218 | 
219 |     def graph_to_mol(self, g: Graph) -> Mol:
220 |         """Convert a Graph to an RDKit molecule
221 |         Parameters
222 |         ----------
223 |         g: Graph
224 |             A Graph instance representing a fragment junction tree.
225 |         Returns
226 |         -------
227 |         m: Mol
228 |             The corresponding RDKit molecule
229 |         """
230 |         offsets = np.cumsum([0] + [self.frags_num_atoms[g.nodes[i]["v"]] for i in g])
231 |         mol = None
232 |         for i in g.nodes:
233 |             if mol is None:
234 |                 mol = self.frags_mol[g.nodes[i]["v"]]
235 |             else:
236 |                 mol = Chem.CombineMols(mol, self.frags_mol[g.nodes[i]["v"]])
237 | 
238 |         mol = Chem.EditableMol(mol)
239 |         bond_atoms = []
240 |         for a, b in g.edges:
241 |             frag_a = g.nodes[a]["v"]
242 |             frag_b = g.nodes[b]["v"]
243 |             u, v = (
244 |                 int(
245 |                     self.frags_stems[frag_a][g.edges[(a, b)].get(f"{a}_attach", 0)]
246 |                     + offsets[a]
247 |                 ),
248 |                 int(
249 |                     self.frags_stems[frag_b][g.edges[(a, b)].get(f"{b}_attach", 0)]
250 |                     + offsets[b]
251 |                 ),
252 |             )
253 |             bond_atoms += [u, v]
254 |             mol.AddBond(u, v, Chem.BondType.SINGLE)
255 |         mol = mol.GetMol(None)
256 | 
257 |         def _pop_hydrogen_atom(atom: Atom) -> None:
258 |             atom = mol.GetAtomWithIdx(atom)
259 |             nh = atom.GetNumExplicitHs()
260 |             if nh > 0:
261 |                 atom.SetNumExplicitHs(nh - 1)
262 | 
263 |         list(map(_pop_hydrogen_atom, bond_atoms))
264 |         return mol
265 | 
266 |     def is_valid_graph(self, g: Graph) -> bool:
267 |         """Verifies whether the given Graph is valid according to RDKit"""
268 |         mol = self.graph_to_mol(g)
269 |         assert Chem.MolFromSmiles(Chem.MolToSmiles(mol)) is not None
270 |         if mol is None:
271 |             return False
272 |         return True
273 | 


--------------------------------------------------------------------------------
/rxitect/data/iterators.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | import os
  4 | from typing import TYPE_CHECKING, Callable, List, Optional
  5 | 
  6 | import numpy as np
  7 | import torch
  8 | import torch.nn as nn
  9 | from rdkit import Chem, RDLogger
 10 | from torch.utils.data import Dataset, IterableDataset
 11 | 
 12 | if TYPE_CHECKING:
 13 |     from torch.types import Device
 14 | 
 15 |     from rxitect.algorithms.gfn_algorithm import GFNAlgorithm
 16 |     from rxitect.envs.contexts import GraphEnvContext
 17 |     from rxitect.tasks.gfn_task import GFNTask
 18 | 
 19 | 
 20 | class SamplingIterator(IterableDataset):
 21 |     """This class allows us to parallelize and train faster.
 22 |     By separating sampling data/the model and building torch geometric
 23 |     graphs from training the model, we can do the former in different
 24 |     processes, which is much faster since much of graph construction
 25 |     is CPU-bound.
 26 |     """
 27 | 
 28 |     def __init__(
 29 |         self,
 30 |         dataset: Dataset,
 31 |         model: nn.Module,
 32 |         batch_size: int,
 33 |         ctx: GraphEnvContext,
 34 |         algo: GFNAlgorithm,
 35 |         task: GFNTask,
 36 |         device: Device,
 37 |         offline_ratio: float = 0.5,
 38 |         stream: bool = True,
 39 |         log_dir: Optional[str] = None,
 40 |     ):
 41 |         """Parameters
 42 |         ----------
 43 |         dataset: Dataset
 44 |             A dataset instance
 45 |         model: nn.Module
 46 |             The model we sample from (must be on CUDA already or share_memory() must be called so that
 47 |             parameters are synchronized between each worker)
 48 |         batch_size: int
 49 |             The number of trajectories, each trajectory will be composed of many graphs, so this is
 50 |             _not_ the batch size in terms of the number of graphs (that will depend on the task)
 51 |         algo:
 52 |             The training algorithm, e.g. a TrajectoryBalance instance
 53 |         task: ConditionalTask
 54 |         offline_ratio: float
 55 |             The offline_ratio of offline trajectories in the batch.
 56 |         stream: bool
 57 |             If True, data is sampled iid for every batch. Otherwise, this is a normal in-order
 58 |             dataset iterator.
 59 |         log_dir: str
 60 |             If not None, logs each SamplingIterator worker's generated molecules to that file.
 61 |         """
 62 |         self.data = dataset
 63 |         self.model = model
 64 |         self.batch_size = batch_size
 65 |         self.offline_batch_size = int(np.ceil(batch_size * offline_ratio))
 66 |         self.online_batch_size = int(np.floor(batch_size * (1 - offline_ratio)))
 67 |         self.offline_ratio = offline_ratio
 68 |         self.ctx = ctx
 69 |         self.algo = algo
 70 |         self.task = task
 71 |         self.device = device
 72 |         self.stream = stream
 73 |         self.log_dir = log_dir if self.offline_ratio < 1 and self.stream else None
 74 |         # This SamplingIterator instance will be copied by torch DataLoaders for each worker, so we
 75 |         # don't want to initialize per-worker things just yet, such as the log the worker writes
 76 |         # to. This must be done in __iter__, which is called by the DataLoader once this instance
 77 |         # has been copied into a new python process.
 78 |         # self.log = SQLiteLog()  # Make generic logger that writes to txt file
 79 |         self.log_hooks: List[Callable] = []
 80 | 
 81 |     def add_log_hook(self, hook: Callable):
 82 |         self.log_hooks.append(hook)
 83 | 
 84 |     def _idx_iterator(self):
 85 |         RDLogger.DisableLog("rdApp.*")
 86 |         if self.stream:
 87 |             # If we're streaming data, just sample `offline_batch_size` indices
 88 |             while True:
 89 |                 yield self.rng.integers(0, len(self.data), self.offline_batch_size)
 90 |         else:
 91 |             # Otherwise, figure out which indices correspond to this worker
 92 |             worker_info = torch.utils.data.get_worker_info()
 93 |             n = len(self.data)
 94 |             if n == 0:
 95 |                 yield np.arange(0, 0)
 96 |                 return
 97 |             if worker_info is None:
 98 |                 start, end, wid = 0, n, -1
 99 |             else:
100 |                 nw = worker_info.num_workers
101 |                 wid = worker_info.id
102 |                 start, end = int(np.floor(n / nw * wid)), int(
103 |                     np.ceil(n / nw * (wid + 1))
104 |                 )
105 |             bs = self.offline_batch_size
106 |             if end - start < bs:
107 |                 yield np.arange(start, end)
108 |                 return
109 |             for i in range(start, end - bs, bs):
110 |                 yield np.arange(i, i + bs)
111 |             if i + bs < end:
112 |                 yield np.arange(i + bs, end)
113 | 
114 |     def __len__(self):
115 |         if self.stream:
116 |             return int(1e6)
117 |         return len(self.data)
118 | 
119 |     def __iter__(self):
120 |         worker_info = torch.utils.data.get_worker_info()
121 |         self._wid = worker_info.id if worker_info is not None else 0
122 |         # Now that we know we are in a worker instance, we can initialize per-worker things
123 |         self.rng = self.algo.rng = self.task.rng = np.random.default_rng(
124 |             142857 + self._wid
125 |         )
126 |         self.ctx.device = self.device
127 |         if self.log_dir is not None:
128 |             os.makedirs(self.log_dir, exist_ok=True)
129 |             self.log_path = f"{self.log_dir}/generated_mols_{self._wid}.db"
130 |             # self.log.connect(self.log_path)
131 | 
132 |         for idcs in self._idx_iterator():
133 |             num_offline = idcs.shape[0]  # This is in [0, self.offline_batch_size]
134 |             # Sample conditional info such as temperature, trade-off weights, etc.
135 | 
136 |             cond_info = self.task.sample_conditional_information(
137 |                 num_offline + self.online_batch_size
138 |             )
139 |             is_valid = torch.ones(cond_info["beta"].shape[0]).bool()
140 | 
141 |             # Sample some dataset data
142 |             mols, flat_rewards = (
143 |                 map(list, zip(*[self.data[i] for i in idcs])) if len(idcs) else ([], [])
144 |             )
145 |             flat_rewards = list(
146 |                 self.task.flat_reward_transform(torch.tensor(flat_rewards))
147 |             )
148 |             graphs = [self.ctx.mol_to_graph(m) for m in mols]
149 |             trajs = self.algo.create_training_data_from_graphs(graphs)
150 |             # Sample some on-policy data
151 |             if self.online_batch_size > 0:
152 |                 with torch.no_grad():
153 |                     trajs += self.algo.create_training_data_from_own_samples(
154 |                         self.model,
155 |                         self.online_batch_size,
156 |                         cond_info["encoding"][num_offline:],
157 |                     )
158 |                 if self.algo.bootstrap_own_reward:
159 |                     # The model can be trained to predict its own reward,
160 |                     # i.e. predict the output of cond_info_to_reward
161 |                     pred_reward = [
162 |                         i["reward_pred"].cpu().item() for i in trajs[num_offline:]
163 |                     ]
164 |                     flat_rewards += pred_reward
165 |                 else:
166 |                     # Otherwise, query the task for flat rewards
167 |                     valid_idcs = torch.tensor(
168 |                         [
169 |                             i + num_offline
170 |                             for i in range(self.online_batch_size)
171 |                             if trajs[i + num_offline]["is_valid"]
172 |                         ]
173 |                     ).long()
174 |                     # fetch the valid trajectories endpoints
175 |                     mols = [
176 |                         self.ctx.graph_to_mol(trajs[i]["traj"][-1][0])
177 |                         for i in valid_idcs
178 |                     ]
179 |                     # ask the task to compute their reward
180 |                     preds, m_is_valid = self.task.compute_flat_rewards(mols)
181 |                     # The task may decide some mols are invalid, we have to again filter those
182 |                     valid_idcs = valid_idcs[m_is_valid]
183 |                     pred_reward = torch.zeros((self.online_batch_size, preds.shape[1]))
184 |                     pred_reward[valid_idcs - num_offline] = preds
185 |                     # if preds.shape[0] > 0:
186 |                     #     for i in range(self.number_of_objectives):
187 |                     #         pred_reward[valid_idcs - num_offline, i] = preds[range(preds.shape[0]), i]
188 |                     is_valid[num_offline:] = False
189 |                     is_valid[valid_idcs] = True
190 |                     flat_rewards += list(pred_reward)
191 |                     # Override the is_valid key in case the task made some mols invalid
192 |                     for i in range(self.online_batch_size):
193 |                         trajs[num_offline + i]["is_valid"] = is_valid[
194 |                             num_offline + i
195 |                         ].item()
196 |             flat_rewards = torch.stack(flat_rewards)
197 |             # Compute scalar rewards from conditional information & flat rewards
198 |             rewards = self.task.cond_info_to_reward(cond_info, flat_rewards)
199 |             rewards[torch.logical_not(is_valid)] = np.exp(
200 |                 self.algo.illegal_action_logreward
201 |             )
202 |             # Construct batch
203 |             batch = self.algo.construct_batch(trajs, cond_info["encoding"], rewards)
204 |             batch.num_offline = num_offline
205 |             batch.num_online = self.online_batch_size
206 |             batch.flat_rewards = flat_rewards
207 |             batch.mols = mols
208 | 
209 |             if self.online_batch_size > 0 and self.log_dir is not None:
210 |                 self.log_generated(
211 |                     trajs[num_offline:],
212 |                     rewards[num_offline:],
213 |                     flat_rewards[num_offline:],
214 |                     {k: v[num_offline:] for k, v in cond_info.items()},
215 |                 )
216 |             if self.online_batch_size > 0:
217 |                 extra_info = {}
218 |                 for hook in self.log_hooks:
219 |                     extra_info.update(hook(trajs, rewards, flat_rewards, cond_info))
220 |                 batch.extra_info = extra_info
221 |             yield batch
222 | 
223 |     def log_generated(self, trajs, rewards, flat_rewards, cond_info):
224 |         mols = [
225 |             Chem.MolToSmiles(self.ctx.graph_to_mol(trajs[i]["traj"][-1][0]))
226 |             if trajs[i]["is_valid"]
227 |             else ""
228 |             for i in range(len(trajs))
229 |         ]
230 | 
231 |         flat_rewards = (
232 |             flat_rewards.reshape((len(flat_rewards), -1)).data.numpy().tolist()
233 |         )
234 |         rewards = rewards.data.numpy().tolist()
235 |         preferences = (
236 |             cond_info.get("preferences", torch.zeros((len(mols), 0)))
237 |             .data.numpy()
238 |             .tolist()
239 |         )
240 |         logged_keys = [
241 |             k for k in sorted(cond_info.keys()) if k not in ["encoding", "preferences"]
242 |         ]
243 | 
244 |         data = [
245 |             [mols[i], rewards[i]]
246 |             + flat_rewards[i]
247 |             + preferences[i]
248 |             + [cond_info[k][i].item() for k in logged_keys]
249 |             for i in range(len(trajs))
250 |         ]
251 |         data_labels = (
252 |             ["smi", "r"]
253 |             + [f"fr_{i}" for i in range(len(flat_rewards[0]))]
254 |             + [f"pref_{i}" for i in range(len(preferences[0]))]
255 |             + [f"ci_{k}" for k in logged_keys]
256 |         )
257 |         # self.log.insert_many(data, data_labels)
258 | 


--------------------------------------------------------------------------------
/rxitect/utils/metrics.py:
--------------------------------------------------------------------------------
  1 | from copy import deepcopy
  2 | import math
  3 | from typing import TYPE_CHECKING
  4 | 
  5 | from botorch.utils.multi_objective import infer_reference_point
  6 | from botorch.utils.multi_objective import pareto
  7 | from botorch.utils.multi_objective.hypervolume import Hypervolume
  8 | from cvxopt import matrix
  9 | from cvxopt import solvers
 10 | import numpy as np
 11 | from rdkit import Chem
 12 | from rdkit import DataStructs
 13 | import torch
 14 | 
 15 | if TYPE_CHECKING:
 16 |     from numpy.typing import NDArray
 17 | 
 18 | 
 19 | def pareto_frontier(obj_vals: NDArray, maximize: bool = True):
 20 |     """
 21 |     Compute the Pareto frontier of a set of candidate solutions.
 22 |     Parameters
 23 |     ----------
 24 |         obj_vals: NDArray
 25 |             NumPy array of objective values
 26 |         maximize: bool
 27 |             x
 28 |     """
 29 |     # pareto utility assumes maximization
 30 |     if maximize:
 31 |         pareto_mask = pareto.is_non_dominated(torch.from_numpy(obj_vals))
 32 |     else:
 33 |         pareto_mask = pareto.is_non_dominated(-torch.from_numpy(obj_vals))
 34 |     return obj_vals[pareto_mask]
 35 | 
 36 | 
 37 | def get_hypervolume(flat_rewards: torch.Tensor, zero_ref: bool = True) -> float:
 38 |     """Compute the hypervolume of a set of trajectories.
 39 |         Parameters
 40 |         ----------
 41 |         flat_rewards: torch.Tensor
 42 |             A tensor of shape (num_trajs, num_of_objectives) containing the rewards of each trajectory.
 43 |         zero_ref: bool
 44 |             x
 45 |         """
 46 |     # Compute the reference point
 47 |     if zero_ref:
 48 |         reference_point = torch.zeros_like(flat_rewards[0])
 49 |     else:
 50 |         reference_point = infer_reference_point(flat_rewards)
 51 |     # Compute the hypervolume
 52 |     hv_indicator = Hypervolume(reference_point)  # Difference
 53 |     return hv_indicator.compute(flat_rewards)
 54 | 
 55 | 
 56 | def uniform_reference_points(nobj, p=4, scaling=None):
 57 |     """Generate reference points uniformly on the hyperplane intersecting
 58 |     each axis at 1. The scaling factor is used to combine multiple layers of
 59 |     reference points.
 60 |     """
 61 |     def gen_refs_recursive(ref, nobj, left, total, depth):
 62 |         points = []
 63 |         if depth == nobj - 1:
 64 |             ref[depth] = left / total
 65 |             points.append(ref)
 66 |         else:
 67 |             for i in range(left + 1):
 68 |                 ref[depth] = i / total
 69 |                 points.extend(gen_refs_recursive(ref.copy(), nobj, left - i, total, depth + 1))
 70 |         return points
 71 | 
 72 |     ref_points = np.array(gen_refs_recursive(np.zeros(nobj), nobj, p, p, 0))
 73 |     if scaling is not None:
 74 |         ref_points *= scaling
 75 |         ref_points += (1 - scaling) / nobj
 76 | 
 77 |     return ref_points
 78 | 
 79 | 
 80 | def r2_indicator_set(reference_points, solutions, utopian_point):
 81 |     """Computer R2 indicator value of a set of solutions (*solutions*) given a set of
 82 |     reference points (*reference_points) and a utopian_point (*utopian_point).
 83 |         :param reference_points: An array of reference points from a uniform distribution.
 84 |         :param solutions: the multi-objective solutions (fitness values).
 85 |         :param utopian_point: utopian point that represents the best possible solution
 86 |         :returns: r2 value (float).
 87 |         """
 88 | 
 89 |     min_list = []
 90 |     for v in reference_points:
 91 |         max_list = []
 92 |         for a in solutions:
 93 |             max_list.append(np.max(v * np.abs(utopian_point - a)))
 94 | 
 95 |         min_list.append(np.min(max_list))
 96 | 
 97 |     v_norm = np.linalg.norm(reference_points)
 98 |     r2 = np.sum(min_list) / v_norm
 99 | 
100 |     return r2
101 | 
102 | 
# Global cvxopt solver settings, applied when this module is imported:
# tolerances of 1e-15, at most 1000 iterations, and no progress output.
solvers.options.update(
    {
        'abstol': 1e-15,
        'reltol': 1e-15,
        'feastol': 1e-15,
        'maxiters': 1000,
        'show_progress': False,
    }
)
108 | 
109 | 
def sharpe_ratio(p, Q, x, rf):
    """ Compute the Sharpe ratio.
    Returns the Sharpe ratio given the expected return vector, p,
    the covariance matrix, Q, the investment column vector, x, and
    the return of the riskless asset, rf.
    Parameters
    ----------
    p : ndarray
        Expected return vector (of size n).
    Q : ndarray
        Covariance (n,n)-matrix.
    x : ndarray
        Investment vector of size (n,1). The sum of which should be 1.
    rf : float
        Return of a riskless asset.
    Returns
    -------
    sr : ndarray
        The Sharpe ratio, ``(x.T p - rf) / sqrt(x.T Q x)``. With `x` of shape (n,1)
        and `p` of shape (n,) this is an array of shape (1,).
    """
    excess_return = x.T.dot(p) - rf
    # x.T Q x is a (1,1) array for a column vector x. Extract the scalar explicitly:
    # handing an array with ndim > 0 to math.sqrt relies on an implicit conversion
    # that NumPy has deprecated.
    variance = np.asarray(x.T.dot(Q).dot(x)).item()
    return excess_return / math.sqrt(variance)
131 | 
132 | 
def _sharpe_ratio_qp_max(p, Q, rf):
    """ Sharpe ratio maximization problem - QP formulation.

    Builds the quadratic program from the expected returns `p`, the covariance
    matrix `Q` and the riskless return `rf`, solves it with ``solvers.coneqp`` and
    returns the solver's ``'x'`` entry as a NumPy array.
    """
    n = len(p)

    # inequality constraints (investment in assets is higher or equal to 0);
    # both sides are negated when passed to the solver below
    nonneg_lhs = np.diag(np.ones(n))
    nonneg_rhs = np.zeros((n, 1), dtype=np.double)

    # equality constraints (just one): (p - rf) . y = 1
    eq_lhs = np.zeros((1, n), dtype=np.double)
    eq_lhs[0, :] = p - rf
    eq_rhs = np.zeros((1, 1), dtype=np.double)
    eq_rhs[0, 0] = 1

    # convert numpy matrix to cvxopt matrix
    quad_term = matrix(Q, tc='d')
    lin_term = matrix(np.zeros(n), tc='d')
    eq_lhs_cvx = matrix(eq_lhs, tc='d')
    eq_rhs_cvx = matrix(eq_rhs, tc='d')
    nonneg_lhs_cvx = matrix(nonneg_lhs, tc='d')
    nonneg_rhs_cvx = matrix(nonneg_rhs, tc='d')

    sol = solvers.coneqp(
        quad_term,
        lin_term,
        -nonneg_lhs_cvx,
        -nonneg_rhs_cvx,
        None,
        eq_lhs_cvx,
        eq_rhs_cvx,
        kktsolver='ldl',
    )
    return np.array(sol['x'])
155 | 
156 | 
def sharpe_ratio_max(p, Q, rf):
    """ Compute the Sharpe ratio and investment of an optimal portfolio.
    Parameters
    ----------
    p : ndarray
        Expected return vector (of size n).
    Q : ndarray
        Covariance (n,n)-matrix.
    rf : float
        Return of a riskless asset.
    Returns
    -------
    sr : float
        The HSR value.
    x : ndarray
        Investment vector of size (n,1).
    """
    raw_solution = _sharpe_ratio_qp_max(p, Q, rf)
    # Rescale the QP solution so the investments sum to 1, then zero out
    # anything that is not above 1e-9.
    investment = raw_solution / raw_solution.sum()
    investment = np.where(investment > 1e-9, investment, 0)
    return sharpe_ratio(p, Q, investment, rf), investment
179 | 
180 | 
# Assumes that low <= A < up element-wise (every point in A strongly dominates `up`).
# Assumes A, low and up are numpy arrays (or array-likes of compatible shape).
def _expected_return(A, low, up):
    """
    Returns the expected return (computed as defined by the HSR indicator):
    for each point, the product over the last axis of ``up - A`` divided by the
    product of ``up - low``.
    """
    # Work on a double-precision copy so the division below is a float division.
    points = np.array(A, dtype=np.double)
    dominated_volume = (up - points).prod(axis=-1)
    total_volume = (up - low).prod()
    return dominated_volume / total_volume
190 | 
191 | 
def _covariance(A, low, up, p=None):
    """ Returns the covariance matrix (computed as defined by the HSR indicator).

    `p` is the expected return of each point in `A`; it is computed from `A`
    when not supplied.
    """
    if p is None:
        p = _expected_return(A, low, up)
    # Element-wise maximum of every pair of points, shape (n, n, d).
    pairwise_max = np.maximum(A[:, np.newaxis, :], A[np.newaxis, ...])
    joint_return = _expected_return(pairwise_max, low, up)
    return joint_return - p[:, np.newaxis] * p[np.newaxis, :]
200 | 
201 | 
def _argunique(pts):
    """ Find the unique points of a matrix.

    Returns a boolean mask over the rows of `pts`; a row is False when it equals
    the row that precedes it in lexicographic sort order.
    """
    order = np.lexsort(pts.T)
    ordered = pts[order]
    # True where a sorted row differs from the sorted row before it.
    differs_from_previous = (ordered[1:] != ordered[:-1]).any(axis=1)
    keep = np.ones(len(pts), dtype=bool)
    keep[order[1:]] = differs_from_previous
    return keep
209 | 
210 | 
def HSRindicator(A, low, up, managedup=False):
    """
    Compute the HSR indicator of the point set A given reference points low and up.
    Returns the HSR value of A given low and up, and returns the optimal investment.
    By default, points in A are assumed to be unique.
    Tip: Either ensure that A does not contain duplicated points
        (for example, remove them previously and then split the
        investment between the copies as you wish), or set the flag
        'managedup' to True.
    Parameters
    ----------
    A : ndarray
        Input matrix (n,d) with n points and d dimensions.
    low : array_like
        Lower reference point.
    up : array_like
        Upper reference point.
    managedup : bool, optional
        If A contains duplicated points and 'managedup' is set to True, only the
        first copy may be assigned positive investment, all other copies are
        assigned zero investment. Otherwise, no special treatment is given to
        duplicate points.
    Returns
    -------
    hsri : float
        The HSR value.
    x : ndarray
        The optimal investment as a column vector array (n,1).
    Raises
    ------
    ValueError
        If any coordinate of `up` is not strictly greater than that of `low`.
    """
    n = len(A)
    x = np.zeros((n, 1), dtype=float)

    # the reference points must satisfy low < up in every coordinate
    if (up <= low).any():
        raise ValueError("The lower reference point does not strongly dominate the upper reference point!")

    # A is the empty set
    if n == 0:
        return 0, x

    # keep only the points that are strictly below `up` in every coordinate
    valid = (A < up).all(axis=1)
    if not valid.any():
        return 0, x
    validix = np.where(valid)[0]

    # clip the remaining points from below at `low`
    clipped = np.maximum(A[valid], low)

    # manage duplicate points
    if managedup:
        keep = _argunique(clipped)
    else:
        keep = np.ones(len(clipped)).astype(bool)
    p = _expected_return(clipped[keep], low, up)
    Q = _covariance(clipped[keep], low, up, p)

    # write the optimal investment back at the original row positions
    hsri, x[validix[keep]] = sharpe_ratio_max(p, Q, 0)
    return hsri, x
268 | 
269 | 
class HSR_Calculator:
    """Compute the HSR indicator for solution sets against fixed reference points."""

    def __init__(self, lower_bound, upper_bound, max_obj_bool=None):
        '''
        Class to calculate HSR Indicator with assumption that assumes a maximization on all objectives.
        Parameters
        ----------
        lower_bound : array_like
            Lower reference point.
        upper_bound : array_like
            Upper reference point.
        max_obj_bool : iterable of int, optional
            Column indices of the objectives that `make_max_problem` replaces by
            their reciprocal. None leaves the solutions untouched.
        '''
        self.lower_bound = lower_bound
        self.upper_bound = upper_bound
        self.max_obj_bool = max_obj_bool

    def reset_lower_bound(self, lower_bound):
        """Replace the lower reference point."""
        self.lower_bound = lower_bound

    def reset_upper_bound(self, upper_bound):
        """Replace the upper reference point."""
        self.upper_bound = upper_bound

    def make_max_problem(self, matrix):
        """Return the solutions with the columns listed in `max_obj_bool` inverted.

        When `max_obj_bool` is None the input object itself is returned. Otherwise a
        float copy is returned and the input is left unmodified.
        """
        if self.max_obj_bool is None:
            return matrix

        # A float copy keeps the caller's data intact and allows negative powers,
        # which NumPy rejects for integer arrays.
        max_matrix = np.array(matrix, dtype=float)

        for dim in self.max_obj_bool:
            # Invert only the selected column; assigning the whole inverted matrix
            # to a single column fails with a shape mismatch.
            max_matrix[:, dim] = max_matrix[:, dim] ** -1

        return max_matrix

    def calculate_hsr(self, solutions):
        """Return ``(hsr_indicator, hsr_invest)`` for `solutions` using the stored bounds."""
        max_solutions = self.make_max_problem(solutions)

        hsr_indicator, hsr_invest = HSRindicator(A=max_solutions, low=self.lower_bound, up=self.upper_bound)

        return hsr_indicator, hsr_invest
316 | 
317 | 
class Normalizer:
    """Clip values to ``loc +/- 4 * scale`` and standardize them with `loc` and `scale`."""

    def __init__(self, loc=0., scale=1.):
        self.loc = loc
        # A scale of zero is replaced by 1 so the division in __call__ is defined.
        self.scale = np.where(scale != 0, scale, 1.)

    def __call__(self, arr):
        """Return `arr` clipped to four scales around `loc`, then shifted and scaled."""
        half_width = 4 * self.scale
        lower = self.loc - half_width
        upper = self.loc + half_width
        return (np.clip(arr, a_min=lower, a_max=upper) - self.loc) / self.scale

    def inv_transform(self, arr):
        """Map normalized values back: ``scale * arr + loc`` (clipping is not undone)."""
        return self.scale * arr + self.loc
333 | 
334 | 
335 | # Should be calculated per preference
def compute_diverse_top_k(smiles, rewards, k, thresh=0.7):
    """Mean reward of at most ``k`` mutually dissimilar, highest-reward molecules.

    Molecules are visited in decreasing reward order. The best one is always
    kept; a later one is kept only if its largest Tanimoto similarity (on RDKit
    fingerprints) to the molecules kept so far is below ``thresh``.

    Parameters
    ----------
    smiles : sequence
        Molecules, passed unchanged to ``Chem.RDKFingerprint``.
        NOTE(review): despite the name this presumably has to hold RDKit Mol
        objects rather than SMILES strings -- confirm against the callers.
    rewards : sequence
        One reward per molecule; every entry must support ``.item()``.
    k : int
        Maximum number of modes to keep. At least one mode is always kept.
    thresh : float
        Similarity below which a molecule counts as a new mode.

    Returns
    -------
    The mean reward over the kept modes.

    Raises
    ------
    IndexError
        If ``smiles`` is empty.
    """
    # Each entry is [reward, mol], best reward first
    scored = [[rewards[i].item(), mol] for i, mol in enumerate(smiles)]
    scored.sort(key=lambda m: m[0], reverse=True)

    modes = [scored[0]]
    mode_fps = [Chem.RDKFingerprint(scored[0][1])]
    for candidate in scored[1:]:
        # Check the budget *before* considering another candidate. Checking
        # only after an append let k <= 1 end up with two modes.
        if len(modes) >= k:
            break
        fp = Chem.RDKFingerprint(candidate[1])
        sim = DataStructs.BulkTanimotoSimilarity(fp, mode_fps)
        if max(sim) < thresh:
            modes.append(candidate)
            mode_fps.append(fp)
    return np.mean([m[0] for m in modes])
354 | 
355 | 
def get_topk(rewards, k):
    """Average of the ``k`` highest rewards per preference, averaged over preferences.

    Parameters
    ----------
    rewards : torch.Tensor
        Rewards obtained after taking the convex combination.
        Shape: number_of_preferences x number_of_samples.
        A 1-D tensor is reshaped to ``(N, 1)``, i.e. each entry becomes its own
        row. NOTE(review): under the documented layout a 1-D input might be
        expected to mean one preference with N samples -- confirm with callers.
    k : int
        Top-K value. If ``k`` exceeds the number of samples, all samples of a
        row are used.

    Returns
    -------
    torch.Tensor
        Scalar tensor: the mean over rows of each row's top-K mean.
    """
    if len(rewards.shape) < 2:
        rewards = torch.unsqueeze(rewards, -1)
    # torch.sort is ascending by default; sort descending so that the first k
    # columns really are the k largest rewards of each row.
    sorted_rewards = torch.sort(rewards, dim=1, descending=True).values
    topk_rewards = sorted_rewards[:, :k]
    return torch.mean(topk_rewards.mean(-1))
375 | 
376 | 
if __name__ == "__main__":

    # Two-objective demo.
    # Point set: {(1,3), (2,2), (3,1)},  l = (0,0), u = (4,4)
    demo_points = np.array([[1, 3], [2, 2], [3, 1]])  # n x d matrix (n points, d dimensions)
    lower_ref = np.zeros(2)  # must weakly dominate every point in the set
    upper_ref = np.array([4, 4])  # must be strongly dominated by every point in the set

    # Alternative three-objective example (swap in for the arrays above):
    # demo_points = np.array([[3.41e-01, 9.72e-01, 2.47e-01],
    #                         [9.30e-01, 1.53e-01, 4.72e-01],
    #                         [4.56e-01, 1.71e-01, 8.68e-01],
    #                         [8.70e-02, 5.94e-01, 9.50e-01],
    #                         [5.31e-01, 6.35e-01, 1.95e-01],
    #                         [3.12e-01, 3.37e-01, 7.01e-01],
    #                         [3.05e-02, 9.10e-01, 7.71e-01],
    #                         [8.89e-01, 8.29e-01, 2.07e-02],
    #                         [6.92e-01, 3.62e-01, 2.93e-01],
    #                         [2.33e-01, 4.55e-01, 6.60e-01]])
    #
    # lower_ref = np.zeros(3)  # must weakly dominate every point in the set
    # upper_ref = np.array([1, 1, 1])

    calculator = HSR_Calculator(lower_bound=lower_ref, upper_bound=upper_ref)
    indicator, investment = calculator.calculate_hsr(demo_points)  # compute HSR indicator

    print("Optimal investment:")
    print("\n".join(str(value) for value in investment[:, 0]))
    print("HSR indicator value: %f" % indicator)
405 | 


--------------------------------------------------------------------------------
/rxitect/algorithms/trajectory_balance.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | import copy
  4 | from itertools import count
  5 | from typing import TYPE_CHECKING, Any, Dict, List
  6 | 
  7 | import numpy as np
  8 | import torch
  9 | from torch_geometric.data import Batch
 10 | from torch_scatter import scatter
 11 | 
 12 | from rxitect.algorithms.gfn_algorithm import GFNAlgorithm, Trajectory
 13 | from rxitect.envs.contexts import ActionType
 14 | 
 15 | if TYPE_CHECKING:
 16 |     from rxitect.algorithms.gfn_algorithm import SamplingModel
 17 |     from rxitect.envs import FragmentEnv, FragmentEnvContext
 18 |     from rxitect.envs.contexts import ActionCategorical, ActionIndex
 19 | 
 20 | 
 21 | class TrajectoryBalance(GFNAlgorithm):
 22 |     """ """
 23 | 
 24 |     def __init__(
 25 |         self,
 26 |         env: FragmentEnv,
 27 |         ctx: FragmentEnvContext,
 28 |         rng: np.random.RandomState,
 29 |         hps: Dict[str, Any],
 30 |         max_len=None,
 31 |         max_nodes=None,
 32 |     ):
 33 |         """TB implementation, see
 34 |         "Trajectory Balance: Improved Credit Assignment in GFlowNets Nikolay Malkin, Moksh Jain,
 35 |         Emmanuel Bengio, Chen Sun, Yoshua Bengio"
 36 |         https://arxiv.org/abs/2201.13259
 37 |         Hyperparameters used:
 38 |         random_action_prob: float, probability of taking a uniform random action when sampling
 39 |         illegal_action_logreward: float, log(R) given to the model for non-sane end states or illegal actions
 40 |         bootstrap_own_reward: bool, if True, uses the .reward batch data to predict rewards for sampled data
 41 |         tb_epsilon: float, if not None, adds this epsilon in the numerator and denominator of the log-ratio
 42 |         reward_loss_multiplier: float, multiplying constant for the bootstrap loss.
 43 |         Parameters
 44 |         ----------
 45 |         env: FragmentEnv
 46 |             A graph environment.
 47 |         ctx: FragmentEnvContext
 48 |             A context.
 49 |         rng: np.random.RandomState
 50 |             rng used to take random actions
 51 |         hps: Dict[str, Any]
 52 |             Hyperparameter dictionary, see above for used keys.
 53 |         max_len: int
 54 |             If not None, ends trajectories of more than max_len steps.
 55 |         max_nodes: int
 56 |             If not None, ends trajectories of graphs with more than max_nodes steps (illegal action).
 57 |         """
 58 |         self.ctx = ctx
 59 |         self.env = env
 60 |         self.rng = rng
 61 |         self.max_len = max_len
 62 |         self.max_nodes = max_nodes
 63 |         self.random_action_prob = hps["random_action_prob"]
 64 |         self.illegal_action_logreward = hps["illegal_action_logreward"]
 65 |         self.bootstrap_own_reward = hps["bootstrap_own_reward"]
 66 |         self.sanitize_samples = True
 67 |         self.epsilon = hps["tb_epsilon"]
 68 |         self.reward_loss_multiplier = hps["reward_loss_multiplier"]
 69 |         # Experimental flags
 70 |         self.reward_loss_is_mae = True
 71 |         self.tb_loss_is_mae = False
 72 |         self.tb_loss_is_huber = False
 73 |         self.mask_invalid_rewards = False
 74 |         self.length_normalize_losses = False
 75 |         self.reward_normalize_losses = False
 76 |         self.sample_temp = 1
 77 | 
 78 |     def _corrupt_actions(self, actions: List[ActionIndex], cat: ActionCategorical):
 79 |         """Sample from the uniform policy with probability `self.random_action_prob`"""
 80 |         # Should this be a method of GraphActionCategorical?
 81 |         if self.random_action_prob <= 0:
 82 |             return
 83 |         (corrupted,) = (
 84 |             self.rng.uniform(size=len(actions)) < self.random_action_prob
 85 |         ).nonzero()
 86 |         for i in corrupted:
 87 |             n_in_batch = [int((b == i).sum()) for b in cat.batch]
 88 |             n_each = np.array(
 89 |                 [
 90 |                     float(logit.shape[1]) * nb
 91 |                     for logit, nb in zip(cat.logits, n_in_batch)
 92 |                 ]
 93 |             )
 94 |             which = self.rng.choice(len(n_each), p=n_each / n_each.sum())
 95 |             row = self.rng.choice(n_in_batch[which])
 96 |             col = self.rng.choice(cat.logits[which].shape[1])
 97 |             actions[i] = (which, row, col)
 98 | 
 99 |     def create_training_data_from_own_samples(
100 |         self, model: SamplingModel, n: int, cond_info: torch.Tensor
101 |     ):
102 |         ctx = self.ctx
103 |         env = self.env
104 |         dev = self.ctx.device
105 |         cond_info = cond_info.to(dev)
106 |         log_z_pred = model.log_z(cond_info)
107 |         # This will be returned as training data
108 |         data = [{"traj": [], "reward_pred": None, "is_valid": True} for _ in range(n)]
109 |         # Let's also keep track of trajectory statistics according to the model
110 |         zero = torch.tensor([0], device=dev).float()
111 |         fwd_logprob: List[List[torch.Tensor]] = [[] for _ in range(n)]
112 |         bck_logprob: List[List[torch.Tensor]] = [
113 |             [zero] for _ in range(n)
114 |         ]  # zero in case there is a single invalid action
115 | 
116 |         graphs = [env.new() for _ in range(n)]
117 |         done = [False] * n
118 | 
119 |         def not_done(lst):
120 |             return [e for i, e in enumerate(lst) if not done[i]]
121 | 
122 |         # TODO report these stats:
123 |         mol_too_big = 0
124 |         mol_not_sane = 0
125 |         invalid_act = 0
126 |         logprob_of_illegal: List[torch.Tensor] = []
127 | 
128 |         illegal_action_logreward = torch.tensor(
129 |             [self.illegal_action_logreward], device=dev
130 |         )
131 |         if self.epsilon is not None:
132 |             epsilon = torch.tensor([self.epsilon], device=dev).float()
133 |         for t in range(self.max_len) if self.max_len is not None else count(0):
134 |             # Construct graphs for the trajectories that aren't yet done
135 |             torch_graphs = [ctx.graph_to_data(i) for i in not_done(graphs)]
136 |             not_done_mask = torch.tensor(done, device=dev).logical_not()
137 |             # Forward pass to get ActionCategorical
138 |             fwd_cat, log_reward_preds = model(
139 |                 ctx.collate_fn(torch_graphs).to(dev), cond_info[not_done_mask]
140 |             )
141 |             if self.sample_temp != 1:
142 |                 sample_cat = copy.copy(fwd_cat)
143 |                 sample_cat.logits = [i / self.sample_temp for i in fwd_cat.logits]
144 |                 actions = sample_cat.sample()
145 |             else:
146 |                 actions = fwd_cat.sample()
147 |             self._corrupt_actions(actions, fwd_cat)
148 |             graph_actions = [
149 |                 ctx.idx_to_action(g, a) for g, a in zip(torch_graphs, actions)
150 |             ]
151 |             log_probs = fwd_cat.log_probability(actions)
152 |             for i, j in zip(not_done(range(n)), range(n)):
153 |                 # Step each trajectory, and accumulate statistics
154 |                 fwd_logprob[i].append(log_probs[j].unsqueeze(0))
155 |                 data[i]["traj"].append((graphs[i], graph_actions[j]))
156 |                 # Check if we're done
157 |                 if graph_actions[j].act_type is ActionType.STOP or (
158 |                     self.max_len and t == self.max_len - 1
159 |                 ):
160 |                     done[i] = True
161 |                     if self.sanitize_samples and not ctx.is_valid_graph(graphs[i]):
162 |                         # check if the graph is sane (e.g. RDKit can
163 |                         # construct a molecule from it) otherwise
164 |                         # treat the done action as illegal
165 |                         mol_not_sane += 1
166 |                         data[i]["reward_pred"] = illegal_action_logreward.exp()
167 |                         data[i]["is_valid"] = False
168 |                     elif self.bootstrap_own_reward:
169 |                         # if we're bootstrapping, extract reward prediction
170 |                         data[i]["reward_pred"] = log_reward_preds[j].detach().exp()
171 |                 else:  # If not done, try to step the environment
172 |                     gp = graphs[i]
173 |                     try:
174 |                         # env.step can raise AssertionError if the action is illegal
175 |                         gp = env.step(graphs[i], graph_actions[j])
176 |                         if self.max_nodes is not None:
177 |                             assert len(gp.nodes) <= self.max_nodes
178 |                     except AssertionError:
179 |                         if len(gp.nodes) > self.max_nodes:
180 |                             mol_too_big += 1
181 |                         else:
182 |                             invalid_act += 1
183 |                         done[i] = True
184 |                         data[i]["reward_pred"] = illegal_action_logreward.exp()
185 |                         data[i]["is_valid"] = False
186 |                         continue
187 |                     # Add to the trajectory
188 |                     # P_B = uniform backward
189 |                     n_back = env.count_backward_transitions(gp)
190 |                     bck_logprob[i].append(torch.tensor([1 / n_back], device=dev).log())
191 |                     graphs[i] = gp
192 |             if all(done):
193 |                 break
194 | 
195 |         for i in range(n):
196 |             # If we're not bootstrapping, we could query the reward
197 |             # model here, but this is expensive/impractical.
198 |             # Instead, just report forward and backward flows
199 |             data[i]["log_z"] = log_z_pred[i].item()
200 |             data[i]["fwd_logprob"] = sum(fwd_logprob[i])
201 |             data[i]["bck_logprob"] = sum(bck_logprob[i])
202 |             if self.bootstrap_own_reward and False:  # TODO: verify
203 |                 if not data[i]["is_valid"]:
204 |                     logprob_of_illegal.append(data[i]["fwd_logprob"].item())
205 |                 # If we are bootstrapping, we can report the theoretical loss as well
206 |                 numerator = data[i]["fwd_logprob"] + log_z_pred[i]
207 |                 denominator = data[i]["bck_logprob"] + data[i]["reward_pred"].log()
208 |                 if self.epsilon is not None:
209 |                     numerator = torch.logaddexp(numerator, epsilon)
210 |                     denominator = torch.logaddexp(denominator, epsilon)
211 |                 data[i]["loss"] = (numerator - denominator).pow(2)
212 |         return data
213 | 
214 |     def construct_batch(self, trajs: List[Trajectory], cond_info, rewards):
215 |         """Construct a batch from a list of trajectories and their information
216 |         Parameters
217 |         ----------
218 |         trajs: List[Trajectory]
219 |             A list of N trajectories.
220 |         cond_info: Tensor
221 |             The conditional info that is considered for each trajectory. Shape (N, n_info)
222 |         rewards: Tensor
223 |             The transformed reward (e.g. R(x) ** beta) for each trajectory. Shape (N,)
224 |         Returns
225 |         -------
226 |         batch: gd.Batch
227 |              A (CPU) Batch object with relevant attributes added
228 |         """
229 |         torch_graphs = [
230 |             self.ctx.graph_to_data(i[0]) for tj in trajs for i in tj["traj"]
231 |         ]
232 |         actions = [
233 |             self.ctx.action_to_idx(g, a)
234 |             for g, a in zip(torch_graphs, [i[1] for tj in trajs for i in tj["traj"]])
235 |         ]
236 |         num_backward = torch.tensor(
237 |             [
238 |                 # Count the number of backward transitions from s_{t+1},
239 |                 # unless t+1 = T is the last time step
240 |                 self.env.count_backward_transitions(tj["traj"][i + 1][0])
241 |                 if i + 1 < len(tj["traj"])
242 |                 else 1
243 |                 for tj in trajs
244 |                 for i in range(len(tj["traj"]))
245 |             ]
246 |         )
247 |         batch = self.ctx.collate_fn(torch_graphs)
248 |         batch.traj_lens = torch.tensor([len(i["traj"]) for i in trajs])
249 |         batch.num_backward = num_backward
250 |         batch.actions = torch.tensor(actions)
251 |         batch.rewards = rewards
252 |         batch.cond_info = cond_info
253 |         batch.is_valid = torch.tensor([i.get("is_valid", True) for i in trajs]).float()
254 |         return batch
255 | 
    def compute_batch_losses(
        self, model: SamplingModel, batch: Batch, num_bootstrap: int = 0
    ):
        """Compute the trajectory balance loss (and optional reward loss) of a batch.

        Parameters
        ----------
        model: SamplingModel
            A GNN taking in a batch of graphs as input as per constructed by `self.construct_batch`.
            Must provide `log_z(cond_info)`, which predicts the log of Z(cond_info).
        batch: Batch
            Batch of graph inputs as per constructed by `self.construct_batch`.
            `batch.num_offline` and `batch.num_online` are read here as well;
            `construct_batch` does not set them, so the caller has to.
        num_bootstrap: int
            The number of trajectories for which the reward loss is computed.
            If 0, all trajectories are used.

        Returns
        -------
        loss: Tensor
            Mean trajectory loss plus `reward_loss_multiplier` times the reward loss.
        info: dict
            Logging values: offline/online loss, reward loss, and statistics
            about invalid trajectories and log Z.

        Raises
        ------
        NotImplementedError
            If `self.tb_loss_is_huber` is set (and `self.tb_loss_is_mae` is not).
        ValueError
            If any per-trajectory loss is not finite.
        """
        dev = batch.x.device
        # A single trajectory comprises many graphs
        num_trajs = int(batch.traj_lens.shape[0])
        rewards = batch.rewards
        cond_info = batch.cond_info

        # This index says which trajectory each graph belongs to, so
        # it will look like [0,0,0,0,1,1,1,2,...] if trajectory 0 is
        # of length 4, trajectory 1 of length 3, and so on.
        batch_idx = torch.arange(num_trajs, device=dev).repeat_interleave(
            batch.traj_lens
        )
        # The position of the last graph of each trajectory
        final_graph_idx = torch.cumsum(batch.traj_lens, 0) - 1

        # Forward pass of the model, returns a GraphActionCategorical and the optional bootstrap predictions
        fwd_cat, log_reward_preds = model(batch, cond_info[batch_idx])

        # Retrieve the reward predictions for the full graphs,
        # i.e. the final graph of each trajectory
        log_reward_preds = log_reward_preds[final_graph_idx, 0]
        # Compute trajectory balance objective
        # (Z holds the model's log Z prediction, one value per trajectory)
        Z = model.log_z(cond_info)[:, 0]
        # This is the log prob of each action in the trajectory
        log_prob = fwd_cat.log_probability(batch.actions)
        # The log prob of each backward action (uniform over the backward transitions)
        log_p_B = (1 / batch.num_backward).log()
        # Take log rewards, and clip them from below at -100
        assert rewards.ndim == 1
        Rp = torch.maximum(rewards.log(), torch.tensor(-100.0, device=dev))
        # This is the log probability of each trajectory
        traj_log_prob = scatter(
            log_prob, batch_idx, dim=0, dim_size=num_trajs, reduce="sum"
        )
        # Compute log numerator and denominator of the TB objective
        numerator = Z + traj_log_prob
        denominator = Rp + scatter(
            log_p_B, batch_idx, dim=0, dim_size=num_trajs, reduce="sum"
        )

        if self.epsilon is not None:
            # Numerical stability epsilon
            epsilon = torch.tensor([self.epsilon], device=dev).float()
            numerator = torch.logaddexp(numerator, epsilon)
            denominator = torch.logaddexp(denominator, epsilon)

        # 1 for invalid trajectories, 0 for valid ones
        invalid_mask = 1 - batch.is_valid
        if self.mask_invalid_rewards:
            # Instead of being rude to the model and giving a
            # log-reward of -100 what if we say, whatever you think the
            # log-probability of this trajectory is it should be smaller
            # (thus the `numerator - 1`). Why 1? Intuition?
            denominator = denominator * (1 - invalid_mask) + invalid_mask * (
                numerator.detach() - 1
            )

        if self.tb_loss_is_mae:
            traj_losses = abs(numerator - denominator)
        elif self.tb_loss_is_huber:
            raise NotImplementedError("Huber loss is not supported yet")  # TODO
        else:
            traj_losses = (numerator - denominator).pow(2)

        # Normalize losses by trajectory length
        if self.length_normalize_losses:
            traj_losses = traj_losses / batch.traj_lens
        if self.reward_normalize_losses:
            # multiply each loss by how important it is, using R as the importance factor
            # factor = Rp.exp() / Rp.exp().sum()
            factor = -Rp.min() + Rp + 1
            factor = factor / factor.sum()
            assert factor.shape == traj_losses.shape
            # * num_trajs because we're doing a convex combination, and a .mean() later, which would
            # undercount (by 2N) the contribution of each loss
            traj_losses = factor * traj_losses * num_trajs

        if self.bootstrap_own_reward:
            # num_bootstrap == 0 means: use every trajectory
            num_bootstrap = num_bootstrap or len(rewards)
            if self.reward_loss_is_mae:
                reward_losses = abs(
                    rewards[:num_bootstrap] - log_reward_preds[:num_bootstrap].exp()
                )
            else:
                reward_losses = (
                    rewards[:num_bootstrap] - log_reward_preds[:num_bootstrap].exp()
                ).pow(2)
            reward_loss = reward_losses.mean()
        else:
            reward_loss = 0

        loss = traj_losses.mean() + reward_loss * self.reward_loss_multiplier
        # The first batch.num_offline trajectories are reported as offline,
        # the remaining ones as online
        info = {
            "offline_loss": traj_losses[: batch.num_offline].mean()
            if batch.num_offline > 0
            else 0,
            "online_loss": traj_losses[batch.num_offline :].mean()
            if batch.num_online > 0
            else 0,
            "reward_loss": reward_loss,
            "invalid_trajectories": invalid_mask.sum() / batch.num_online
            if batch.num_online > 0
            else 0,
            # The 1e-4 keeps the averages defined when no trajectory is invalid
            "invalid_logprob": (invalid_mask * traj_log_prob).sum()
            / (invalid_mask.sum() + 1e-4),
            "invalid_losses": (invalid_mask * traj_losses).sum()
            / (invalid_mask.sum() + 1e-4),
            "log_z": Z.mean(),
        }

        if not torch.isfinite(traj_losses).all():
            raise ValueError("loss is not finite")
        return loss, info
381 | 


--------------------------------------------------------------------------------
/rxitect/envs/contexts/graph_env_context.py:
--------------------------------------------------------------------------------
  1 | """Adapted from recursionpharma's gflownet implementation @ https://github.com/recursionpharma/gflownet.
  2 | Contains code designed to give context for actions an agent can take in a setting where
  3 | the actions are a combination of choosing a molecular fragment and where to attach it, effectively
  4 | resulting in the creation of a (final) molecular graph.
  5 | """
  6 | from __future__ import annotations
  7 | 
  8 | from abc import ABC, abstractmethod
  9 | from copy import copy
 10 | from dataclasses import dataclass
 11 | from enum import Enum, auto
 12 | from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
 13 | 
 14 | import networkx as nx
 15 | import numpy as np
 16 | import torch
 17 | from torch.types import Device
 18 | from torch_geometric.data import Batch, Data
 19 | from torch_scatter import scatter, scatter_max
 20 | 
 21 | if TYPE_CHECKING:
 22 |     from rdkit.Chem.rdchem import Mol
 23 | 
 24 | 
 25 | class Graph(nx.Graph):
 26 |     """A wrapper around networkx's Graph class to facilitate debugging."""
 27 | 
 28 |     def __str__(self):
 29 |         return repr(self)
 30 | 
 31 |     def __repr__(self):
 32 |         return f"<{list(self.nodes)}, {list(self.edges)}, {list(self.nodes[i]['v'] for i in self.nodes)}>"
 33 | 
 34 | 
class ActionType(Enum):
    """All action types available when building a molecular graph from fragments.

    Members receive consecutive values from ``auto()`` in declaration order.
    """

    # Ends a trajectory
    STOP = auto()
    # Actions that add to, or set attributes on, the graph
    ADD_NODE = auto()
    ADD_EDGE = auto()
    SET_NODE_ATTR = auto()
    SET_EDGE_ATTR = auto()
    # Actions that remove from the graph
    REMOVE_NODE = auto()
    REMOVE_EDGE = auto()
    REMOVE_NODE_ATTR = auto()
    REMOVE_EDGE_ATTR = auto()
 47 | 
 48 | 
 49 | @dataclass
 50 | class Action:
 51 |     """A class representing a single graph-building action
 52 | 
 53 |     Parameters
 54 |     ----------
 55 |     act_type: ActionType
 56 |         The action type
 57 |     source: :obj:`int`, optional
 58 |         The source node this action is applied on
 59 |     target: :obj:`int`, optional
 60 |         The target node (i.e. if specified this is an edge action)
 61 |     value: :obj:`Any`, optional
 62 |         The value (e.g. new node type) applied
 63 |     attr: :obj:`str`, optional
 64 |         The set attribute of a node/edge
 65 |     """
 66 | 
 67 |     act_type: ActionType
 68 |     source: Optional[int] = None
 69 |     target: Optional[int] = None
 70 |     value: Optional[Any] = None
 71 |     attr: Optional[str] = None
 72 | 
 73 |     def __repr__(self) -> str:
 74 |         attrs = ", ".join(
 75 |             str(i)
 76 |             for i in [self.source, self.target, self.attr, self.value]
 77 |             if i is not None
 78 |         )
 79 |         return f"<{self.act_type}, {attrs}>"
 80 | 
 81 | 
 82 | class GraphEnvContext(ABC):
 83 |     device: Device
 84 | 
 85 |     @abstractmethod
 86 |     def idx_to_action(self, g: Data, idx: ActionIndex) -> Action:
 87 |         """Translate an action index (e.g. from an ActionCategorical) to an Action
 88 |         Parameters
 89 |         ----------
 90 |         g: Data
 91 |             The graph to which the action is being applied
 92 |         idx: ActionIndex
 93 |             The tensor indices for the corresponding action
 94 |         Returns
 95 |         -------
 96 |         action: Action
 97 |             A graph action that could be applied to the original graph corresponding to g.
 98 |         """
 99 |         pass
100 | 
101 |     @abstractmethod
102 |     def action_to_idx(self, g: Data, action: Action) -> ActionIndex:
103 |         """Translate a Action to an action index (e.g. from an ActionCategorical)
104 |         Parameters
105 |         ----------
106 |         g: Data
107 |             The graph to which the action is being applied
108 |         action: Action
109 |             A graph action that could be applied to the original graph corresponding to g.
110 |         Returns
111 |         -------
112 |         action_idx: ActionIndex
113 |             The tensor indices for the corresponding action
114 |         """
115 |         pass
116 | 
117 |     @abstractmethod
118 |     def graph_to_data(self, g: Graph) -> Data:
119 |         """Convert a networkx Graph to a torch geometric Data instance
120 |         Parameters
121 |         ----------
122 |         g: Graph
123 |             A graph instance.
124 |         Returns
125 |         -------
126 |         torch_g: Data
127 |             The corresponding torch_geometric graph.
128 |         """
129 |         pass
130 | 
131 |     @classmethod
132 |     def collate_fn(cls, graphs: List[Data]) -> Batch:
133 |         """Convert a list of torch geometric Data instances to a Batch
134 |         instance.  This exists so that environment contexts can set
135 |         custom batching attributes, e.g. by using `follow_batch`.
136 |         Parameters
137 |         ----------
138 |         graphs: List[Data]
139 |             Graph instances
140 |         Returns
141 |         -------
142 |         batch: Batch
143 |             The corresponding batch.
144 |         """
145 |         return Batch.from_data_list(graphs)
146 | 
147 |     @abstractmethod
148 |     def is_valid_graph(self, g: Graph) -> bool:
149 |         """Verifies whether a graph is valid according to the context. This can
150 |         catch, e.g. impossible molecules.
151 | 
152 |         Parameters
153 |         ----------
154 |         g: Graph
155 |             A graph.
156 |         Returns
157 |         -------
158 |         is_sane: bool:
159 |             True if the environment considers g to be valid.
160 |         """
161 |         pass
162 | 
163 |     @abstractmethod
164 |     def graph_to_mol(self, g: Graph) -> Mol:
165 |         """Convert a Graph to an RDKit molecule
166 |         Parameters
167 |         ----------
168 |         g: Graph
169 |             A Graph instance representing a fragment junction tree.
170 |         Returns
171 |         -------
172 |         m: Mol
173 |             The corresponding RDKit molecule
174 |         """
175 |         pass
176 | 
177 |     @abstractmethod
178 |     def mol_to_graph(self, mol: Mol) -> Graph:
179 |         """Transforms an RDKit representation of a molecule into
180 |         its corresponding generic Graph representation
181 |         Parameters
182 |         ----------
183 |         mol: Mol
184 |             An RDKit molecule
185 |         Returns
186 |         -------
187 |         g: Graph
188 |             The corresponding Graph representation of that molecule.
189 |         """
190 |         pass
191 | 
192 | 
193 | class ActionCategorical:
194 |     def __init__(
195 |         self,
196 |         graphs: Batch,
197 |         logits: List[torch.Tensor],
198 |         keys: List[str],
199 |         types: List[ActionType],
200 |         deduplicate_edge_index: bool = True,
201 |     ):
202 |         """A multi-type Categorical compatible with generating structured actions.
203 |         What is meant by type here is that there are multiple types of
204 |         mutually exclusive actions, e.g. AddNode and AddEdge are
205 |         mutually exclusive, but since their logits will be produced by
206 |         different variable-sized tensors (corresponding to different
207 |         elements of the graph, e.g. nodes or edges) it is inconvient
208 |         to stack them all into one single Categorical. This class
209 |         provides this convenient interaction between torch_geometric
210 |         Batch objects and lists of logit tensors.
211 |         Parameters
212 |         ----------
213 |         graphs: Batch
214 |             A Batch of graphs to which the logits correspond
215 |         logits: List[torch.Tensor]
216 |             A list of tensors of shape `(n, m)` representing logits
217 |             over a variable number of graph elements (e.g. nodes) for
218 |             which there are `m` possible actions. `n` should thus be
219 |             equal to the sum of the number of such elements for each
220 |             graph in the Batch object. The length of the `logits` list
221 |             should thus be equal to the number of element types (in
222 |             other words there should be one tensor per type).
223 |         keys: List[Union[str, None]]
224 |             The keys corresponding to the Graph elements for each
225 |             tensor in the logits list. Used to extract the `_batch`
226 |             and slice attributes. For example, if the first logit
227 |             tensor is a per-node action logit, and the second is a
228 |             per-edge, `keys` could be `['x', 'edge_index']`. If
229 |             keys[i] is None, the corresponding logits are assumed to
230 |             be graph-level (i.e. if there are `k` graphs in the Batch
231 |             object, this logit tensor would have shape `(k, m)`)
232 |         types: List[ActionType]
233 |            The action type each logit corresponds to.
234 |         deduplicate_edge_index: bool, default=True
235 |            If true, this means that the 'edge_index' keys have been reduced
236 |            by e_i[::2] (presumably because the graphs are undirected)
237 |         """
238 |         # TODO: handle legal action masks? (e.g. can't add a node attr to a node that already has an attr)
239 |         self.num_graphs = graphs.num_graphs
240 |         # The logits
241 |         self.logits = logits
242 |         self.types = types
243 |         self.keys = keys
244 |         self.dev = dev = graphs.x.device
245 | 
246 |         # I'm extracting batches and slices in a slightly hackish way,
247 |         # but I'm not aware of a proper API to torch_geometric that
248 |         # achieves this "neatly" without accessing private attributes
249 | 
250 |         # This is the minibatch index of each entry in the logits
251 |         # i.e., if graph i in the Batch has N[i] nodes,
252 |         #    g.batch == [0,0,0, ...,  1,1,1,1,1, ... ]
253 |         #                 N[0] times    N[1] times
254 |         # This generalizes to edges and non-edges.
255 |         # Append '_batch' to keys except for 'x', since TG has a special case (done by default for 'x')
256 |         self.batch = [
257 |             getattr(graphs, f"{k}_batch" if k != "x" else "batch") if k is not None
258 |             # None signals a global logit rather than a per-instance logit
259 |             else torch.arange(graphs.num_graphs, device=dev)
260 |             for k in keys
261 |         ]
262 |         # This is the cumulative sum (prefixed by 0) of N[i]s
263 |         self.slice = [
264 |             graphs._slice_dict[k]
265 |             if k is not None
266 |             else torch.arange(graphs.num_graphs, device=dev)
267 |             for k in keys
268 |         ]
269 |         self.log_probs = None
270 | 
271 |         if deduplicate_edge_index and "edge_index" in keys:
272 |             idx = keys.index("edge_index")
273 |             self.batch[idx] = self.batch[idx][::2]
274 |             self.slice[idx] = self.slice[idx].div(2, rounding_mode="floor")
275 | 
276 |     def detach(self):
277 |         new = copy(self)
278 |         new.logits = [i.detach() for i in new.logits]
279 |         if new.log_probs is not None:
280 |             new.log_probs = [i.detach() for i in new.log_probs]
281 |         return new
282 | 
283 |     def to(self, device):
284 |         self.dev = device
285 |         self.logits = [i.to(device) for i in self.logits]
286 |         self.batch = [i.to(device) for i in self.batch]
287 |         self.slice = [i.to(device) for i in self.slice]
288 |         if self.log_probs is not None:
289 |             self.log_probs = [i.to(device) for i in self.log_probs]
290 |         return self
291 | 
292 |     def log_softmax(self):
293 |         """Compute log-probabilities given logits"""
294 |         if self.log_probs is not None:
295 |             return self.log_probs
296 |         # Use the `subtract by max` trick to avoid precision errors:
297 |         # compute max
298 |         maxl = (
299 |             torch.cat(
300 |                 [
301 |                     scatter(i, b, dim=0, dim_size=self.num_graphs, reduce="max")
302 |                     for i, b in zip(self.logits, self.batch)
303 |                 ],
304 |                 dim=1,
305 |             )
306 |             .max(1)
307 |             .values.detach()
308 |         )
309 |         # subtract by max then take exp
310 |         # x[b, None] indexes by the batch to map back to each node/edge and adds a broadcast dim
311 |         exp_logits = [
312 |             (i - maxl[b, None]).exp() + 1e-40 for i, b in zip(self.logits, self.batch)
313 |         ]
314 |         # sum corrected exponentiated logits, to get log(Z - max) = log(sum(exp(logits)) - max)
315 |         log_z = sum(
316 |             [
317 |                 scatter(i, b, dim=0, dim_size=self.num_graphs, reduce="sum").sum(1)
318 |                 for i, b in zip(exp_logits, self.batch)
319 |             ]
320 |         ).log()
321 |         # log probabilities is log(exp(logit) / Z)
322 |         self.log_probs = [
323 |             i.log() - log_z[b, None] for i, b in zip(exp_logits, self.batch)
324 |         ]
325 |         return self.log_probs
326 | 
327 |     def sample(self) -> List[ActionIndex]:
328 |         # Use the Gumbel trick to sample categoricals
329 |         # i.e. if X ~ argmax(logits - log(-log(uniform(logits.shape))))
330 |         # then  p(X = i) = exp(logits[i]) / Z
331 |         # Here we have to do the argmax first over the variable number
332 |         # of rows of each element type for each graph in the
333 |         # minibatch, then over the different types (since they are
334 |         # mutually exclusive).
335 | 
336 |         # Uniform noise
337 |         u = [torch.rand(i.shape, device=self.dev) for i in self.logits]
338 |         # Gumbel noise
339 |         gumbel = [logit - (-noise.log()).log() for logit, noise in zip(self.logits, u)]
340 |         # scatter_max and .max create a (values, indices) pair
341 |         # These logits are 2d (num_obj_of_type, num_actions_of_type),
342 |         # first reduce-max over the batch, which preserves the
343 |         # columns, so we get (minibatch_size, num_actions_of_type).
344 |         # First we prefill `out` with very negative values in case
345 |         # there are no corresponding logits (this can happen if e.g. a
346 |         # graph has no edges), we don't want to accidentally take the
347 |         # max of that type.
348 |         mnb_max = [
349 |             torch.zeros(self.num_graphs, i.shape[1], device=self.dev) - 1e6
350 |             for i in self.logits
351 |         ]
352 |         mnb_max = [
353 |             scatter_max(i, b, dim=0, out=out)
354 |             for i, b, out in zip(gumbel, self.batch, mnb_max)
355 |         ]
356 |         # Then over cols, this gets us which col holds the max value,
357 |         # so we get (minibatch_size,)
358 |         col_max = [values.max(1) for values, idx in mnb_max]
359 |         # Now we look up which row in those argmax cols was the max:
360 |         row_pos = [
361 |             idx_mnb[torch.arange(len(idx_col)), idx_col]
362 |             for (_, idx_mnb), (_, idx_col) in zip(mnb_max, col_max)
363 |         ]
364 |         # The maxes themselves
365 |         maxs = [values for values, idx in col_max]
366 |         # Now we need to check which type of logit has the actual max
367 |         type_max_val, type_max_idx = torch.stack(maxs).max(0)
368 |         if torch.isfinite(type_max_val).logical_not_().any():
369 |             raise ValueError(
370 |                 "Non finite max value in sample", (type_max_val, self.logits)
371 |             )
372 | 
373 |         # Now we can return the indices of where the actions occurred
374 |         # in the form List[(type, row, column)]
375 |         actions = []
376 |         for i in range(type_max_idx.shape[0]):
377 |             t = type_max_idx[i]
378 |             # Subtract from the slice of that type and index, since the computed
379 |             # row position is batch-wise rather graph-wise
380 |             actions.append(
381 |                 (int(t), int(row_pos[t][i] - self.slice[t][i]), int(col_max[t][1][i]))
382 |             )
383 |         # It's now up to the Context class to create GraphBuildingAction instances
384 |         # if it wants to convert these indices to env-compatible actions
385 |         return actions
386 | 
387 |     def log_probability(self, actions: List[ActionIndex]) -> torch.Tensor:
388 |         """The log-probability of a list of action tuples
389 |         Parameters
390 |         ----------
391 |         actions: ActionIndex
392 |             A list of action indices (action index triples)
393 |         """
394 |         log_probs = self.log_softmax()
395 |         return torch.stack(
396 |             [
397 |                 log_probs[t][row + self.slice[t][i], col]
398 |                 for i, (t, row, col) in enumerate(actions)
399 |             ]
400 |         )
401 | 
402 | 
def generate_forward_trajectory(
    g: Graph, max_nodes: int = None
) -> List[Tuple[Graph, Action]]:
    """Randomly sample a sequence of actions that builds `g` from an empty graph.

    A new graph is grown from a randomly chosen start node. A work stack holds
    pending items, either `(u,)` for a node or `(u, v)` for an edge of `g`;
    at every step one stack entry is popped at a uniformly random position
    and turned into a single action (add node, add edge, or set one
    node/edge attribute).

    Parameters
    ----------
    g: Graph
        The target graph. Every node must carry a `"v"` attribute, which is
        used as the value of the ADD_NODE actions. The start node is drawn
        as an integer in `[0, len(g.nodes))`, so node labels are presumably
        expected to be `0..n-1` -- TODO confirm against callers.
    max_nodes: int
        Not used anywhere in the body of this function.

    Returns
    -------
    List[Tuple[Graph, Action]]
        Pairs of (copy of the graph before the action, action), followed by
        a final `(gn, Action(ActionType.STOP))` entry where `gn` is the
        fully built graph object itself (not a copy).
    """
    # TODO: should this be a method of GraphBuildingEnv? handle set_node_attr flags and so on?
    gn = Graph()
    # Choose an arbitrary starting point, add to the stack
    stack: List[Tuple[int, ...]] = [(np.random.randint(0, len(g.nodes)),)]
    traj = []
    # This map keeps track of node labels in gn, since we have to start from 0
    # (maps a node label of g to the label it received in gn)
    relabeling_map: Dict[int, int] = {}
    while len(stack):
        # We pop from the stack until all nodes and edges have been
        # generated and their attributes have been set. Un-inserted
        # nodes/edges will be added to the stack as the graph is
        # expanded from the starting point. Nodes/edges that have
        # attributes will be reinserted into the stack until those
        # attributes are "set".
        i = stack.pop(np.random.randint(len(stack)))

        # Snapshot of the graph before this step's action is applied
        gt = gn.copy()  # This is a shallow copy
        if len(i) > 1:  # i is an edge
            # e is the edge expressed in gn's labels (None for endpoints not yet added)
            e = relabeling_map.get(i[0], None), relabeling_map.get(i[1], None)
            if e in gn.edges:
                # i exists in the new graph, that means some of its attributes need to be added
                attrs = [j for j in g.edges[i] if j not in gn.edges[e]]
                if len(attrs) == 0:
                    # If nodes are in cycles, edges leading to them get stacked
                    # multiple times; disregard. Nothing is appended to traj here.
                    continue
                # Set one randomly chosen missing attribute
                attr = attrs[np.random.randint(len(attrs))]
                gn.edges[e][attr] = g.edges[i][attr]
                act = Action(
                    ActionType.SET_EDGE_ATTR,
                    source=e[0],
                    target=e[1],
                    attr=attr,
                    value=g.edges[i][attr],
                )
            else:
                # i doesn't exist, add the edge
                if e[1] not in gn.nodes:
                    # The endpoint of the edge is not in the graph, this is a AddNode action
                    assert e[1] is None  # normally we shouldn't have relabeled i[1] yet
                    # The new node gets the next free label in gn
                    relabeling_map[i[1]] = len(relabeling_map)
                    e = e[0], relabeling_map[i[1]]
                    gn.add_node(e[1], v=g.nodes[i[1]]["v"])
                    gn.add_edge(*e)
                    for j in g[i[1]]:  # stack unadded edges/neighbours
                        jp = relabeling_map.get(j, None)
                        if jp not in gn or (e[1], jp) not in gn.edges:
                            stack.append((i[1], j))
                    act = Action(
                        ActionType.ADD_NODE, source=e[0], value=g.nodes[i[1]]["v"]
                    )
                    if len(gn.nodes[e[1]]) < len(g.nodes[i[1]]):
                        stack.append(
                            (i[1],)
                        )  # we still have attributes to add to node i[1]
                else:
                    # The endpoint is in the graph, this is an AddEdge action
                    assert e[0] in gn.nodes
                    gn.add_edge(*e)
                    act = Action(ActionType.ADD_EDGE, source=e[0], target=e[1])

            if len(gn.edges[e]) < len(g.edges[i]):
                stack.append(i)  # we still have attributes to add to edge i
        else:  # i is a node, (u,)
            u = i[0]
            n = relabeling_map.get(u, None)
            if n not in gn.nodes:
                # u doesn't exist yet, this should only happen for the first node
                assert len(gn.nodes) == 0
                act = Action(ActionType.ADD_NODE, source=0, value=g.nodes[u]["v"])
                n = relabeling_map[u] = len(relabeling_map)
                gn.add_node(0, v=g.nodes[u]["v"])
                for j in g[u]:  # For every neighbour of node u
                    if relabeling_map.get(j, None) not in gn:
                        stack.append((u, j))  # push the (u,j) edge onto the stack
            else:
                # u exists, meaning we have attributes left to add
                attrs = [j for j in g.nodes[u] if j not in gn.nodes[n]]
                # Set one randomly chosen missing attribute
                attr = attrs[np.random.randint(len(attrs))]
                gn.nodes[n][attr] = g.nodes[u][attr]
                act = Action(
                    ActionType.SET_NODE_ATTR,
                    source=n,
                    attr=attr,
                    value=g.nodes[u][attr],
                )
            if len(gn.nodes[n]) < len(g.nodes[u]):
                stack.append((u,))  # we still have attributes to add to node u
        # Record the pre-action snapshot together with the action taken
        traj.append((gt, act))
    # The trajectory ends with a STOP action on the completed graph
    traj.append((gn, Action(ActionType.STOP)))
    return traj
496 | 
497 | 
# Utility type-aliases
# A graph paired with an action; same shape as the entries of the list
# returned by `generate_forward_trajectory`.
StateActionPair = Tuple[Graph, Action]
# An action expressed as three ints; the `sample` method in this file builds
# these as (type, row, column).
ActionIndex = Tuple[int, int, int]
501 | 


--------------------------------------------------------------------------------
/tests/data/test.smi:
--------------------------------------------------------------------------------
  1 | c1ccc(Nc2n[nH]c3cccnc23)cc1
  2 | N#Cc1c(NC(=O)Cc2ccccc2Br)sc2c1CCCC2
  3 | CCCCC(=O)C1CCN(C(=O)c2cc3sccc3n2Cc2ccc(OC)cc2)CC1
  4 | Cc1cc(=O)oc2cc(OCCSC(=S)N3CCN(C(C)C)CC3)ccc12
  5 | C=CCOc1nc(-c2ccc(CP(=O)(OCC)OCC)cc2)nc2ccc(Br)cc12
  6 | CCCCOc1c(CCCC)c(O)nc2ccccc12
  7 | O=C(O)C1=CC(c2ccc(F)cc2)CC(OCc2ccc(CO)cc2)O1
  8 | CC(=O)c1ccc(NC(=S)N2CCN(CCN3C(=O)c4cccc5cccc(c45)C3=O)CC2)cc1
  9 | c1cncc(-c2cc(-c3cccnc3)ncn2)c1
 10 | OCCN1C=NCC1
 11 | O=C(Nc1cccc(F)c1)N1CCN(c2ccnc(Cl)n2)CC1
 12 | COc1ccc(-n2c(C3CC3)nnc2N2CC3CC3(c3ccc(F)cc3Cl)C2)cn1
 13 | CCOC(=O)c1nc2c(c(=O)[nH]1)C1CCCN1C(=O)N2c1ccccc1
 14 | COc1cc(C=NNC(=O)c2ccncc2)ccc1OC(=O)c1cccc(Br)c1
 15 | Cc1nn(-c2ccccc2)c(C)c1NC(=O)COC(=O)Cc1c(C)n(C)c2ccccc12
 16 | Cc1ccc(C2=C(C#N)C(=O)NC(c3cccc(Br)c3)(C(F)(F)F)C2)cc1
 17 | CC1(COc2cccc(-c3ccc(F)cc3F)n2)CCn2cc([N+](=O)[O-])nc2O1
 18 | Cc1nn(C)c2c1C(c1ccc(C(C)C)cc1)C(C#N)=C(N)O2
 19 | Cc1[nH]c2ncnc(Nc3cccc(C#N)c3)c2c1C
 20 | Nc1nccc(-c2ccc3c(NCc4ccccc4)n[nH]c3c2)n1
 21 | CCOc1ccc(NC(=O)CSc2cn(CCNC(=O)c3ccccc3F)c3ccccc23)cc1
 22 | CC(C)(C)NC(=O)C1CCN(Cc2cccc(NC(=O)C3CCOC3)c2)CC1
 23 | CCCCC(=O)NC(Cc1cccc(C)c1)C(=O)NCC#N
 24 | COc1ccc(-c2cc(-c3cccc(S(C)(=O)=O)c3)cnc2N)cn1
 25 | CC(C)n1cc(NC(=O)c2cc(NC(=O)c3ccc(Cl)c(Cl)n3)cn2C)cc1C(=O)Nc1cc(C(=O)NCCCN(C)C)n(C)c1
 26 | CC(C)(NC(=O)OCc1ccccc1)C(=O)OC(=O)C(C)(C)NC(=O)OCc1ccccc1
 27 | CC(C)Cc1cc(C(=O)NCC2(C#N)CCN(CC3=Cc4ccccc4OC3(C)C)CC2)nn1-c1ccccc1
 28 | COCCOc1cc(N)nc(C)c1CNC(=O)c1cnn(Cc2ccccc2)c1
 29 | CC(=O)Nc1ccc2c(c1)[nH]c1cc(C(F)(F)F)ccc12
 30 | COc1ccc(-c2nc3ccccc3c(=O)[nH]2)cc1Oc1ccc(Cl)cc1Cl
 31 | O=C(COC(=O)c1cncc(Br)c1)NC1CCCCCC1
 32 | CN1C2CCC1CC(=NOC(c1ccc(Cl)cc1)c1ccc(Cl)cc1)C2
 33 | COc1ccc(C=C(NC(=O)c2ccc([N+](=O)[O-])cc2)C(=O)NCCN2CCOCC2)cc1
 34 | Cc1ccc(S(=O)(=O)N(C)C)cc1NC(S)=NCc1ccco1
 35 | CCOC(=O)c1cc2ccccn2c(=Nc2ccc(F)cc2)n1
 36 | Cc1nccn1-c1cc(-c2cnc(N)c(OC(F)F)c2)nc(N2CC3CC2C3)n1
 37 | O=C(NCc1ccccc1Br)c1cnc2sc(N3CCCCCC3)nn2c1=O
 38 | CCCCCCNC(=O)CSc1nc2c([nH]c3ccccc32)c(=O)n1-c1ccccc1
 39 | CCN(CC)S(=O)(=O)c1cccc(C(=O)Nc2ccc(-c3cccc(C(N)=O)c3)cc2C(=O)O)c1
 40 | COc1cc(Cl)c(Cl)cc1NC(=O)NS(=O)(=O)c1ccc(C)cc1
 41 | CS(=O)(=O)Oc1ccc(C(=NNc2ccc([N+](=O)[O-])cc2[N+](=O)[O-])c2ccc(OS(C)(=O)=O)cc2)cc1
 42 | CC(C)=CCCC1(C)C=Cc2c(ccc(C(=O)C=Cc3ccc(C#N)cc3)c2O)O1
 43 | O=C1CC(=C2c3ccccc3-c3ccccc32)C(=O)N1CCCCN1CCN(c2cccc(C(F)(F)F)c2)CC1
 44 | CN(C)C(=O)CN1CC2CCN(C(=O)C3CCCO3)CCC2S1(=O)=O
 45 | CN(C)C(=O)Cc1cn(-c2cccc(C(F)(F)F)c2)nc1-c1ccc2c(c1)CCc1ccccc1-2
 46 | O=C(Nc1cc(Br)ccc1C(=O)O)c1ccc(SC(F)(F)F)cc1
 47 | CCn1c(COc2ccccc2C)nnc1SCC(=O)CC(=O)Nc1ccccc1OC
 48 | Cc1cccc(CN2CCN(CC(=O)O)C2=O)c1
 49 | CCc1c(-c2nnc(C3(c4ccc(OC)cc4)CC3)o2)nc(-c2ccc(Cl)cc2Cl)n1-c1ccc(Br)cc1
 50 | O=C1NC(=O)C(=Cc2cnn3c(NC4CC4)cc(NC(=O)OCc4ccccc4)nc23)N1
 51 | Cc1ccccc1NS(=O)(=O)c1cc2c(cc1C)NC(=O)CO2
 52 | CCCCCCCN(CC(=O)Nc1ccc(C)cc1C)Cc1ccc(OC(C)(C)C(=O)O)cc1
 53 | Cc1cc(NC(=O)Nc2cccc(OC(F)(F)F)c2)c2ccccc2n1
 54 | CC1=C(CCc2ccoc2)C2(C)CCCC(C)(C)C2C(O)C1=O
 55 | COc1ccc(NC(=O)CCN2C(=O)NC3(CCCC3)C2=O)cc1Cl
 56 | Cc1nc(SCC(=O)c2ccc(S(N)(=O)=O)c(Cl)c2)[nH]c(=O)c1Cc1ccccc1
 57 | O=C(O)CCNc1nc(Cc2nnc(SCC(=O)NNC(=O)CCl)n2NC(=O)c2cccc([N+](=O)[O-])c2)cs1
 58 | Cc1cccc(C)c1N1C(=O)c2nccnc2C1=O
 59 | CCc1nc2ccc(C3CCN(S(C)(=O)=O)CC3)cn2c1N(C)c1nc2c(s1)Cc1ccc(F)cc1-2
 60 | Cc1cc(C)cc(Oc2nccc(-c3c(C)nnn3C3CCN(C(=N)N)CC3)n2)c1
 61 | CS(=O)(=O)c1ccc(-c2c(-c3ccccc3)cc(-c3ccc(F)cc3)oc2=O)cc1
 62 | COc1cc(-c2nc(=O)c3c([nH]2)sc2cc(C(F)(F)F)ccc23)ccc1OCC(=O)O
 63 | O=C(Nc1ccc(SC(F)(F)F)cc1)Nc1ccc(SC(F)(F)F)c(Cl)c1
 64 | COCc1nn(CCn2cc(Br)cn2)c(=O)o1
 65 | O=C(Cc1ccccc1)NCc1ccc(-c2nc(C(=O)N3CC=CCC3)co2)cc1
 66 | COc1ccc(C2=C(c3ccc(SC)cc3)CC3CCCN3C2=O)cc1C
 67 | CC[N+]1=C(C)C(C)(C)c2cc(C)ccc21
 68 | COc1ccc(C)cc1NC(=O)CN(C)S(=O)(=O)c1ccc2[nH]c(=O)oc2c1
 69 | Cc1cc2cc(C#N)ccc2c(C)c1Nc1nc(Nc2ccc(C#N)cc2)nc(OCCCN2CCOCC2)n1
 70 | NCCC(N)C(=O)N1CCCCC1
 71 | CN(C)c1ncccc1C(=O)N1CCCC1Cn1cccn1
 72 | COc1ccc(NC(=O)CN(C)C(=O)c2cc(C3CC3)nn2-c2ccccc2)cc1
 73 | NS(=O)(=O)c1ccc(NC(=O)c2cccc(C(=O)O)n2)c(F)c1
 74 | CCN1C(=O)C(C)(C)Oc2cc(C)c(-c3cc(C(C)=CC(=O)O)ccc3OC(F)(F)F)cc21
 75 | NC1CCCC(C(=O)Nc2ccc3[nH]nc(Nc4cccc5cccnc45)c3c2)C1
 76 | CCCCc1ccc(CC(=O)Nc2cc(S(=O)(=O)N3CCOCC3)ccc2O)cc1
 77 | O=C(c1nc(-c2ccc(Cl)cc2)c2cc(Cl)ccc2n1)N1CCCCC1
 78 | NNc1ncnc2[nH]cnc12
 79 | COc1ccc(C2C(Oc3ccc(Cl)cc3Cl)C(=O)N2c2sc3c(c2C#N)C=C(C=Cc2c(C)nn(-c4ccccc4)c2Cl)CC3(C)C)cc1
 80 | CON=C1CN=C(C(C)C)N1c1ccc(C(O)(C(F)(F)F)C(F)(F)F)cc1
 81 | O=C(CCC(NC(=O)c1cc(Cl)cc(Cl)c1)C(=O)N1CCC2(CCCC2)CC1)NC(Cc1ccc2ccccc2c1)C(=O)O
 82 | O=C(NC1=C(N2CCOCC2)C(=O)c2ccccc2C1=O)c1ccccc1
 83 | CCCc1cc(=O)[nH]c(SCC(=O)c2ccc(S(N)(=O)=O)c(Cl)c2)n1
 84 | COc1cc2nc(N(C)c3nc(OC)c4ccccc4n3)nc(C)c2cc1OC
 85 | O=C(NO)c1ccc[n+]([O-])c1
 86 | CC(C)NC(=O)CSc1nc2sccc2c(=O)n1-c1ccccc1
 87 | CSc1nc(C(=O)N2CCCC(C(=O)c3ccc(Cl)cc3)C2)cs1
 88 | CCCc1nc(N)c2nc(-n3nccn3)n(C)c2n1
 89 | O=C(O)Cn1c2c(c3cc(F)ccc31)CN(C(=O)CCc1cccc3ccccc13)CC2
 90 | COc1ccc(C(=O)Nc2ccc(C(C)N(C)c3ncc4c(N)nc(N)nc4n3)cc2)cc1
 91 | C=CCc1cc(CN2CCN(C(C)C)CC2)c(O)c(-c2ccc(O)c(CC=C)c2)c1
 92 | NC1(C(=O)O)CC2CCC1C2
 93 | O=C(NCCCCCn1c(C2CCNCC2)nc2cc(Cl)c(Cl)cc21)c1ccccc1
 94 | CC(C)(NC(=O)c1ccc(C#Cc2ccccc2)cc1)C(=O)NO
 95 | O=c1[nH]c2ccccc2n2c(=O)n(-c3ccc(Cl)cc3)nc12
 96 | CCOC(=O)c1ccc(NC2CCCN(c3nnc(Cc4ccccc4)c4ccccc34)C2)nc1
 97 | Cc1nc(C(O)CNC(C)(C)C)ccc1O
 98 | COc1cc(CCC(=O)OCC(=O)Nc2ccc(C)cc2Cl)cc(OC)c1OC
 99 | Cc1noc(C)c1CN(C)C(=O)NC(C)c1cccs1
100 | CCS(=O)(=O)c1cc(C#N)ccc1C1C2=C(CCC2=O)N(c2cccc(C(F)(F)F)c2)C(=O)N1C(=O)NC
101 | FC(F)(F)C1(c2nnc(-c3nn(-c4ccc(Cl)cc4Cl)c(-c4ccc(Br)cc4)c3Cn3cncn3)s2)CC1
102 | CC#CC(CC(=O)O)c1ccc(OCc2ccc(CN3CCC4(C=Cc5ccccc54)CC3)cc2)cc1F
103 | Cc1ccnc(NCCCCCCC(=O)NC(CC(=O)O)c2cccc([N+](=O)[O-])c2)c1
104 | N=C(N)NN=Cc1c(-c2ccc(Cl)c(Cl)c2Cl)nc2n1CCS2
105 | O=C(NCCC1CCN(Cc2ccccc2)CC1)c1ccc(Cl)cn1
106 | CN(C)Nc1nc(-c2ccc([N+](=O)[O-])o2)cs1
107 | CC(=O)N1C2CCC1c1c(n(C)c3cc(-n4ccc(OCc5ccccc5)cc4=O)ccc13)C2
108 | O=C1CSC(N2N=C(c3ccc(Br)cc3)CC2c2ccco2)=N1
109 | Fc1ccc2[nH]c3c(c2c1)CN(CCCCc1ccncc1)CC3
110 | O=C(O)c1ccc(C(=O)c2cc([N+](=O)[O-])cc([N+](=O)[O-])c2)cc1
111 | O=c1[nH]cnc2c1ncn2C1OC2COP(=O)(O)OC3C(O)C(COP(=O)(O)OC2C1O)OC3n1cnc2c(=O)[nH]cnc21
112 | CCOC(=O)N1CCN(S(=O)(=O)N2CCCC(C(=O)NCc3ccc(OC)cc3)C2)CC1
113 | NC1=CC(=O)c2ncccc2C1=O
114 | CNCc1cc(C(=O)NC)ccc1Oc1ccc(Cl)cc1C
115 | O=C(Cc1ccc(F)cc1)N1CCN=C1SCc1cccc([N+](=O)[O-])c1
116 | CC(Nc1cc(F)cc(F)c1)c1cc(C(=O)N(C)C)cc2c(=O)cc(N3CCOCC3C)oc12
117 | C=C(COC)C1CCC2(C(=O)O)CCC3(C)C(CCC4C5(C)CCC(OC(=O)n6ccnc6C)C(C)(C)C5CCC43C)C12
118 | O=C(OCCCc1cccnc1)C1CCCCN1S(=O)(=O)Cc1cccc([N+](=O)[O-])c1
119 | CCN(CC)c1ccc(C(=O)c2sc(NC(=O)C3(c4ccc5c(c4)OCO5)CC3)nc2-c2ccccc2)cc1
120 | FC1(F)CCNC(C2COC(c3ccccc3)(c3ccccc3)O2)C1
121 | Cc1ccc2c(c1Br)C(=NN1C(=O)c3ccccc3C1=O)C(=O)N2
122 | O=C(C1CC(=O)N(C2CCCCC2)C1)N1CCC2(CC1)OCCO2
123 | O=C(O)C(Cc1ccc(F)cc1)C1CS1
124 | O=S(=O)(c1ccccc1)N(CC(F)(F)F)c1ccc(C(O)(C(F)(F)F)C(F)(F)F)cc1
125 | CC1Cc2ccccc2N1C(=O)COC(=O)C=Cc1ccc(F)cc1
126 | O=C(O)C1C(c2ccccc2)C(C(=O)OCc2ccccc2)C1c1ccccc1
127 | OC1CCCCCCC1
128 | COCC#Cc1nn(C2OC(COS(N)(=O)=O)C(O)C2O)c2ncnc(N)c12
129 | CN(C1CCCCC1)S(=O)(=O)c1cccc2nsnc12
130 | CC(=O)N(C)Cc1cn2cc(NC(=O)c3ccc(-c4ccc(C(F)(F)F)cc4)cc3)ccc2n1
131 | COc1cccc2oc3c(N4CC5CC4CN5)nc(N)nc3c12
132 | COc1cc2c(cc1OC)CN(CCCCn1c3ccccc3c3ccccc31)CC2
133 | OC(COc1cccc2[nH]ccc12)CN1CCC(COc2ccccc2)CC1
134 | C=CC(=O)Nc1cccc(Br)c1
135 | O=C(NC1CCN(S(=O)(=O)c2ccc(OC3CCNCC3)c(Br)c2)C1)c1ccc(Cl)c(Cl)c1
136 | Cc1ccc(C)n1NC(=O)c1ccc(O)cc1
137 | Cc1cccc(CN2CC3CN(C(=O)NC(C)C)CC3C2=O)c1
138 | COc1cc(N2CCN(N3CCN(C)CC3)CC2)ccc1Nc1ncc2ccc(-c3ccccc3OC)n2n1
139 | Cc1ncc(CNC2CCN(CCn3c(=O)ccc4ncc(F)cc43)CC2F)cc1C#N
140 | O=C(C1Cc2c(sc3ccccc23)CN1)N1CCNCC1
141 | CCCCCCCCCCCCCCCCNC(=O)c1c[nH]c(-c2ccccc2)n1
142 | O=c1c2c(-c3ccccc3)c3ccccc3nc2cnn1-c1ccccc1
143 | O=C(NCc1ccc(F)cc1)C(=O)c1cn(CC(=O)N2CCOCC2)c2ccccc12
144 | Cc1cnn(C(C)C2CC2)c1NC(=O)c1ccco1
145 | Cc1[nH]n(-c2ccccc2)c(=O)c1N=Nc1ccc2c(c1)OCO2
146 | O=C(Nc1ccc2cn[nH]c2c1)NC1CCCCC1CN1CCCC(Cc2ccc(F)cc2)C1
147 | COc1ccc2[nH]cc3c2c1CC(N(C)C)C3
148 | CCC=CCCCCCCCCCCCCCCCCCC(=O)O
149 | CCC(N)Cc1cc(OC)c(SC)cc1OC
150 | COc1ccc(NC(=O)NCCN2CCOCC2)cn1
151 | O=C1NC(=O)N(c2ccccc2)C(=O)C1=CNc1cccc2c1C(=O)c1ccccc1C2=O
152 | CCOc1cc(=O)n(C)cc1-c1cc(NC(=O)CCc2ccc(OC)c(OC)c2OC)ccc1Oc1ccc(F)cc1F
153 | CS(=O)(=O)c1cc(-c2ccc(F)cc2)cc(S(=O)(=O)c2ccc(CN)s2)c1
154 | Cc1cc(Nc2ccccc2Cl)n2nc(-c3ccco3)nc2n1
155 | CC#Cc1cc(-c2ccc3c(c2)C2(N=C(C)C(N)=N2)C2(CCC(OC)CC2)C3)ccn1
156 | COc1cncc(-c2cc3ccc(OC)cc3cc2C)c1
157 | Cc1noc(C)c1CN(C)C(=O)c1nc(N2CCCC2)ncc1Cl
158 | CC(NP(=O)(OCC1OC(n2cnc3c(=O)[nH]c(N)nc32)C(C)(O)C1O)Oc1cccc2ccccc12)C(=O)OC1CCCC1
159 | COc1ccccc1CNC(=O)C1CCN(c2nc3ccc(C)cc3[nH]2)CC1
160 | c1coc(C2=Nc3ccccc3SC(c3ccc4c(c3)OCO4)C2)c1
161 | O=c1c(-c2ccccc2)cncn1C(CN1CCCC1)c1ccccc1
162 | COc1ccc(-c2nn(CCn3ccnc3)c(=O)c3ccccc23)cc1OC
163 | CN(C)CCCNC(=O)C(c1cccc(F)c1)N(C)C
164 | O=C(COC(=O)CNC(=O)c1ccc(Cl)cc1)Nc1ncc(C(F)(F)F)cc1Cl
165 | CCC(O)C1CC(Cc2ccccc2)CCN1CCCNC(=O)Nc1cccc(C(C)=O)c1
166 | Cc1ccc(CSc2nc(N)cc(NCCC(=O)N3CCCC3)n2)cc1
167 | O=C(CO)NC1C(O)CC(OCc2ccc(-c3ccccc3)cc2)(C(=O)O)OC1C(O)C(O)CNCc1ccc(-c2ccc(O)cc2)cc1
168 | O=C(NCCc1ccccc1)c1ccc2c(c1)S(=O)(=O)N=C1CCCCCN12
169 | CCC(=O)CCCCCC(NC(=O)CC1CC2(C1)CN(C)C2)c1ncc(-c2cc3ccccc3nc2OC)[nH]1
170 | CCN1CC2(COC(=O)c3ccccc3N)CCC(OC)C34C5CC6C(OC)CC(O)(C5C6OC)C(O)(C(OC)C23)C14
171 | COc1ccc2nc(Cl)c(C3CC(c4ccco4)=NN3C(=O)CCCC(=O)O)cc2c1
172 | O=Cc1ccc(-c2ccc(Cl)cc2F)o1
173 | O=C(Nc1ccc(C(F)(F)F)cc1)N1CCCC1c1ncnn1Cc1ccc(Cl)cc1
174 | COc1ccc2c(c1)-c1c(N3CCNCC3)nc3ccccc3c1C2=O
175 | O=C(c1ccco1)N(Cc1cccs1)c1ccc(O)cc1
176 | O=C1CCCC2C3CCC[N+]4([O-])CCCC(CN12)C34
177 | O=C1CCCN1CC#CC[S+]1CCCC1
178 | O=C(CCCOC(=O)c1ccc(O)cc1)c1ccccc1
179 | COc1ccc(NC(=O)c2cc(-c3ccc(F)cc3OC)[nH]n2)c(OC)c1
180 | CCN(Cc1ccc(Cl)nc1)C1=C([N+](=O)[O-])CN(Cc2ccc(OC)cc2)CN1C
181 | O=C(CCCN1CCN(C(=O)c2ccccc2)CC1)NC1c2ccccc2SCC2CCCCC21
182 | CCOc1ccc(S(=O)(=O)N2CCOCC2)cc1NC(=O)CSCc1ccc(C)cc1
183 | COC(=O)C(=Cc1ccc(OC)c(OC)c1)c1ccc(OC)c(OC)c1
184 | COc1ccc(-c2nc(NCc3cccc(C=CC(=O)NO)c3)cc3c2[nH]c2ccccc23)cc1
185 | CCN1C(=O)c2cccc3c(S(=O)(=O)NC4CCOC4)ccc1c23
186 | COc1ccccc1NC(=O)C(NC(=O)c1ccc(C)cc1)c1ccccc1
187 | Cc1nc2cc(S(=O)(=O)c3ccc4[nH]c(C)nc4c3)ccc2[nH]1
188 | CC1CN(C(=O)C2CN(C(C)(C)C)CC2c2ccc(F)cc2F)CC(C)C1(O)c1ccc(C(F)(F)F)cc1
189 | COc1ccc(N)cc1C1CN(c2nc(-c3ccncc3F)cc(=O)n2C)CCO1
190 | Cc1cccc(CN2CCN(c3n[nH]c(N)n3)CC2)c1
191 | Cn1cncc1C(O)(C#Cc1ccc(C#N)cc1)c1ccc(C#N)c(-c2cccc3ccccc23)c1
192 | CC(C)(O)c1ccn2c(-c3ccc(F)c(-c4ccccc4C#N)c3)cnc2c1F
193 | COc1ncccc1-c1csc2ncnc(N3C4CCC3CC(NC3CCC3)C4)c12
194 | CSC1CN(C(=O)c2ccc3cc(Oc4ccc(C(F)(F)F)cn4)ccc3n2)CC1O
195 | Cc1nn(C)cc1C(=O)NC1CCOCC1O
196 | Cc1nc2ncnn2c(N2CCN(C(=O)c3ccco3)CC2)c1Cc1ccccc1
197 | CN(C)C(=N)c1ccc(C(=O)Nc2ccc(Cl)cc2C(=O)Nc2ccc(Cl)cn2)c(N2CCCCC2C(=O)O)c1
198 | C=CCN1C(=O)C(=Cc2ccc(-c3ccccc3)cc2)SC1=S
199 | Cc1c(S(=O)(=O)NC2CCN(CCCc3noc4ccccc34)C2)sc2ccc(F)cc12
200 | O=C(CC1CCC(NC(=O)Nc2ccccc2F)C(CO)O1)NCc1ccc(-c2ccccc2)cc1
201 | CC(C)=CCCC(C)=CCc1c(O)ccc(C=O)c1O
202 | CC(=O)NC(C)C(=O)NCCCc1ccccc1
203 | COc1nc(NC2OC(COC(C)=O)C(OC(C)=O)C(OC(C)=O)C2OC(C)=O)c(N=C(C)C(C)=O)c(=O)n1C
204 | O=C(CCCN1C(=O)c2ccc(F)c(F)c2C1=O)Nc1ccc(-c2ccccc2)cn1
205 | CCOC(=O)c1c(C)nc(-c2cccc(C#N)c2)n1OCC(N)=O
206 | Cc1cc(C(=O)CN2C(=O)C(=O)N(C3CCCC3)C2=O)c(C)n1CCc1ccccc1
207 | CCn1c(CCC(O)CC(O)CC(=O)O)c(-c2ccc(F)cc2)c(C)c1C(=O)NCc1ccc(C(=O)OC)cc1
208 | CCNC(=O)Nc1ncnc2c1ncn2C1OC(CSCCCS(=O)(=O)O)C2OC(C=Cc3ccccc3)OC21
209 | CCCn1c(=O)c2[nH]c(-c3ccc(OCC(=O)Nc4ccc(C(C)=O)cc4)cc3)cc2n(CCC)c1=O
210 | COc1ccccc1C=CCN1CCN(CCOC(c2ccc(F)cc2)c2ccc(F)cc2)CC1
211 | C=C(C(=O)OCCC)C(O)c1ccccc1[N+](=O)[O-]
212 | CC(C)(C)[Si](C)(C)Oc1ccc2ccc(=S)oc2c1
213 | COC(=O)C(CCCCNC(=O)OC(C)(C)C)N(C=CCc1cccc(Oc2ccc(C(C)(C)C)cc2)c1)Cc1cccc(OCc2ccccc2)c1
214 | c1ccc(CN2C(c3ccccc3)=Nc3ccccc3C2c2ccccc2)cc1
215 | COc1ccc(C2c3nc(CCCO)ccc3C(c3ccc4c(c3)OCO4)C2C(=O)O)cc1
216 | O=c1[nH]ccc2c(Cc3nnc4ccc(-c5ccsc5)nn34)cccc12
217 | O=C(Cc1ccccc1)OCC1OC(=O)NC1CN1CCOCC1
218 | O=C(CN1CCOCC1)Nc1ccc(Br)cc1C(=O)c1ccccc1
219 | COc1ccc2nc(-c3cccnc3-c3cc(C(F)(F)F)ccc3Cl)[nH]c2c1
220 | O=C(CNC(=O)c1ccco1)Nc1ccc(NC(=O)N2Cc3ccccc3C2)cc1
221 | CCc1cccc(NC(=O)C2CCN(S(=O)(=O)c3cccs3)CC2)c1
222 | Cn1cc(NC(=O)c2cc(NC(=O)c3cc(NC(=O)CN=C(N)N)cn3C)cn2C)cc1C(=O)NCCC(=N)N
223 | COc1cn2c(c(C(C)C)c1=O)C(=O)c1ccnc(CC(C)C)c1-2
224 | CC1(C)C=Cc2c(cc3c(c2O)C(=O)CC(c2ccc(O)cc2O)O3)O1
225 | CCc1ccc2c(c1)NC(=O)C2(C)Cc1ccccc1OC
226 | CCC(=O)N1CCC2C(CC(Cn3cccn3)N2c2nccs2)C1
227 | CC(C)(C)OC(=O)NC(=NCC1CCCCC1)NCCCc1c[nH]cn1
228 | COc1ccc(-c2cn3nc(N4CCCC(C(=O)Nc5cc(Cl)ccc5OC)C4)sc3n2)cc1
229 | CC[Si]1(C#CC2(O)CCC3C4CCc5cc(O)ccc5C4CCC32C)CCCCC1
230 | CCCCC(CO)NC(=O)C(C)NC(=O)C(O)c1cc(F)cc(F)c1
231 | CSc1cccc(NC(=O)Nc2cccc3ccc(O)cc23)c1
232 | COc1c(O)c2c3c(c1OC)CCNC3Cc1ccccc1-2
233 | C=C(Br)CN1CCC2(c3ccccc3)CC1Cc1ccc(O)cc12
234 | COc1ccc(C2OC(O)C3C(c4cc(OC)c5c(c4)OCO5)OC(=O)C23)cc1OC
235 | Brc1cncc(C2CN3CCC2CC3)n1
236 | COc1cc(-c2cccc(C3CC3)c2)c(F)cc1-n1c(=O)ccc2cc(S(=O)(=O)Nc3ccon3)ccc21
237 | O=C(CCl)Nc1cccc(C(=O)N2CCC(c3cc4c(-c5cnn6ncccc56)ccnc4[nH]3)C2)c1
238 | CCNC(=O)Nc1nc2nc(N)ncc2cc1-c1c(Cl)cccc1Cl
239 | O=C(NCCc1c[nH]c2ccccc12)c1ccc[n+](Cc2ccccc2F)c1
240 | Cc1nn(CCOc2ccccc2)c(=O)n1-c1ccccc1
241 | O=c1nc(N2CC3CN(Cc4ccc(F)cc4)CC3C2)sc2c([N+](=O)[O-])cc(C(F)(F)F)cc12
242 | Cc1cncc(-c2cc3c(cn2)cnn3-c2cnc(C3CC3)c(N3CCCC(N)C3)n2)n1
243 | O=C(OCC(=O)C12CC3CC(CC(C3)C1)C2)c1ccc(CO)cc1
244 | COc1cc(OC)c(-c2cn3ccc(N4CCC(NCc5ccnc(Cl)c5)C4)cc3n2)cc1Cl
245 | COC(=O)C1=C(C(=O)OC)C2N(Cc3ccccc3)c3ccccc3C23CC(CO)NC3=N1
246 | CC(C)(C(=O)SSC(=O)C(C)(C)c1ccccc1)c1ccccc1
247 | CCc1nnc(NC(=O)CCN(c2ccc(C)cc2)S(=O)(=O)c2ccccc2)s1
248 | COc1ccc(COC(=O)C2=C(C)NC(=O)NC2c2cccc(OC)c2OC)cc1
249 | Cc1cc(C)cc(NC(=O)C2CCN(S(=O)(=O)c3c(C)noc3C=Cc3ccco3)CC2)c1
250 | c1ccc(CNCCNC2CC2)cc1
251 | O=S(=O)(NCCCCO)c1cccc(-c2ccc(C(F)(F)F)cc2)c1
252 | O=C(C(O)c1ccccc1Cl)N1CCCN(Cc2cccnc2)CC1
253 | COC(=O)c1ccc(CNc2nc(NCCCN(C)C)nc(NCc3ccc(C(=O)OC)cc3)n2)cc1
254 | CN(C)S(=O)(=O)N1CCN(S(=O)(=O)c2ccc3ccccc3c2)CC1
255 | CCCCN1C(=O)C(=Cc2ccc(OCC(=O)OC)cc2)SC1=Nc1cccc(C(=O)O)c1
256 | O=S1(=O)c2ccccc2-c2ccccc2N1CCCN1CC=C(c2ccccc2)CC1
257 | O=C(Cc1cccs1)N(Cc1ccco1)C1(C(=O)NC2CCCCC2)CCCCC1
258 | O=S1(=O)CCOCC(c2ccccc2)N1Cc1ccccc1F
259 | CS(=O)(=O)NC(CCc1ccccc1)C(=O)NC(C=O)Cc1ccccc1
260 | CC(C)Cc1nn2c(=O)cc(COC(=O)c3cccc(NS(C)(=O)=O)c3)nc2s1
261 | CC(Cl)(Cl)C(NC(=O)c1cccs1)NC(=NC#N)Nc1ccc(Cl)nc1
262 | O=C(NO)c1ccc(Cn2nnc(-c3cccc(S(F)(F)(F)(F)F)c3)n2)cc1
263 | CC(=O)N1CCC(c2nccnc2-c2ccc(F)cc2)CC1
264 | COc1cccc(-n2nc(NC(=O)C3CNC(=O)C3)cc2-c2cccc(OC(F)(F)F)c2)c1F
265 | CC(C)(C)CN1CCC2(CC1)CN(c1ccccc1Nc1ccc(-c3ccccc3)nn1)c1c(O)ccc(Cl)c12
266 | Cc1ccc(CNCC2(F)CCN(C(=O)c3ccco3)CC2)nc1
267 | CC(=O)OC1c2ccccc2-c2nc(N3CCOCC3)c3ccccc3c21
268 | COc1ccc2c(CNCCc3ccco3)c(C(=O)O)n(Cc3ccccc3)c2c1
269 | CC1CN(C(C)CO)C(=O)c2cc(NC(=O)Cc3cn(C)c4ccccc34)ccc2OC1CN(C)C(=O)Nc1ccc(C(F)(F)F)cc1
270 | CC(C)(C)CCN1C(=O)C(CC(=O)N2CCC(N3Cc4ccccc4NC3=O)CC2)SC1c1ccccc1N1CCNCC1
271 | O=C(O)c1ccc(-c2ccccc2C(=O)NCCCCCCCn2cc(-c3cccnc3)nn2)cc1
272 | COc1ccc(N2CCN(CCCNC(=O)c3ccc(CS(=O)(=O)c4ccc(OC)cc4)o3)CC2)cc1
273 | Cc1ccc(S(=O)(=O)N2CCc3ccccc3C2CC(=O)NCCc2ccc(C3=NCCN3)cc2)cc1
274 | NC(Cn1c(=O)c(-c2ccc(COC(=O)CO)cc2)cn(Cc2c(F)cccc2C(F)(F)F)c1=O)c1ccccc1
275 | COc1cc(-c2cnc(N)c3c(-c4ccc5ccccc5c4)csc23)cc(OC)c1OC
276 | NCCn1oc(=O)[nH]c1=O
277 | Cc1cc(C(=O)NN=Cc2cccc([N+](=O)[O-])c2)c(C)o1
278 | CNC(=O)c1c(-n2ccc(C)cc2=O)oc2cc(N(C)S(C)(=O)=O)c(-c3ccc4c(n3)-c3cc5c(F)cccc5n3CC4)cc12
279 | Cc1coc2cc3oc(=O)c(CC(=O)NC(CC(C)C)C(=O)O)c(C)c3cc12
280 | COc1ccc(C(=O)NCC(=O)N2CCN(C(=O)c3ccco3)CC2)cc1[N+](=O)[O-]
281 | CC(=O)Nc1cccc(C(=O)Nc2cccc(-c3ccc(-c4nc5cc(F)ccc5[nH]4)s3)c2)c1
282 | COC(=O)Nc1coc(C(=O)Nc2coc(C(N)=O)c2)c1
283 | O=C(NNC(=S)Nc1ccccc1)C12CC3CC(CC(C3)C1)C2
284 | C=C1CC2C3C=C(Br)C4=CC(=O)CCC4(C)C3CCC2(C)C1(OC(C)=O)C(C)=O
285 | CNc1ccc2c3c(n(CCCNC(=O)OC(C)(C)C)c(=O)c2c1)-c1ccccc1C3O
286 | CCC(N)(CC)CNS(=O)(=O)c1cccc(C(C)=O)c1
287 | COc1cccc(C(=O)NN=Cc2ccc(OC)c3ccccc23)c1
288 | Oc1c(O)c(Cl)c2c(c1Cl)CCN(C(=S)NCc1ccc(Cl)cc1)C2
289 | CC(Cn1cnc2c(N)ncnc21)OCP(=O)(O)NC(CCCNC(=N)N)C(=O)O
290 | Cc1ccccc1C=C1CCc2ccccc2C1=O
291 | CC(C)C1CN(C(=O)c2ccc(O)cn2)CC1N(C)C
292 | CC1(C)CC2CC(C)(CN2C(S)=Nc2ccccc2F)C1
293 | CCOC(=O)C1Oc2ccc(CNC(=O)C3SCCN3C(=O)CC(N)Cc3cc(F)c(F)cc3F)cc2O1
294 | Cc1ccccc1NC(=S)NN=C1CC2C(CCC3CC(O)CCC32C)C2CC3OC4(CCC(C)CO4)C(C)C3C12C
295 | COc1ccc2c3c([nH]c2c1)C(CO)N(C(C)=O)CC31CCN(Cc2ccccc2Cl)CC1
296 | NC(CC(O)CP(=O)(O)O)C(=O)O
297 | CCC(CC)CC1(C)CC(CC)C(CC(=O)OC)OO1
298 | N=C(NO)NN=Cc1c(Cl)cccc1[N+](=O)[O-]
299 | FC(F)(F)c1cccc(-c2nc3ccc(Nc4ccnc5ccccc45)cc3[nH]2)c1
300 | COc1ccc(-c2ccc(C(=O)NCCCCc3cccnc3)cc2OC)cc1OC
301 | CN1C(=O)C(N2CCc3[nH]c(Cc4ccccc4)nc3C2=O)COc2ccccc21
302 | O=C(Nc1ccc(Oc2cc(O)cc(O)c2)c(Cl)c1)c1cc(Cl)cc(Cl)c1O
303 | N#Cc1cn(-c2ccc(C(=O)O)cc2)cc1-c1ccccc1OCCCCO
304 | CC12Cc3cnn(-c4ccc(F)cc4)c3C=C1CCN(S(=O)(=O)c1ccc(C(C)(C)C)cc1)C2
305 | COc1ccc(-c2ccc3c(c2)C2CC(N(C)S(=O)(=O)c4ccccc4)C(C(C)O)C(=O)N2CC3)cc1
306 | O=C(CC1CCN(Cc2ccncc2)CC1)N1CCC(n2c(=O)[nH]c3ccccc32)CC1
307 | O=C(NCCCNC(=O)c1cc(-c2ccccc2)on1)c1cccnc1
308 | O=C(NCCCc1nnc2ccccn12)C1CCCN1
309 | CSC1=C(C(=O)O)N2C(=O)C(NC(=O)C(N)c3ccccc3)C2CC1
310 | COc1cc(O)cc2oc(-c3ccc(O)cc3)cc(=O)c12
311 | CC(Nc1nc(N)nc(N)c1Cl)c1nc2cccc(Cl)c2c(=O)n1-c1cc[nH]n1
312 | COCCNC(=O)NC1(c2nc(C)no2)CCCCC1
313 | CCCc1nc(SC)n2c(O)nnc2c1Cc1ccc(-c2ccccc2-c2nn[nH]n2)cc1
314 | CCCCCCCCCC1=C(O)C(=O)C(CCCCC)=C(O)C1=O
315 | Cc1cccc(C(=O)N2CC3OCCN(C(C)C)C3C2)n1
316 | CNC(Cc1ccccc1)C(=O)N1CCCC1C(=O)NC(C)(CCCNC(=N)N)C(=O)c1nc2ccccc2s1
317 | Cc1ccc2[nH]c3c(NCc4ccccc4)ncnc3c2c1
318 | O=C(COC(=O)CCc1ccc(S(=O)(=O)N2CCOCC2)cc1)Nc1ccccc1OC(F)F
319 | CCCCNC(=O)CCn1c(=O)c2ccccc2n(Cc2ccc(F)cc2)c1=O
320 | CC(C)(C)CC(N)CN
321 | CCCCCCCSc1cccc2c1CN(C1CCC(=O)NC1=O)C2=O
322 | O=C(NCCc1ccc(C(F)(F)F)cc1)C1CC(=O)N(c2ccc3c(c2)OCCO3)C1
323 | COC1CCCCC1Nc1cc2c(c3nsnc13)C(=O)c1ccccc1C2=O
324 | CCOC(=O)C1=C(Nc2ccc(C(F)(F)F)cc2)C(=O)N(c2ccc(C(F)(F)F)cc2)C1c1ccc(OC(F)(F)F)cc1
325 | CCOC(=O)C(Cc1ccccc1N)(C(=O)OCC)N1CCN(Cc2ccc(Cl)cc2)CC1
326 | COc1cc(OC)c(Cl)c(-c2ccc3c(NC(=O)c4ccc(N5CCN(C)CC5)cc4)n[nH]c3n2)c1Cl
327 | N#CC1(C(=O)NCC2CCC3(CCNCC3)O2)CC1
328 | O=C(CNC(=O)CNC(=O)c1ccccc1)NCC(=O)Nc1ccc(Oc2cccc(NC(=O)CNC(=O)CNC(=O)CNC(=O)c3ccccc3)c2)cc1
329 | COc1ccc2c(OC3CC4C(=O)NC5(C(=O)NS(=O)(=O)C6(C)CC6)CC5C=CCCCCCNC(=O)N4C3)cc(-c3nc(C(C)C)cs3)nc2c1C
330 | COCC(C)Oc1ccc2[nH]nc(-c3cc(N4CCOCC4)ncn3)c2c1
331 | Cc1ccc(OC(=O)C(Cc2ccccc2)NS(C)(=O)=O)cc1
332 | O=S(=O)(c1cccc(S(=O)(=O)N2CCC(n3nnc4ccccc43)CC2)c1)N1CCCCC1
333 | CN(CCCNc1c2ccccc2nc2cccc([N+](=O)[O-])c12)CCCNc1ccc([N+](=O)[O-])c2[nH]c3ccccc3c(=O)c12
334 | CCNc1nc(NC(C)C)nc(N(C#N)CC)n1
335 | CCc1cn(CS(C)(=O)=O)c(CC)c1Oc1ccc(C#N)cc1
336 | Cc1nnc2ccc(-c3ccc(OC(F)(F)F)cc3)cn12
337 | O=C(O)c1cn2c(ccc3cc([N+](=O)[O-])ccc32)n1
338 | CC(c1ccon1)N(C)C(=O)c1cccc(N)n1
339 | CN1CC2CC2(c2cc(Cl)c([N+](=O)[O-])cc2[N+](=O)[O-])C1
340 | CCc1c(C(=O)Nc2ccc(F)c(C(=O)NC)c2)cnn1CC(C)C
341 | COc1cc(C)ccc1OCCSc1nc2ccc(NC(=O)c3ccc(F)cc3)cc2s1
342 | COC1OC(CS(=O)(=O)CCC(C)(C)N(Cl)Cl)C(O)C(O)C1O
343 | O=C(OC1CN2CCC1CC2)c1ccc(Cl)cc1
344 | O=C(Cn1cc(-c2ccc(Cl)c(Cl)c2)nn1)NC12CC3CC(CC(C3)C1)C2
345 | Cc1ccc(NC(=O)Cn2cc(S(=O)(=O)NC(C)C)c(S(=O)(=O)NC(C)C)c2)cc1Cl
346 | O=C(NC1CCCC1OCc1ccccc1)c1cccnc1Oc1ccc(Nc2ccccn2)cc1
347 | Cc1nc(-c2ccc(N(Cc3ccccc3)C(=O)c3ccc(O)cc3O)cc2)c2ccccc2n1
348 | CN(C)CCCCCCNC(=O)c1ccccc1Nc1ccc2c(C=Cc3ccccn3)n[nH]c2c1
349 | COC(=O)C(Oc1cc(Cl)cc(Cl)c1)c1ccc(Oc2ccc(Cl)cc2)cc1
350 | C[N+](C)(C)CCOP(=O)(O)O
351 | O=C(CSc1nnc(COc2ccccc2)o1)N1CCCC1
352 | COC(=O)c1cc(NCc2cccc(Br)c2)ccc1N1CCOCC1
353 | CN(C)CCCOc1ccc(-c2cn3c(-c4ccc5ncccc5c4)cnc3cn2)cn1
354 | COc1cc(Cl)ccc1C1NC(=O)Nc2cc(OC)c(OC)c(OC)c21
355 | O=[N+]([O-])c1ccc(S(=O)(=O)N(CC(O)CN2CCCCC2)c2ccccc2)cc1
356 | O=C(CC1CCC2(CC1)OCCO2)NC1CCC(CCN2CCC(c3noc4cc(Cl)ccc34)CC2)CC1
357 | CC1=NC2(CCC3CN(S(=O)(=O)CC(C)C)CC32)C(=O)N1C
358 | Cn1c(=O)c2c(nc(N3CCCC(N)C3)n2Cc2cc(F)ccc2Cl)c2cc(C(=O)O)ccc21
359 | CCOc1ccccc1N1CCN(CC(=O)C(O)(c2ccccc2)C2CCC2)CC1
360 | CC1(CCNCc2ccc3c(c2)OCCO3)OCCc2sccc21
361 | CC(=NNC(=O)Cc1c(C)n[nH]c1O)c1ccc(Cl)c(Cl)c1
362 | CC(CS)C(=O)N(CC(=O)O)c1ccc(Oc2ccccc2)cc1
363 | O=c1[nH]c(-c2ccc(-c3nnn[nH]3)cc2)c(-c2ccc(-c3nnn[nH]3)cc2)cc1O
364 | Clc1ccc(-c2cc3nc(N4CCCC4)c4ccccc4c3nn2)cc1
365 | CC(C)N1CCc2c(sc(N)c2C#N)C1
366 | CC(=O)C1=C(O)C(=O)N(c2cc(C(C)(C)C)on2)C1c1ccc(C(C)C)cc1
367 | COc1cc(C=NNC(=O)C(OC)c2ccc3c(c2)OCCO3)ccc1F
368 | COc1ccccc1C(=O)c1c(N)nc2ccc(C(=O)c3c(F)cccc3F)cn12
369 | CC(C(=O)NCCNc1c2c(nc3ccccc13)CCCC2)c1ccc(-c2ccc(OCCCCCCO[N+](=O)[O-])cc2)c(F)c1
370 | COc1ccc(OCCOC(=O)C2CCCCN2C(=O)C(=O)C2(O)CCCCC2C)cc1OC
371 | O=C(Cn1nnc(-c2ccc(Cl)cc2)n1)NCc1ccco1
372 | COC(=O)C(c1ccc(Cl)cc1)C1CCCN1
373 | COc1cc(NS(=O)(=O)c2ccccc2)ccc1-c1cncc2ccccc12
374 | CCOC(=O)C1CCCN(c2ncnc3c2oc2ccccc23)C1
375 | CC(C)(C)NS(=O)(=O)c1ccccc1-c1ccc(-c2cnc(N)cn2)c(F)c1F
376 | CCOC(=O)N1CCN(C(=O)C(CCC(=O)O)NC(=O)c2cc(-c3cn[nH]c3)nc(-c3ccccc3)n2)CC1
377 | NC(=O)N1CC(Oc2cccc(C(F)(F)F)c2)C1
378 | CNC(=O)c1ccc(CNCc2ccc(SC)c(OC)c2)cc1
379 | CC(C)(C)Sc1c(CC(C)(C)C(=O)O)n(Cc2ccc(-c3cncnc3)cc2)c2ccc(OCc3ccccn3)cc12
380 | NCCC(=O)N1CCOc2ccccc2C1
381 | CCN(CC)CCCNCc1cc2c3ccccc3n(C)c2c(-c2cc(OC)c(OC)c(OC)c2)n1
382 | CC(=O)NCC1(c2cn3c4c(cccc24)CCC3)CCCCC1
383 | CC(CO)n1c(=O)n(C)c2cnc3ccc(-c4c[nH]c5nncc-5c4)nc3c21
384 | Cc1nn(-c2cccc(CN)c2)c2c(F)c(-c3ccc(N4CCCCC4=O)cc3)ccc12
385 | CCc1ccccc1N1CCN(C(=O)c2cc(OC)c(OC)c(OC)c2)CC1
386 | NC(=S)NN=C(c1ccccc1)c1ccccc1F
387 | COC(C)(C)C1CCCN1C1=C(C)C(=O)OC1
388 | COc1ccc2ccc(C(=O)Nc3ccccc3)c(OC(C)C)c2c1
389 | CCCCNC(=O)CCCCCCCCCCOCC1Cc2ccccc2CN1C(=O)c1ccc(OC)cc1
390 | CC(=O)NC1C(OCc2ccccc2)OC(CO)C(O)C1OC(C)C(=O)NC(C(=O)NC(CCC(=O)O)C(N)=O)C(C)C
391 | O=C(NCc1ccn[nH]1)c1[nH]c2ccc(Br)cc2c1S(=O)(=O)N1CCCC1
392 | COc1ncc(-c2cc3c(n2C(C)C)C(c2ccc(Cl)cc2)N(c2ccc(F)c(Cl)c2)C3=O)c(OC)n1
393 | Cc1c(Cl)cc(OCC(F)(F)F)c2nc(CCc3nc(N4CCCC4)nn3C)nn12
394 | CC(C)(C)NCc1c(C(=O)O)nn(-c2ccccc2)c1-c1ccc([N+](=O)[O-])cc1
395 | Cc1cc(C)c(Cn2c(C3CC(=O)N(c4ccc(F)cc4)C3)nc3ccccc32)c(C)c1
396 | Cc1ccc(C(=O)NCC(=O)NCCCc2ccccc2)cc1
397 | COC(=O)C(Cc1c[nH]c2ccccc12)NC(=O)c1ccc(NC(=O)C(N)CC(=O)O)c(OCCc2c[nH]c3ccccc23)c1
398 | CCN(CC)CCNC(=O)c1ccccc1NC(=O)C(=O)Nc1ccccc1C(=O)NCCN(CC)CC
399 | CC1(C)C(=O)C=CC2(C)C1CC(=O)C1(C)C2CCC2(C)C(c3ccoc3)C(=O)C3OC321
400 | CC(C)NC(=O)c1ccc(CC2CCN(C3CCN(C(=O)c4ccccc4Cl)CC3)CC2)cc1
401 | Cc1cc(C)cc(NC(=O)Nc2ccc3nc(N4CCN(C)CC4)cc(C)c3c2)c1
402 | CCNC(=O)Nc1nc2ccc(-c3cccc(C)c3)cc2[nH]1
403 | Cc1ccc2oc(C(=O)NCc3ccccc3)cc(=O)c2c1
404 | CC(CC(=O)NC1CCCC1)=NNC(=O)Cc1csc(N)n1
405 | C=CCN(Cc1ccc(-c2ccccc2-c2nn[nH]n2)cc1)c1nc(C)ncc1C(=O)O
406 | C=C1C(=O)NC(C)C1c1ccc(Br)cc1
407 | C=CCn1c2ccccc2c2c3c(ccc21)CN(Cc1ccc(C)cc1)CO3
408 | OCCN1CCN(C(=S)Nc2ccccc2)CC1
409 | CCCc1nnc2n1N=C(c1ccc(-c3cc(Cl)ccc3Cl)o1)CS2
410 | COc1ccc2c(c1)OC1(C)CC2C(C(=O)c2ccccc2OC)C(=O)N1
411 | COc1ccc(C=Cc2ncc([N+](=O)[O-])n2CCOC(=O)c2cccc3c2OCCO3)cc1
412 | Brc1cnc(NCc2cccnc2)nc1Nc1cc(C2CC2)[nH]n1
413 | CC(=O)OCC1OC(CC(=O)C=Cc2ccc(O)cc2)C(OC(C)=O)C(OC(C)=O)C1OC(C)=O
414 | Cc1ccc2nc(O)c(N3CCOCC3)c(-c3ccccc3)c2c1
415 | CCOC(=O)C(C(=O)OCC)=C1NC2(c3ccccc3Nc3ccccc32)C(C#N)S1
416 | CCN(CC(=O)NC)C(=O)c1cc(-c2cccc(Cl)c2)no1
417 | Cc1cc(C)c2c(c1)n(CC(=O)Nc1ccc3c(c1)CC1(C3)C(=O)NC(=O)N1C)c(=O)n2CC(=O)N(C)C
418 | CCOc1ccccc1C(=O)Nc1ccccc1C(=O)Nc1ccccn1
419 | CN(C)c1cc2c(cc1NC(=O)c1ccccc1Cl)n(C)c(=O)n2C
420 | NC(=S)NN=C1C(=O)Nc2ccccc21
421 | O=C(O)Cn1c(=O)c(=O)[nH]c2cc([N+](=O)[O-])c(-n3ccc(CNCCCc4ccccc4)c3)cc21
422 | COc1cc(OC)c(CN2CCCN(C)CC2)cc1Br
423 | NC(N)=NCCc1cccc2c(-c3ccc(C(F)(F)F)cc3)cccc12
424 | CC(C)(C)NC(=O)C(c1cccnc1)N(C(=O)c1ccco1)c1ccc(NS(=O)(=O)c2ccccc2)cc1
425 | CC(=NN)C(C)=NN=C(C)C(C)=NN
426 | CCCCCC(=O)N1CC(n2cc(C3CC3)nn2)C(O)CC1c1ccccc1
427 | COC(=O)N(NC(=O)c1c(CN2CCN(S(=O)(=O)c3ccccc3)CC2)c(-c2ccccc2)nc2ccccc12)c1ccccc1
428 | C=C1CC(C)CC(=O)O1
429 | Cn1ncc(NC(=O)c2nc(-c3c(F)cccc3F)sc2N)c1N1CCNCCC1=O
430 | COc1ccc(-n2cnc3cc(C(=O)N4CCC5(CC4)OCCO5)ccc32)cc1
431 | O=C(O)c1cccc(O)n1
432 | Cc1nc(C)c(-c2ccc3cc(-c4c(C5CCCCC5)c5ccc6cc5n4CC(=O)NCCC=CCS(=O)(=O)NC6=O)ccc3n2)s1
433 | CC1=C(C(=O)Nc2ccc(C)cn2)C(c2cnn(C)c2)C2=C(O)CCCC2=N1
434 | CC1CCN(C(=O)Cn2cc(SCc3ccccc3)c3ccccc32)CC1
435 | CCn1c(NC(=O)C(C)C)c(-c2ccccc2)c(=O)c2ccccc21
436 | CN(c1ccc(C(=O)COC(=O)C2CC2)cc1)S(C)(=O)=O
437 | CCOC(=O)C1CCN(C(=O)CNC(=O)Nc2ccccc2Cl)CC1
438 | CC(OC(=O)Cc1coc2ccc3ccccc3c12)C(=O)Nc1ncc(Cl)cc1Cl
439 | Clc1cc(Cl)c2c(c1)oc1c(Cl)c(Cl)c(Cl)cc12
440 | O=C(Nc1cnccn1)Nc1ccnc2ccc(C(F)(F)F)cc12
441 | CC(CS(=O)(=O)c1ccc(Oc2ccccc2)cc1)(NCc1ccc2ccccc2c1)C(=O)NO
442 | CCc1cc(S(=O)(=O)Nc2cscn2)c(F)cc1Oc1ccc(F)cc1-c1ccnn1C
443 | Clc1ccc2nc3ccccc3c(NCc3nc4ccccc4[nH]3)c2c1
444 | O=C(CCl)Nc1ccc(Cl)cc1NS(=O)(=O)c1cccc(F)c1
445 | O=C(c1ccc(F)cc1)n1nnc2ccccc21
446 | C=COc1c(OC)c(OC)cc2c1-c1ccc(OC)c(=O)cc1C(NC(C)=O)CC2
447 | O=S(=O)(O)c1ccc2c(S(=O)(=O)O)cc(S(=O)(=O)O)cc2c1
448 | COc1ccc(N(C=Nc2ccn(C3CSC(CO)O3)c(=O)n2)c2ccc(OC)cc2)cc1
449 | CSc1nc(-c2ccc(P(=O)(O)O)o2)c(CC(C)C)s1
450 | OCC(O)CN1C(CCc2ccc3c(c2)OCO3)CCCC1CCc1ccc2c(c1)OCO2
451 | COC(=O)C(Cc1ccccc1)NC(=O)C1CC(=O)N1C(Cc1ccccc1)C(=O)NCC(C)C
452 | CCOP(=O)(OCC)OC(=NN=C1C(=O)Nc2ccccc21)c1ccccc1P(=O)(OCC)OCC
453 | Cc1n[nH]c2nccc(-c3ccc(NC(=O)NCc4ccccn4)cc3)c12
454 | CCCCCCCCCCCCCCCCP(=O)(OC)OC
455 | Cc1ncn2c1Cn1cc(CO)nc1-c1cc(Br)ccc1-2
456 | O=c1[nH]c(N2CCOCC2)nc(NC2CCCNC2)c1-c1nc(-c2cccc(C(F)(F)F)c2)cs1
457 | CC1(C)OCC(=O)Nc2ccc(-c3cccc(F)c3)cc21
458 | CC(C)=CCc1cc2c(=O)c(-c3ccc(O)cc3O)coc2c(CC=C(C)C)c1O
459 | CCNCC(=O)Nc1c(Cc2nccc3ccccc23)ccc(OC)c1OC
460 | O=C(CCCN1CCC2(c3ccccc3)CC1Cc1ccccc12)c1ccc(F)cc1
461 | CCCCc1nc(C)c(CC(=O)OC2CCCCC2)c(=O)n1Cc1ccc(-c2ccccc2-c2noc(=O)[nH]2)cc1
462 | C=C1C(O)CC(=CC=C2CCCC3(C)C2CCC3C(C)C(=C)CCCC(C)(C)O)CC1O
463 | Cn1c(=O)c(Oc2ccc(F)cc2F)cc2cnc(N3CCCC(CO)C3)nc21
464 | O=C1c2ccccc2C(=O)N1CCn1cncn1
465 | C=CCC12CC(C(=O)OC)N3C1N(C(=N)C3(Cc1ccccc1)N1CCCC1)c1ccc(Br)cc12
466 | CC(C)N1CCc2ncn(C)c2C1C(=O)NCCN1CCCC1
467 | O=C(COC(=O)c1cccc2c(=O)c3ccccc3[nH]c12)Nc1ccc(C(=O)O)cc1
468 | COc1cccc(N2CCN(Cc3coc(-c4cccc5ccccc45)n3)CC2)c1
469 | COC(=O)c1cncc(N)n1
470 | CC1(C(=O)N2CCN(c3ccc([N+](=O)[O-])cc3)CC2)CC1(Cl)Cl
471 | O=C(O)COc1ccc(-c2nocc3c(C(=O)c4ccccc4F)ccc2-3)cc1
472 | COc1cc(NC(=S)NC(=O)c2ccccc2F)ccc1NC(=O)c1cccs1
473 | COc1ccc(C(=O)N2CCN(Cc3ccc(F)cc3)CC2)cc1
474 | C=C1C(C(C)C2CC=C(C)C(=O)O2)CCC2(C)C1CC1(O)C=C3C=CC(=O)OC(C)(C)C3C(OC(C)=O)CC12
475 | Cn1cc(NS(=O)(=O)CCOc2ccc(F)cc2)cn1
476 | COc1cc(=O)n(-c2ccc(Oc3ccnc(N)c3C#CCN(C)C)c(F)c2)cc1C(=O)NCc1ccc(F)cc1
477 | COC(=O)C(C)C1CCC(C)(CCC2=C(C)C(=O)CCC2(C)C)OO1
478 | CCCCNC(=O)N(O)C1N(N=Cc2ccccc2F)C(=S)SC1(C)C
479 | N#Cc1cccc(C=Cc2c(Cl)nc(N)nc2NC2CC(CO)C(O)C2O)c1
480 | COc1cc(-c2nc(C)nc3[nH]cc(F)c23)c(Cl)cc1Cl
481 | O=C(Nc1ccc2c(c1)OCO2)C1Cc2c(-c3ccc(Cl)cc3)ccnc2O1
482 | Cc1ccc(OCC(=O)Nc2nnc(S(=O)(=O)N3CCc4ccccc43)s2)cc1
483 | O=C1OC2(CCCc3ccccc32)CC(O)=C1Sc1ccccc1Cl
484 | Nc1nc(SC2CCCC2)c2ncn(C=C3CC3(CO)CO)c2n1
485 | Cc1cccn2c(=O)c(C(=O)N3CCN(c4ccc(Cl)cc4)CC3)cnc12
486 | COC(=O)Cc1ccc(C2C(CCCc3ccccc3)C(=O)N2c2ccc(F)cc2)cc1
487 | Cc1ccc(C(=O)N2CCN(c3ccccn3)CC2)cc1C
488 | NC(=O)N1CCC(Nc2ncnc3ccc(-c4cncs4)cc23)CC1
489 | COc1cc(C(=O)c2c[nH]c3cccc(OC)c23)c(N)c(OC)c1OC
490 | C=CCc1cc(C=C2CN(S(=O)(=O)CCC)CC(=Cc3ccc(O)c(CC=C)c3)C2=O)ccc1O
491 | Cc1cn(C2CC(n3nncc3-c3ccccc3)C(CO[Si](C)(C)C(C)(C)C)O2)c(=O)[nH]c1=O
492 | c1cc(-c2cc3cc(C4=NCCN4)ccc3[nH]2)ccc1C1=NCCN1
493 | Oc1cc2c(cc1O)-c1[nH]c3ccccc3c1C2
494 | Cc1csc(NN=C(C=Cc2ccc(F)c(C)c2)c2nc3ccccc3[nH]2)n1
495 | O=C(O)c1cn(Cc2ccc(-c3cncnc3)cc2)c2c(F)cccc2c1=O
496 | O=C(O)C1=C(C(=O)Nc2sc(C3CC3)cc2-c2nc(C3CC3)no2)CCC1
497 | CC(C)n1cnnc1C1CCCN(C(=O)COCc2ccccc2)C1
498 | COC(=O)c1ccc(CNC(=O)c2ccc(Cl)s2)c(NC(=O)c2nc3c(s2)CN(C)CC3)c1
499 | Cc1cc(C)c2c(c1)c1nnc(SCCN3CCCCC3)nc1n2C
500 | CCC(NC(=O)c1c(Br)c(-c2ccccc2)nc2ccccc12)c1ccccc1
501 | 


--------------------------------------------------------------------------------