├── .gitignore ├── LICENSE ├── README.md ├── assets ├── esol.png ├── gradio_demo.png └── overview.jpg ├── conda.yml ├── configs ├── qed_eval.json └── rt_small.json ├── dev_requirements.txt ├── examples ├── example.smi └── qed_property_example.txt ├── pyproject.toml ├── requirements.txt ├── scripts ├── create_vocabulary.py ├── eval_language_modeling.py ├── eval_lm_nlp.py ├── eval_regressionhead.py ├── generate_example_data.py ├── run_language_modeling.py ├── run_lm_nlp.py └── run_regressionhead.py ├── setup.cfg ├── setup.py ├── terminator ├── __init__.py ├── args.py ├── collator_utils.py ├── collators.py ├── datasets.py ├── evaluator.py ├── factories.py ├── functional_groups.py ├── nlp.py ├── numerical_encodings.py ├── property_predictors.py ├── py.typed ├── search.py ├── tokenization.py ├── trainer.py ├── trainer_utils.py └── utils.py ├── training_configs ├── qed_alternated_cc.json ├── qed_proponly.json └── reactions_alternating_cc.json └── vocabs ├── proteins.txt ├── reactions.txt └── smallmolecules.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Mac 10 | *.DS_Store 11 | 12 | # Model files 13 | data/ 14 | models/ 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | pip-wheel-metadata/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | docs/api/* 81 | 82 | # PyBuilder 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | *.ipynb 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 104 | __pypackages__/ 105 | 106 | # Celery stuff 107 | celerybeat-schedule 108 | celerybeat.pid 109 | 110 | # SageMath parsed files 111 | *.sage.py 112 | 113 | # Environments 114 | .env 115 | .venv 116 | env/ 117 | venv/ 118 | ENV/ 119 | env.bak/ 120 | venv.bak/ 121 | 122 | # Spyder project settings 123 | .spyderproject 124 | .spyproject 125 | 126 | # Rope project settings 127 | .ropeproject 128 | 129 | # mkdocs documentation 130 | /site 131 | 132 | # mypy 133 | .mypy_cache/ 134 | .dmypy.json 135 | dmypy.json 136 | 137 | # Pyre type checker 138 | .pyre/ 139 | 140 | # Visual Studio Code settings 141 | .vscode/ 142 | 143 | # PyCharm settings 144 | .idea/ 145 | 146 | runs 147 | examples/models 148 | bash 149 | sandbox -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 International Business Machines 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Regression Transformer 2 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 3 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 4 | [![Gradio demo](https://img.shields.io/website-up-down-green-red/https/hf.space/gradioiframe/GT4SD/regression_transformer/+.svg?label=demo%20status)](https://huggingface.co/spaces/GT4SD/regression_transformer) 5 | [![DOI](https://zenodo.org/badge/449377638.svg)](https://zenodo.org/badge/latestdoi/449377638) 6 | 7 | A multitask Transformer that reformulates regression as a conditional sequence modeling task. 8 | This yields a dichotomous language model that seamlessly integrates regression with property-driven conditional generation. 9 | 10 | ![Summary](assets/overview.jpg) 11 | 12 | This repo contains the development code. Read the paper in [*Nature Machine Intelligence*](https://www.nature.com/articles/s42256-023-00639-z). 
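For intuition: the RT serializes each training sample as a single line of text in which the property is prepended to the sequence, e.g. `<qed>0.3936|CBr` for the QED task (see the tokenizer example further below). Because the numeric value is spelled out with dedicated digit and decimal-place tokens, masking the number turns the task into regression, while masking sequence tokens given the number yields property-driven conditional generation.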
13 | 14 | ## Demo with UI 15 | 🤗 A gradio demo with a simple UI is available on [HuggingFace spaces](https://huggingface.co/spaces/GT4SD/regression_transformer) 16 | ![Summary](assets/gradio_demo.png) 17 | 18 | 19 | ## Building upon this research 20 | 21 | #### Do you want to use a pretrained RT model or finetune it on your own data? Then read on; otherwise, the development setup can be found [below](#development-setup). 22 | 23 | The Regression Transformer is implemented in the [GT4SD](https://github.com/GT4SD/gt4sd-core) library. 24 | Via GT4SD, using several pretrained RegressionTransformers is a matter of a few lines of code :rocket:. 25 | A complete tutorial on running inference, finetuning an RT model (or training one from scratch), and sharing and deploying it to the GT4SD model hub can be found [here](https://github.com/GT4SD/gt4sd-core/tree/main/examples/regression_transformer). 26 | 27 | For example, via GT4SD you can use the RT pretrained on small molecules with some of the properties shown in the paper, in particular [QED](https://www.nature.com/articles/nchem.1243) and [ESOL](https://pubs.acs.org/doi/10.1021/ci034243x) (water solubility). There are also several multi-property variants of the RT, e.g., a model trained jointly on logP and synthesizability (aka [SCScore](https://pubs.acs.org/doi/10.1021/acs.jcim.7b00622)). 28 | For protein language modeling, you will also find an RT trained on a [peptide stability](https://www.science.org/doi/full/10.1126/science.aan0693) dataset from the [TAPE](https://github.com/songlab-cal/tape) benchmark. 29 | In sum, GT4SD provides RT models pretrained on: 30 | - **small molecules**: single (`qed`, `esol`, `crippen_logp`) or multiple (`logp_and_synthesizability`, `cosmo_acdl`, `pfas`) properties. All these models use SELFIES, apart from `crippen_logp`, which uses SMILES. 31 | - **proteins**: `stability` 32 | - **chemical reactions**: `uspto` (using reaction SMILES) 33 | - **polymers**: `rop_catalyst` and `block_copolymer` are both described in [Park et al. (2023; *Nature Communications*)](https://www.nature.com/articles/s41467-023-39396-3). The `rop_catalyst` model uses conventional SELFIES, whereas the `block_copolymer` model uses a novel polymer language called CMDL, also described in [Park et al. (2023; *Nature Communications*)](https://www.nature.com/articles/s41467-023-39396-3). 34 | 35 | A Jupyter notebook with a toy use case on adapting a molecule toward solubility is also provided in [GT4SD](https://github.com/GT4SD/gt4sd-core/blob/main/notebooks/regression-transformer-demo.ipynb). 36 | If you use [GT4SD](https://github.com/GT4SD/gt4sd-core), you can generate molecules like this: 37 | ```python 38 | from gt4sd.algorithms.conditional_generation.regression_transformer import ( 39 | RegressionTransformer, RegressionTransformerMolecules 40 | ) 41 | 42 | buturon = "CC(C#C)N(C)C(=O)NC1=CC=C(Cl)C=C1" 43 | target_esol = -3.53 44 | config = RegressionTransformerMolecules( 45 | algorithm_version="solubility", 46 | search="sample", 47 | temperature=2, 48 | tolerance=5, 49 | sampling_wrapper={ 50 | 'property_goal': {'<esol>': target_esol}, 51 | 'fraction_to_mask': 0.2 52 | } 53 | ) 54 | esol_generator = RegressionTransformer(configuration=config, target=buturon) 55 | generations = list(esol_generator.sample(8)) 56 | ``` 57 | 58 | Explore the solubility of the local chemical space around Buturon.
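For example, building directly on the snippet above, you could sweep the ESOL primer over a range of target values and collect the generations for each primer. The following is a minimal sketch that reuses the imports and the `buturon` variable from the snippet; the primer values and the masking fraction are purely illustrative:

```python
# Sweep the ESOL primer around Buturon and collect generations per target value.
# Reuses RegressionTransformer, RegressionTransformerMolecules and `buturon`
# from the snippet above; the primer values below are illustrative only.
generations_per_primer = {}
for primer in [-6.0, -5.0, -4.0, -3.0, -2.0]:
    config = RegressionTransformerMolecules(
        algorithm_version="solubility",
        search="sample",
        temperature=2,
        tolerance=5,
        sampling_wrapper={
            "property_goal": {"<esol>": primer},
            "fraction_to_mask": 0.2,
        },
    )
    generator = RegressionTransformer(configuration=config, target=buturon)
    generations_per_primer[primer] = list(generator.sample(8))
```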
Upon varying the property primers, you might obtain something like this: 59 | ![Esol](assets/esol.png) 60 | 61 | ## Development setup 62 | This is mainly intended to reproduce or extend the results of the paper. 63 | ```console 64 | conda env create -f conda.yml 65 | conda activate terminator 66 | pip install -e . 67 | ``` 68 | 69 | ### Data 70 | The processed data used to train the models is available via [Box](https://ibm.box.com/s/kijawq3rf4191bbcyflsxx7kp9m74jnx). 71 | 72 | ### Training a model 73 | You can download the data and launch a training run by pointing to the train and test data: 74 | ```console 75 | python scripts/run_language_modeling.py --output_dir rt_example \ 76 | --config_name configs/rt_small.json --tokenizer_name ./vocabs/smallmolecules.txt \ 77 | --do_train --do_eval --learning_rate 1e-4 --num_train_epochs 5 --save_total_limit 2 \ 78 | --save_steps 500 --per_gpu_train_batch_size 16 --evaluate_during_training --eval_steps 5 \ 79 | --eval_data_file ./examples/qed_property_example.txt --train_data_file ./examples/qed_property_example.txt \ 80 | --line_by_line --block_size 510 --seed 42 --logging_steps 100 --eval_accumulation_steps 2 \ 81 | --training_config_path training_configs/qed_alternated_cc.json 82 | ``` 83 | :warning: This configuration uses dummy data; do not use it as is :no_good: 84 | The `training_config_path` argument points to a file that specifies the training regime. This argument is optional; if it is not given, we default to vanilla PLM training that masks everywhere with equal probability (recommended for initial pretraining only). For refined examples, please see the `training_configs` folder. 85 | 86 | Also note that the `vocabs` folder contains the vocabulary files for training on small molecules, proteins and chemical reactions. 87 | 88 | Exemplary model configurations (number of heads, layers, etc.) can be found in the [configs](./configs) folder. 89 | 90 | :warning: **XLNet trains relatively slowly. It is recommended to start a training/finetuning run from a pretrained model, ideally with the GT4SD trainer** (see above) :warning: 91 | 92 | 93 | ### Evaluating a model 94 | To evaluate a model trained, e.g., on the QED task, run the following: 95 | ```console 96 | python scripts/eval_language_modeling.py --output_dir path_to_model \ 97 | --eval_file ./examples/qed_property_example.txt --eval_accumulation_steps 2 --param_path configs/qed_eval.json 98 | ``` 99 | 100 | ### Pretrained models 101 | Pretrained models are available via the GT4SD model hub. There is a total of 9 models, which can also be used via [HuggingFace Spaces](https://huggingface.co/spaces/jannisborn/regression_transformer). Models that are part of the publication are also available via the [Box folder mentioned above](https://ibm.box.com/s/kijawq3rf4191bbcyflsxx7kp9m74jnx). 102 | 103 | #### Generate some data 104 | To generate custom data for the QED task in an RT-compatible format, run [scripts/generate_example_data.py](./scripts/generate_example_data.py) and point it to a `.smi` file with SMILES in the first column. 105 | ```console 106 | python scripts/generate_example_data.py examples/example.smi examples/qed_property_example.txt 107 | ``` 108 | For user-defined properties, please adapt the file or open an issue. 109 | 110 | If you need to create a new vocabulary for a dataset, you can use [scripts/create_vocabulary.py](./scripts/create_vocabulary.py); it will also automatically add some special tokens at the top of your vocabulary file.
111 | ```console 112 | python scripts/create_vocabulary.py examples/qed_property_example.txt examples/vocab.txt 113 | ``` 114 | 115 | At this point, the folder containing the vocabulary file can be used to load a tokenizer compatible with any `ExpressionBertTokenizer`: 116 | ```python 117 | >>> from terminator.tokenization import ExpressionBertTokenizer 118 | >>> tokenizer = ExpressionBertTokenizer.from_pretrained('examples') 119 | >>> text = '<qed>0.3936|CBr' 120 | >>> tokens = tokenizer.tokenize(text) 121 | >>> print(tokens) 122 | ['<qed>', '_0_0_', '_._', '_3_-1_', '_9_-2_', '_3_-3_', '_6_-4_', '|', 'C', 'Br'] 123 | >>> token_indexes = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) 124 | >>> print(token_indexes) 125 | [16, 17, 18, 28, 45, 34, 35, 19, 15, 63] 126 | >>> tokenizer.build_inputs_with_special_tokens(token_indexes) 127 | [12, 16, 17, 18, 28, 45, 34, 35, 19, 15, 63, 13] 128 | ``` 129 | 130 | ## Citation 131 | If you use the regression transformer, please cite: 132 | ```bib 133 | @article{born2023regression, 134 | title={Regression Transformer enables concurrent sequence regression and generation for molecular language modelling}, 135 | author={Born, Jannis and Manica, Matteo}, 136 | journal={Nature Machine Intelligence}, 137 | volume={5}, 138 | number={4}, 139 | pages={432--444}, 140 | year={2023}, 141 | publisher={Nature Publishing Group UK London} 142 | } 143 | ``` 144 | -------------------------------------------------------------------------------- /assets/esol.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/regression-transformer/6820b45e4548ee2648557fcf120b3361414a8cd9/assets/esol.png -------------------------------------------------------------------------------- /assets/gradio_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/regression-transformer/6820b45e4548ee2648557fcf120b3361414a8cd9/assets/gradio_demo.png -------------------------------------------------------------------------------- /assets/overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/regression-transformer/6820b45e4548ee2648557fcf120b3361414a8cd9/assets/overview.jpg -------------------------------------------------------------------------------- /conda.yml: -------------------------------------------------------------------------------- 1 | name: terminator 2 | channels: 3 | - https://conda.anaconda.org/rdkit 4 | dependencies: 5 | - rdkit=2019.03.1 6 | - python=3.7 7 | - pip>=19.1,<20.3 8 | - pip: 9 | - transformers==v3.1.0 10 | - -r file:requirements.txt 11 | - -r file:dev_requirements.txt 12 | - -e .
13 | -------------------------------------------------------------------------------- /configs/qed_eval.json: -------------------------------------------------------------------------------- 1 | { 2 | "plm_probability": 0.4, 3 | "checkpoint-str": "rmse", 4 | "max_span_length": 7, 5 | "conditioning_range": [ 6 | [ 7 | 0.051, 8 | 0.151, 9 | 0.251, 10 | 0.351, 11 | 0.451, 12 | 0.551, 13 | 0.651, 14 | 0.751, 15 | 0.851, 16 | 0.951 17 | ] 18 | ], 19 | "line_by_line": true, 20 | "property_tokens": [ 21 | "" 22 | ], 23 | "property_tokens_to_mask": [ 24 | [ 25 | 5 26 | ] 27 | ], 28 | "property_token_masking_order": [ 29 | [ 30 | 2, 31 | 3, 32 | 4, 33 | 0, 34 | 1 35 | ] 36 | ], 37 | "beam_width": 3 38 | } -------------------------------------------------------------------------------- /configs/rt_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "XLNetLMHeadModel" 4 | ], 5 | "attn_type": "bi", 6 | "bi_data": false, 7 | "bos_token_id": 14, 8 | "clamp_len": -1, 9 | "d_head": 16, 10 | "d_inner": 1024, 11 | "d_model": 256, 12 | "dropout": 0.2, 13 | "end_n_top": 5, 14 | "eos_token_id": 14, 15 | "ff_activation": "gelu", 16 | "initializer_range": 0.02, 17 | "language": "selfies", 18 | "layer_norm_eps": 1e-12, 19 | "mem_len": null, 20 | "model_type": "xlnet", 21 | "n_head": 16, 22 | "n_layer": 32, 23 | "numerical_encodings_dim": 16, 24 | "numerical_encodings_format": "sum", 25 | "numerical_encodings_type": "float", 26 | "pad_token_id": 0, 27 | "reuse_len": null, 28 | "same_length": false, 29 | "start_n_top": 5, 30 | "summary_activation": "tanh", 31 | "summary_last_dropout": 0.1, 32 | "summary_type": "last", 33 | "summary_use_proj": true, 34 | "task_specific_params": { 35 | "text-generation": { 36 | "do_sample": true, 37 | "max_length": 250 38 | } 39 | }, 40 | "untie_r": true, 41 | "use_numerical_encodings": true, 42 | "vmax": 1.0, 43 | "vocab_size": 507 44 | } -------------------------------------------------------------------------------- /dev_requirements.txt: -------------------------------------------------------------------------------- 1 | flake8==3.8.4 2 | mypy==0.800 3 | pytest==6.1.1 4 | pytest-cov==2.10.1 5 | black==20.8b1 6 | sphinx==3.4.3 7 | sphinx-autodoc-typehints==1.11.1 8 | better-apidoc==0.3.1 9 | sphinx_rtd_theme==0.5.1 10 | myst-parser==0.13.3 11 | flask==1.1.2 12 | flask_login==0.5.0 -------------------------------------------------------------------------------- /examples/example.smi: -------------------------------------------------------------------------------- 1 | CCO CHEMBL545 2 | C CHEMBL17564 3 | CO CHEMBL14688 4 | NCCS CHEMBL602 5 | NCCN CHEMBL816 6 | CN CHEMBL43280 7 | C=O CHEMBL1255 8 | CCN CHEMBL14449 9 | CSC CHEMBL15580 10 | CBr CHEMBL48339 11 | CI CHEMBL115849 12 | CF CHEMBL116838 13 | CC CHEMBL135626 14 | CNC=O CHEMBL9240 15 | CCCN CHEMBL14409 16 | CCCO CHEMBL14687 17 | O=CC#C CHEMBL722 18 | C=CC=O CHEMBL721 19 | CC#N CHEMBL45211 20 | CCCl CHEMBL46058 21 | NC#N CHEMBL56279 22 | CC=O CHEMBL76101 23 | SC#N CHEMBL84336 24 | FCF CHEMBL115186 25 | C#C CHEMBL116336 26 | CCl CHEMBL117545 27 | C=C CHEMBL117822 28 | COC CHEMBL119178 29 | CNC CHEMBL120433 30 | CCNCC CHEMBL1189 31 | CCC CHEMBL135416 32 | N#N CHEMBL142438 33 | CNO CHEMBL144761 34 | CNN CHEMBL160520 35 | C#N CHEMBL183419 36 | CC(C)O CHEMBL582 37 | CNC=O CHEMBL9081 38 | CCCCON CHEMBL6960 39 | CCNC=O CHEMBL9421 40 | CC(O)=O CHEMBL539 41 | CCCCO CHEMBL14245 42 | CCCCN CHEMBL13968 43 | COCOC CHEMBL15537 44 | CCC#N CHEMBL15871 45 | CCCCC 
CHEMBL16102 46 | CCOCC CHEMBL16264 47 | NC(N)=N CHEMBL821 48 | ClCCl CHEMBL45967 49 | NCC=C CHEMBL57286 50 | NC(N)=O CHEMBL985 51 | NCCO CHEMBL104943 52 | OCCF CHEMBL115586 53 | CC=C CHEMBL117213 54 | OC=O CHEMBL116736 55 | CC#C CHEMBL116902 56 | CCCC CHEMBL134702 57 | CCBr CHEMBL156378 58 | CNNC CHEMBL162921 59 | CC=O CHEMBL170365 60 | OCCS CHEMBL254951 61 | NC=O CHEMBL266160 62 | ON=C CHEMBL324784 63 | OCCO CHEMBL457299 64 | CON CHEMBL1213633 65 | CCCCl CHEMBL15697 66 | CS(C)=O CHEMBL504 67 | ON=C CHEMBL185198 68 | Cn1ccnc1 CHEMBL543 69 | CCCCCO CHEMBL14568 70 | CCCCCC CHEMBL15939 71 | ClCCCl CHEMBL16370 72 | CCCC#C CHEMBL16262 73 | OCC(O)CO CHEMBL692 74 | CN1CCCC1 CHEMBL665 75 | CC(=O)NO CHEMBL734 76 | NCC(O)=O CHEMBL773 77 | CCCCCF CHEMBL42434 78 | CCOC=O CHEMBL44215 79 | CCCCCl CHEMBL47259 80 | NCCCCN CHEMBL46257 81 | NNC(N)=O CHEMBL903 82 | CCNCCN CHEMBL70445 83 | CNCCO CHEMBL104083 84 | N=C=N CHEMBL116583 85 | NCCCO CHEMBL115530 86 | C=C=C CHEMBL116960 87 | CCC=C CHEMBL117210 88 | CCSCC CHEMBL117181 89 | CC#CC CHEMBL119108 90 | NCCCN CHEMBL174324 91 | OCCCl CHEMBL191244 92 | OCC=C CHEMBL234926 93 | NC(=O)NO CHEMBL467 94 | CCC=O CHEMBL275626 95 | CSCCO CHEMBL277871 96 | COC=O CHEMBL295026 97 | ClCBr CHEMBL346918 98 | C1CCSC1 CHEMBL1379 99 | COCCO CHEMBL444144 100 | OCCCO CHEMBL379652 101 | OCCBr CHEMBL468583 102 | C1CN1 CHEMBL540990 103 | CCON CHEMBL1213044 104 | OC#N CHEMBL1161700 105 | NCCF CHEMBL1162280 106 | NCC=O CHEMBL296723 107 | CCNC CHEMBL1232589 108 | CCCS CHEMBL1236818 109 | CSSC CHEMBL1347061 110 | CCNN CHEMBL1359929 111 | SC#N CHEMBL1161685 112 | NCCCC(O)=O CHEMBL96 113 | c1cnoc1 CHEMBL13257 114 | Nc1ccccc1 CHEMBL538 115 | CCCCCCO CHEMBL14085 116 | CC(C)=O CHEMBL14253 117 | c1cscn1 CHEMBL15605 118 | CC(N)=O CHEMBL16081 119 | CCCCC=O CHEMBL18602 120 | C1CCNC1 CHEMBL22830 121 | CC(N)=S CHEMBL38737 122 | CCC(C)O CHEMBL45462 123 | CN(C)CCO CHEMBL1135 124 | CC(C)N CHEMBL117080 125 | CCOC=C CHEMBL116745 126 | CCOCCO CHEMBL119596 127 | ClCCBr CHEMBL160255 128 | CCCCBr CHEMBL160949 129 | COCC#C CHEMBL162694 130 | OCCCCO CHEMBL171623 131 | CNCCCN CHEMBL174165 132 | C1CNCCN1 CHEMBL1412 133 | OCNC=O CHEMBL268447 134 | NCCCON CHEMBL281021 135 | CCCC=C CHEMBL295337 136 | ClCCNCCCl CHEMBL913 137 | CC(F)F CHEMBL325493 138 | CSCCCO CHEMBL332887 139 | CCNNCC CHEMBL350303 140 | CN=C=S CHEMBL396000 141 | CN(C)C CHEMBL439723 142 | C=CC#N CHEMBL445612 143 | BrCCBr CHEMBL452370 144 | OCC(S)CS CHEMBL1597 145 | ClCC=C CHEMBL451126 146 | OCCCBr CHEMBL466545 147 | C1CC1 CHEMBL1796999 148 | ClCC=O CHEMBL506976 149 | C1CO1 CHEMBL1743219 150 | C=CC=C CHEMBL537970 151 | NCCCF CHEMBL1162286 152 | NCC#N CHEMBL1193997 153 | NCCCl CHEMBL1190279 154 | BrCBr CHEMBL1229889 155 | CCCBr CHEMBL1230095 156 | O=C=O CHEMBL1231871 157 | S=C=S CHEMBL1365180 158 | OCC#C CHEMBL1563026 159 | NCCBr CHEMBL1697693 160 | ClC=C CHEMBL2311071 161 | SCC=C CHEMBL3222024 162 | NNCCO CHEMBL3183346 163 | C1CS1 CHEMBL3184935 164 | COCCl CHEMBL3185256 165 | CCCCS CHEMBL3188256 166 | OCCCS CHEMBL3234722 167 | NCC#C CHEMBL3263480 168 | BrC#N CHEMBL3561885 169 | CC=CCO CHEMBL116709 170 | CC=CCO CHEMBL118459 171 | CC(N)CS CHEMBL37279 172 | N#CN1CCC1 CHEMBL8123 173 | Cc1ccccc1 CHEMBL9113 174 | NCc1ccccc1 CHEMBL522 175 | c1c[nH]cn1 CHEMBL540 176 | COC(C)=O CHEMBL14079 177 | CCC(N)CC CHEMBL14178 178 | CCC(O)=O CHEMBL14021 179 | c1cncnc1 CHEMBL15562 180 | c1ncncn1 CHEMBL15698 181 | c1cnccn1 CHEMBL15797 182 | CCC(C)=O CHEMBL15849 183 | C1CCCCC1 CHEMBL15980 184 | C1CCNCC1 CHEMBL15487 185 | c1ccnnc1 CHEMBL15719 186 | NCCNCCNCCN CHEMBL609 187 
| OCc1ccccc1 CHEMBL720 188 | CCCC(C)C CHEMBL30909 189 | CC1CCCC1 CHEMBL30940 190 | C=CCCC=C CHEMBL31747 191 | CC(C)CCN CHEMBL42003 192 | CC(Cl)Cl CHEMBL45079 193 | CCCCNC=O CHEMBL45466 194 | CCCC(C)O CHEMBL45065 195 | CCC(O)CC CHEMBL47100 196 | CCNCCNCC CHEMBL54723 197 | NCCSSCCN CHEMBL61350 198 | CNCCCCNC CHEMBL61621 199 | CN1CCNCC1 CHEMBL1011 200 | NCC(N)=O CHEMBL86954 201 | FC(F)Cl CHEMBL116155 202 | FC(F)=C CHEMBL116020 203 | CC(N)CO CHEMBL116663 204 | CSCCCCO CHEMBL117865 205 | OCCNCCO CHEMBL119604 206 | CSCCC#N CHEMBL119837 207 | NC(O)=O CHEMBL125278 208 | CCCCCCC CHEMBL134658 209 | CC(C)NO CHEMBL140282 210 | CCCCCCS CHEMBL153339 211 | CCCCCBr CHEMBL155850 212 | ClCCCBr CHEMBL156560 213 | ClCCCCl CHEMBL157427 214 | NCCC[P] CHEMBL160548 215 | CNCCCNC CHEMBL174776 216 | OCC=C=C CHEMBL230163 217 | CC=NC#N CHEMBL259526 218 | NC(N)=S CHEMBL260876 219 | CC(C)CO CHEMBL269630 220 | C1CCOC1 CHEMBL276521 221 | c1ccsc1 CHEMBL278958 222 | c1ccoc1 CHEMBL278980 223 | CC(O)CO CHEMBL286398 224 | CCCCCNC CHEMBL298481 225 | CCCNC=O CHEMBL297216 226 | NCCNCCN CHEMBL303429 227 | CC(O)CN CHEMBL326602 228 | NCCCCCO CHEMBL333552 229 | CSCCC=O CHEMBL333298 230 | CCCCCCl CHEMBL348039 231 | OCCSCCO CHEMBL444480 232 | OCCCCCO CHEMBL448289 233 | OC=CC=O CHEMBL446036 234 | OC1CCC1 CHEMBL449234 235 | CC(C)Br CHEMBL451810 236 | CCN(C)C CHEMBL609099 237 | CNCC#C CHEMBL1192210 238 | NOCC=C CHEMBL1213179 239 | NCCCCCN CHEMBL119296 240 | CNC(C)=O CHEMBL11544 241 | COCCOC CHEMBL1232411 242 | OCCCCl CHEMBL1230102 243 | BrCC=C CHEMBL1429506 244 | IC(I)I CHEMBL1451116 245 | CCCC=O CHEMBL1478334 246 | C1COC1 CHEMBL1538076 247 | CCON=O CHEMBL1551365 248 | O=CC=O CHEMBL1606435 249 | CN=C=O CHEMBL1608558 250 | CCCCNN CHEMBL1619936 251 | NCCC#N CHEMBL1618272 252 | CC(C)S CHEMBL1897156 253 | CSCSSC CHEMBL1988732 254 | OCCNCO CHEMBL2000638 255 | C1CNC1 CHEMBL2171713 256 | CC(C)C CHEMBL2106398 257 | BrCC#N CHEMBL2139477 258 | CN1CC1 CHEMBL2448851 259 | CC1CO1 CHEMBL2251584 260 | CC1CN1 CHEMBL3183775 261 | COCCCN CHEMBL3186458 262 | CCNCCO CHEMBL3188262 263 | ClCC#N CHEMBL3187297 264 | CNCC=C CHEMBL3558338 265 | ClC=CCl CHEMBL157026 266 | N=[N]#N CHEMBL186537 267 | FCC1CO1 CHEMBL501668 268 | O=S1CCCC1 CHEMBL1207 269 | CC1CO1 CHEMBL1901974 270 | NC1CONC1=O CHEMBL771 271 | COC(=O)C=C CHEMBL9019 272 | S=C1NCCN1 CHEMBL11860 273 | O=C1NCCN1 CHEMBL12034 274 | CC(C)(C)N CHEMBL13782 275 | CCOC(C)=O CHEMBL14152 276 | CCn1ccnc1 CHEMBL13911 277 | Oc1ccccc1 CHEMBL14060 278 | CCCC(O)=O CHEMBL14227 279 | OC(=O)CCl CHEMBL14090 280 | Cc1ccncc1 CHEMBL15544 281 | Cc1cccnc1 CHEMBL15722 282 | Cc1ccccn1 CHEMBL15732 283 | C1CCC=CC1 CHEMBL16396 284 | CC(C)(C)O CHEMBL16502 285 | CCCCCCC=O CHEMBL18104 286 | CCC(Cl)CC CHEMBL18780 287 | CC(C)CC=O CHEMBL18360 288 | O=CC1CCC1 CHEMBL18475 289 | O=C1CCCC1 CHEMBL18620 290 | CC1CCNCC1 CHEMBL21533 291 | CC1CCCCN1 CHEMBL21454 292 | Nc1ccccn1 CHEMBL21619 293 | OC1CCCNC1 CHEMBL22463 294 | CCCCCCCCO CHEMBL26215 295 | Nc1cccnc1 CHEMBL25541 296 | NCC(F)CON CHEMBL27061 297 | NCC(F)CNN CHEMBL26945 298 | CCCCN=C=O CHEMBL27104 299 | NCCCSCCCN CHEMBL29299 300 | NCC(O)CON CHEMBL26857 301 | NCCCNCCCN CHEMBL28743 302 | NCCCOCCCN CHEMBL29910 303 | O=C1CNCN1 CHEMBL30446 304 | NCCCCCCCN CHEMBL28242 305 | OC1CCCCC1 CHEMBL32010 306 | CCCC(C)CC CHEMBL31377 307 | CC(Cl)CCl CHEMBL44641 308 | CCCC(C)=O CHEMBL45345 309 | ClC(Cl)Cl CHEMBL44618 310 | CCCC(O)CC CHEMBL46678 311 | CCC(=O)CC CHEMBL45315 312 | CCCCC(C)O CHEMBL45425 313 | CCCCOCCCC CHEMBL48132 314 | O=CNC1CC1 CHEMBL49963 315 | O=C1CCCS1 CHEMBL56395 316 | NC(=O)CBr CHEMBL60628 
317 | OC(=O)CBr CHEMBL60851 318 | Cn1ccnc1F CHEMBL62383 319 | Cn1ccnc1N CHEMBL64053 320 | NC1=NCCO1 CHEMBL69446 321 | OCc1ccsc1 CHEMBL76469 322 | NN=C(N)NO CHEMBL80352 323 | Nc1ncccn1 CHEMBL88580 324 | NN1CCNCC1 CHEMBL89042 325 | Nc1ncncn1 CHEMBL89436 326 | O=C1CCCO1 CHEMBL95681 327 | FC1NCC=N1 CHEMBL98364 328 | CN1CC=NC1 CHEMBL99100 329 | Fc1ccccc1 CHEMBL16070 330 | OC(=O)CS CHEMBL116455 331 | CCN(CC)CCO CHEMBL1183 332 | C1OCC=C1 CHEMBL117135 333 | CN(C)N=O CHEMBL117311 334 | NC(CO)CO CHEMBL116834 335 | FC(Cl)Cl CHEMBL116813 336 | OCCN1CC1 CHEMBL118671 337 | OC1CCNC1 CHEMBL118705 338 | CCCCCCCC CHEMBL134886 339 | CCC(C)Br CHEMBL156276 340 | CCCCCCCl CHEMBL156095 341 | CC(C)CCl CHEMBL160966 342 | NC(=O)CF CHEMBL160811 343 | NCC(O)CN CHEMBL177097 344 | N#CC1CN1 CHEMBL177264 345 | CCC(I)CC CHEMBL177307 346 | CCC(F)CC CHEMBL177481 347 | NN=C(N)N CHEMBL225304 348 | BrCC=C=C CHEMBL226728 349 | OCC(O)=O CHEMBL252557 350 | NNC(N)=S CHEMBL256250 351 | CN(C)C=O CHEMBL268291 352 | c1ccncc1 CHEMBL266158 353 | CCCCCCCO CHEMBL273459 354 | c1ccccc1 CHEMBL277500 355 | C1COCCN1 CHEMBL276518 356 | NC(=O)CI CHEMBL276727 357 | CCCCCC=O CHEMBL280331 358 | CCCCOCCO CHEMBL284588 359 | OC1CCCC1 CHEMBL288998 360 | Nc1ncns1 CHEMBL295053 361 | CCC(C)CN CHEMBL294955 362 | NCCCCCCN CHEMBL303004 363 | NC(=N)NO CHEMBL309499 364 | NCC1CCCCC1 CHEMBL1049 365 | C1NCN=C1 CHEMBL317004 366 | CCC(N)CO CHEMBL327032 367 | OP(O)(O)=O CHEMBL1187 368 | O=C1CCN1 CHEMBL344042 369 | Nc1nccs1 CHEMBL344760 370 | ClCCCCBr CHEMBL350215 371 | CCC(C)Cl CHEMBL346529 372 | CC(C)CBr CHEMBL346532 373 | OCCCC(O)=O CHEMBL1342 374 | CSC(N)=N CHEMBL356703 375 | CCC(C)CC CHEMBL357767 376 | NCCCNCCO CHEMBL361813 377 | CC(C)CCO CHEMBL372396 378 | OC(=O)CI CHEMBL376280 379 | CCCN=C=O CHEMBL441027 380 | CC(=O)OO CHEMBL444965 381 | CCC(C)CO CHEMBL451923 382 | C1COCCO1 CHEMBL453716 383 | OCCCCCCO CHEMBL458616 384 | CCC(C)C CHEMBL1797287 385 | OC(=O)CF CHEMBL509273 386 | SC(S)=S CHEMBL1207991 387 | C1NN=CO1 CHEMBL541688 388 | OS(O)=O CHEMBL1161699 389 | NCC(F)F CHEMBL1162281 390 | OC(O)=O CHEMBL1161632 391 | NCCc1ccccc1 CHEMBL610 392 | OCCOCCO CHEMBL1235226 393 | C1CCCC1 CHEMBL1370850 394 | CC(N)CN CHEMBL1319459 395 | CCCCCCN CHEMBL1320720 396 | CCC1CO1 CHEMBL1378095 397 | ClC=CCl CHEMBL1385560 398 | ClCCC#N CHEMBL1451739 399 | CCCCC#N CHEMBL1503158 400 | OCC1CO1 CHEMBL1530150 401 | CCCCC=C CHEMBL1548726 402 | CCCCNCC CHEMBL1598939 403 | CC(N)Cl CHEMBL1697721 404 | FCCCCCF CHEMBL1697728 405 | CCCC=NO CHEMBL1729186 406 | C1CSCN1 CHEMBL1916078 407 | c1cnon1 CHEMBL2171711 408 | c1cocn1 CHEMBL2171710 409 | c1cnsc1 CHEMBL2171712 410 | C=COC=C CHEMBL2105883 411 | N#CNC#N CHEMBL2365294 412 | C[N+]#N CHEMBL2419248 413 | OCCCCCl CHEMBL2260957 414 | NCCCC=O CHEMBL2261442 415 | CCN=C=S CHEMBL2251727 416 | CC(N)=N CHEMBL2227684 417 | CCCOC=O CHEMBL2270393 418 | SCCSCCS CHEMBL3182274 419 | NCCOCCO CHEMBL3183757 420 | NCCOCCN CHEMBL3183428 421 | CCC(C)S CHEMBL3183438 422 | CCSCCCl CHEMBL3183525 423 | N#CCC#N CHEMBL3187514 424 | OCC#CCO CHEMBL3187551 425 | C1COCO1 CHEMBL3187281 426 | ClCOCCl CHEMBL3185875 427 | CCCNCCC CHEMBL3185961 428 | NCCNCCO CHEMBL3186403 429 | CCC(C)N CHEMBL3186956 430 | CCCCCCI CHEMBL3188734 431 | CCCOCCC CHEMBL3187166 432 | CCCOCCO CHEMBL3189002 433 | COCCC#N CHEMBL3560782 434 | CC=CC=O CHEMBL3561468 435 | CCCCOCC CHEMBL3561108 436 | ClCC=CCl CHEMBL155926 437 | NC1CONC1=O CHEMBL8151 438 | CCC1OC1C CHEMBL177905 439 | Cc1ccccc1N CHEMBL1381 440 | CCC=CC=O CHEMBL256368 441 | CC=CC=O CHEMBL1086445 442 | CCCC=NO CHEMBL1869638 443 | ClC=CCl 
CHEMBL1441128 444 | CCC=CCO CHEMBL2269088 445 | OCC=CCO CHEMBL3188586 446 | NCC(O)CON CHEMBL26435 447 | CC(O)CBr CHEMBL446288 448 | ClCC1CO1 CHEMBL448626 449 | BrCC1CO1 CHEMBL504705 450 | CC(N)CO CHEMBL1229871 451 | NC(N)=O CHEMBL2096635 452 | NC(N)=O CHEMBL2096648 453 | CCN(=O)=O CHEMBL15625 454 | CN(=O)=O CHEMBL276924 455 | CCS(C)=O CHEMBL278882 456 | O=S1CCC1 CHEMBL368734 457 | CSS(C)=O CHEMBL403038 458 | OCC(O)CI CHEMBL467595 459 | O=C1NC=CC=C1 CHEMBL662 460 | CN1CCCNC1=S CHEMBL6954 461 | CN(CCCl)CCCl CHEMBL427 462 | OCN1CCC1=O CHEMBL10689 463 | NCCc1c[nH]cn1 CHEMBL90 464 | S=C1NCCCN1 CHEMBL11938 465 | O=C1CCCCN1 CHEMBL12193 466 | O=C1NCCCN1 CHEMBL12593 467 | CN1CCCC1=O CHEMBL12543 468 | Oc1ccc(O)cc1 CHEMBL537 469 | NCC(Cl)CON CHEMBL15650 470 | c1nc[nH]n1 CHEMBL15571 471 | c1cn[nH]c1 CHEMBL15967 472 | c1cc[nH]c1 CHEMBL16225 473 | Clc1ccccc1 CHEMBL16200 474 | CCCC(=O)OC CHEMBL15859 475 | Brc1ccccc1 CHEMBL16068 476 | CCCCCCCC=O CHEMBL18407 477 | CCCCC(O)CN CHEMBL18843 478 | O=C1CCCCC1 CHEMBL18850 479 | NCCCCNCCCN CHEMBL19612 480 | CCC1CCCCN1 CHEMBL22270 481 | CCc1ccncc1 CHEMBL22977 482 | CCc1cccnc1 CHEMBL23025 483 | CCN1CCCCC1 CHEMBL25053 484 | CCCCCCCCCO CHEMBL24563 485 | NCCc1nccs1 CHEMBL25414 486 | CNCC(O)CON CHEMBL26797 487 | NCCCCSCCCN CHEMBL28866 488 | NCCCCCCCCN CHEMBL29392 489 | NCCCS(O)=O CHEMBL32102 490 | NCc1ccccn1 CHEMBL32189 491 | ON1CCCC1=O CHEMBL31629 492 | CCC(C)(C)O CHEMBL44658 493 | CCOC(=O)CC CHEMBL44115 494 | CCCOC(C)=O CHEMBL44857 495 | OCCNCCNCCO CHEMBL47248 496 | CCCCCN(C)C CHEMBL47794 497 | O=CNC1CCC1 CHEMBL49774 498 | O=C1CCC=C1 CHEMBL52190 499 | CCN(CC)CCN CHEMBL52701 500 | CC(=O)NC#N CHEMBL56672 501 | CN(C)CCCCN CHEMBL59625 502 | NCCCCNCC=C CHEMBL61421 503 | O=Cc1ccsc1 CHEMBL72211 504 | CSc1cnccn1 CHEMBL94743 505 | ClC1NCC=N1 CHEMBL98381 506 | CCc1cnccn1 CHEMBL97525 507 | COc1cnccn1 CHEMBL97794 508 | CSC1NCC=N1 CHEMBL99226 509 | CCC1NCC=N1 CHEMBL98596 510 | NC1NCC=N1 CHEMBL102029 511 | OCC(Cl)Cl CHEMBL113957 512 | Ic1ccccc1 CHEMBL116296 513 | Sc1ccccc1 CHEMBL119405 514 | NNC(=O)NO CHEMBL134941 515 | Cc1cn[nH]c1 CHEMBL1308 516 | ClC(Cl)=C CHEMBL156455 517 | CCC(C)CBr CHEMBL156329 518 | CC(C)=CBr CHEMBL157418 519 | ClC(Br)Br CHEMBL157093 520 | CC(=C)CCl CHEMBL157368 521 | CC(C)CCBr CHEMBL158800 522 | CC(Br)CCl CHEMBL160835 523 | NC1=NCCC1 CHEMBL161318 524 | CC(C)=CCl CHEMBL160508 525 | CON(C)N=O CHEMBL163675 526 | CCN(C)N=O CHEMBL164852 527 | NC(=O)CCO CHEMBL170341 528 | OC(=C)C=O CHEMBL170721 529 | C=CCSCC=C CHEMBL170458 530 | COCC(N)=O CHEMBL170742 531 | OCC(O)C=O CHEMBL173813 532 | OCCCCSC#N CHEMBL176782 533 | NC1CCNCC1 CHEMBL174570 534 | N#CCCSC#N CHEMBL177036 535 | NCC(=O)CN CHEMBL175201 536 | NCC(=O)NO CHEMBL216796 537 | C=CCN=C=S CHEMBL233248 538 | NCc1cccs1 CHEMBL237711 539 | Oc1cccnc1 CHEMBL237847 540 | NCC(=O)NN CHEMBL241347 541 | NCc1ccsc1 CHEMBL252602 542 | ON1CCCCC1 CHEMBL277887 543 | O=C1CCCN1 CHEMBL276849 544 | CC1CCCNC1 CHEMBL279512 545 | OCP(CO)CO CHEMBL279546 546 | CN1CCCCC1 CHEMBL281417 547 | CCN(CC)CC CHEMBL284057 548 | OC1CCNCC1 CHEMBL284022 549 | Nc1ccncc1 CHEMBL284348 550 | CNCC(O)CO CHEMBL286961 551 | CC(C)NC=O CHEMBL296027 552 | NCCC(O)=O CHEMBL297569 553 | Nc1ccncn1 CHEMBL302453 554 | CNCC(O)=O CHEMBL304383 555 | OCc1ccco1 CHEMBL308187 556 | CCSC(N)=N CHEMBL321691 557 | O=C1CNCS1 CHEMBL338595 558 | CCCCCCCCC CHEMBL335900 559 | CONC(N)=O CHEMBL339711 560 | NC(=O)C=C CHEMBL348107 561 | NOCC(O)=O CHEMBL347862 562 | O=NN1CCNCC1 CHEMBL1333 563 | BrC(Br)Br CHEMBL345248 564 | ClC(Cl)Br CHEMBL346231 565 | NC(=O)CCl CHEMBL346368 566 | CNCc1ccccc1 
CHEMBL1338 567 | O=NN1CCC1 CHEMBL351479 568 | OC(=O)CCS CHEMBL358697 569 | NC1=NCCS1 CHEMBL362148 570 | S=C=NCC#C CHEMBL401514 571 | OCc1ccoc1 CHEMBL440914 572 | CNC(N)=NN CHEMBL447555 573 | C1CCCCCC1 CHEMBL453194 574 | ClCCSCCCl CHEMBL455341 575 | CCOC(N)=O CHEMBL462547 576 | FC(F)CCl CHEMBL1797000 577 | C1CC=CC1 CHEMBL1797299 578 | Cc1cnccn1 CHEMBL479791 579 | OOC1CCCO1 CHEMBL505384 580 | Nn1cnnc1 CHEMBL1868166 581 | COC(N)=O CHEMBL1085707 582 | NC1CCCC1 CHEMBL1171859 583 | CSCCNCCN CHEMBL1191800 584 | NCCCNCCS CHEMBL1201382 585 | O=C1CCO1 CHEMBL1200627 586 | CC(O)C#N CHEMBL3559764 587 | NCCCNCCN CHEMBL1213267 588 | CC1NCC=N1 CHEMBL330591 589 | ClCC(Cl)Cl CHEMBL43882 590 | CCCn1ccnc1 CHEMBL95929 591 | SCc1cccs1 CHEMBL152603 592 | OCCSSCCO CHEMBL1233278 593 | COCCOCCO CHEMBL1235250 594 | CCC(N)=O CHEMBL1235716 595 | NCC(S)=O CHEMBL1233056 596 | Cn1cccc1 CHEMBL1234459 597 | C=CC1CO1 CHEMBL1299388 598 | CC(O)CCl CHEMBL1361129 599 | CN(C)CCS CHEMBL1395579 600 | CC(C)C=O CHEMBL1404017 601 | OCC(O)CS CHEMBL1398948 602 | CNC(S)=S CHEMBL1413694 603 | ClCC1CO1 CHEMBL1421613 604 | Cc1ccco1 CHEMBL1445555 605 | CC(C)C#N CHEMBL1492874 606 | C1OCOCO1 CHEMBL1495792 607 | N#CCCC#N CHEMBL1562258 608 | Cc1cscn1 CHEMBL1566946 609 | CC1CCCO1 CHEMBL1580503 610 | C=CCCC#N CHEMBL1595985 611 | Nc1nncs1 CHEMBL1650237 612 | C=CCOC=O CHEMBL1697703 613 | CSC(N)=S CHEMBL1673038 614 | OCCCSCCO CHEMBL1741874 615 | CCOCCOCC CHEMBL1877517 616 | NC(=N)CF CHEMBL1962624 617 | CNC(N)NC CHEMBL2009606 618 | C1OOCOO1 CHEMBL2071269 619 | OCS(O)=O CHEMBL2111064 620 | C1CSCCN1 CHEMBL2333141 621 | COC(S)=S CHEMBL3039753 622 | NCCCCC=O CHEMBL2261443 623 | FC1CCNC1 CHEMBL2448948 624 | CCCN(C)C CHEMBL2448976 625 | CCN(C)CC CHEMBL2448813 626 | OC1CCOC1 CHEMBL2287517 627 | CCCN=C=S CHEMBL2251726 628 | CC(CO)CS CHEMBL3098154 629 | CC(=O)NN CHEMBL3091859 630 | CCCCOC=O CHEMBL2270394 631 | CC(CO)CO CHEMBL3183047 632 | BrCC1CO1 CHEMBL3183066 633 | C1CSCCS1 CHEMBL3183037 634 | COC(C)OC CHEMBL3183607 635 | CCN(O)CC CHEMBL3184786 636 | CCCCON=O CHEMBL3181968 637 | CC=CCC#N CHEMBL3181969 638 | CCC=CC#N CHEMBL3185046 639 | COCC(C)N CHEMBL3184692 640 | CC(O)CCO CHEMBL3186475 641 | COCC(C)O CHEMBL3186306 642 | O=CNNC=O CHEMBL3185965 643 | ON=CC=NO CHEMBL3185538 644 | BrCCCCBr CHEMBL3185714 645 | CCCCCCBr CHEMBL3187491 646 | SC1CCCC1 CHEMBL3186752 647 | NCCCCCCS CHEMBL3247584 648 | NCC(N)CS CHEMBL3302693 649 | NC(=N)CS CHEMBL3304035 650 | NC(=S)NO CHEMBL3274945 651 | OCC(O)CF CHEMBL3276496 652 | ClCC=CCl CHEMBL3561804 653 | CCCCCCCN CHEMBL3561940 654 | CCCCOC=C CHEMBL3561125 655 | OC(CI)CI CHEMBL3707258 656 | CC(=O)C=NO CHEMBL17940 657 | Nc1ccccc1O CHEMBL28319 658 | CC1CCOC1=O CHEMBL36365 659 | Cc1ccccc1C CHEMBL45005 660 | Cc1ccccc1O CHEMBL46931 661 | CC=C(C)C=O CHEMBL53493 662 | CC1CCNC1=O CHEMBL59378 663 | CC=CC(N)=O CHEMBL58562 664 | Nc1ccccc1N CHEMBL70582 665 | Cc1nccnc1C CHEMBL96425 666 | CN1CC=NC1N CHEMBL97965 667 | CCNC(N)=S CHEMBL116961 668 | ClCC=CCCl CHEMBL468582 669 | Cc1ccoc1C CHEMBL108232 670 | CC=CCC#N CHEMBL1322495 671 | CCC=CC#N CHEMBL2138413 672 | CCC=CCCO CHEMBL2251452 673 | CCCC=CCO CHEMBL2228463 674 | CCC=CCCO CHEMBL3184538 675 | CC=CCC=C CHEMBL3182034 676 | CCC=CC#N CHEMBL3185899 677 | CN1C=CNC1=S CHEMBL1515 678 | CC(O)CCO CHEMBL1231501 679 | CCC(N)CO CHEMBL3184640 680 | CSCCC(N)CS CHEMBL36661 681 | CCS(=O)CC CHEMBL174477 682 | NCC(O)CON CHEMBL284573 683 | NC(CS)C(O)=O CHEMBL863 684 | ONN(=O)=O CHEMBL369802 685 | CC(Cl)CCl CHEMBL373466 686 | CNN(=O)=O CHEMBL405641 687 | CSC(N)=N CHEMBL2112024 688 | ON(O)N=O CHEMBL1741048 689 | 
OC1COCC1O CHEMBL350524 690 | OC1CNCC1O CHEMBL396701 691 | OC1CCCC1O CHEMBL399324 692 | S=C1SSC=C1 CHEMBL368700 693 | FC1CCNCC1 CHEMBL1162291 694 | CN1CCCCC1=O CHEMBL12011 695 | S=C1NCCCCN1 CHEMBL11693 696 | CN(C)C(C)=O CHEMBL11873 697 | CC(N)C(O)=O CHEMBL12198 698 | CCN1CCCC1=O CHEMBL12221 699 | O=C1NCCCCN1 CHEMBL12376 700 | COc1ccc(O)cc1 CHEMBL544 701 | CCCCCC(O)=O CHEMBL14184 702 | Cn1cncc1CCN CHEMBL14722 703 | N#Cc1ccccc1 CHEMBL15819 704 | O=Cc1ccccc1 CHEMBL15972 705 | CCCCCC(O)CN CHEMBL18576 706 | O=C1CCCCCC1 CHEMBL18607 707 | CCCCCC(C)=O CHEMBL18893 708 | NCCC1CCCCC1 CHEMBL19428 709 | COC1CNC=NC1 CHEMBL21779 710 | CCCc1ccccn1 CHEMBL21824 711 | CCCC1CCCCN1 CHEMBL21867 712 | CC(C)(C)COO CHEMBL23860 713 | CS(C)(=O)=O CHEMBL25028 714 | CC(C)(C)CCO CHEMBL25029 715 | CNCCCNCCCNC CHEMBL29194 716 | C1CCCC=CCC1 CHEMBL30773 717 | ON1CNCCC1=O CHEMBL31155 718 | CCCCC(CC)CO CHEMBL31637 719 | NCCc1ccccn1 CHEMBL32813 720 | NCCCCNCCCCN CHEMBL36119 721 | CC(N)P(O)=O CHEMBL37702 722 | OC(=O)C1CO1 CHEMBL35308 723 | CCNCCCNCCCN CHEMBL37901 724 | CC(=O)OCCBr CHEMBL42088 725 | CSCCOC(C)=O CHEMBL42606 726 | NCP(O)(O)=O CHEMBL41873 727 | CCCCCCCNC=O CHEMBL43719 728 | CCCC(=O)OCC CHEMBL44800 729 | Cc1nsc(N)n1 CHEMBL47803 730 | CCOC(=O)C=C CHEMBL52084 731 | O=C1OCCC=C1 CHEMBL55078 732 | CCOC(=O)C#C CHEMBL53384 733 | CNCCCCNCC=C CHEMBL60417 734 | CCN(CC)C(S)=S CHEMBL961 735 | CNCCCCNCC#C CHEMBL59623 736 | NC1=CONC1=O CHEMBL67409 737 | NS(O)(=O)=O CHEMBL68253 738 | CCCCN=C(N)N CHEMBL73004 739 | CN(CCN)CCCN CHEMBL76497 740 | NCCCCCC(O)=O CHEMBL1046 741 | NC1=NCCSCC1 CHEMBL88308 742 | NCCCCCNCCCN CHEMBL89035 743 | NNCCc1ccccc1 CHEMBL1089 744 | CCOc1cnccn1 CHEMBL93554 745 | CCSc1cnccn1 CHEMBL93555 746 | CCN1CCC=CC1 CHEMBL98544 747 | CCCCn1ccnc1 CHEMBL97667 748 | O=Nc1ccccc1 CHEMBL98797 749 | NCCCNOCCCN CHEMBL105749 750 | C[S+](C)C CHEMBL1237171 751 | Nc1ccc(O)cc1 CHEMBL1142 752 | NCCCP(O)=O CHEMBL112203 753 | CSCCC(O)=O CHEMBL116212 754 | OCC(F)(F)F CHEMBL116675 755 | C1NC=NC=C1 CHEMBL122235 756 | NP(O)(O)=O CHEMBL121754 757 | CP(O)(O)=S CHEMBL122577 758 | CC(C)(N)CO CHEMBL122588 759 | CP(O)(O)=O CHEMBL122938 760 | OCCN1CCCC1 CHEMBL122581 761 | CCCCCC(C)N CHEMBL123693 762 | CCCCCCCCCC CHEMBL134537 763 | CSc1nccn1C CHEMBL136263 764 | CCCCCNCC#N CHEMBL139751 765 | CCC(C)(C)C CHEMBL142735 766 | NC(=N)SCCF CHEMBL148951 767 | CCOC(=O)NO CHEMBL153081 768 | NCCc1cscn1 CHEMBL155328 769 | ClCC(Cl)=C CHEMBL156075 770 | CCCCCCCCBr CHEMBL156047 771 | CCCCCCCCCl CHEMBL158445 772 | NC1=NCCCN1 CHEMBL158626 773 | NC1=NCCCO1 CHEMBL161118 774 | NNCCC(O)=O CHEMBL159205 775 | CN(C=C)N=O CHEMBL163957 776 | COCN(C)N=O CHEMBL163961 777 | CCN(CC)N=O CHEMBL164290 778 | CCCN(C)N=O CHEMBL165385 779 | O=C1OCC=C1 CHEMBL166223 780 | NC(=O)CCCl CHEMBL171266 781 | CNc1ccccc1 CHEMBL170781 782 | NC(=O)CCCO CHEMBL174258 783 | OCCCCCSC#N CHEMBL176455 784 | O=Cc1ccco1 CHEMBL189362 785 | CSc1ccccc1 CHEMBL192899 786 | CCCCC(C)=O CHEMBL195861 787 | NCCC(=O)NO CHEMBL218945 788 | O=Cc1cscn1 CHEMBL225650 789 | N#CN1CCCC1 CHEMBL262697 790 | Cn1cnc(CCN)c1 CHEMBL507 791 | CCCCC(O)=O CHEMBL268736 792 | N=C1CCCCN1 CHEMBL269058 793 | CCN(CC)C#N CHEMBL274120 794 | CCC(CC)C=O CHEMBL273782 795 | O=CC1CCCC1 CHEMBL274711 796 | COc1ccccc1 CHEMBL278024 797 | CCc1ccccn1 CHEMBL279305 798 | O=C1CCCNN1 CHEMBL283612 799 | ON1CNCC1=O CHEMBL286024 800 | NOCC(O)CON CHEMBL287667 801 | NCCCC(N)CF CHEMBL290500 802 | NCCCCNCC#C CHEMBL292770 803 | CC(N)P(O)O CHEMBL300051 804 | CC(C)CNC=O CHEMBL299094 805 | Cn1ccnc1Br CHEMBL305538 806 | NC1=NCCCS1 CHEMBL306541 807 | ONc1ccccc1 
CHEMBL320474 808 | Brc1ccncc1 CHEMBL325044 809 | O=Cc1cccs1 CHEMBL328441 810 | Brc1ccccn1 CHEMBL331374 811 | BrC1NCC=N1 CHEMBL330406 812 | NP(N)(O)=O CHEMBL333905 813 | CC(C)(C)OO CHEMBL348399 814 | CC(C)(C)Br CHEMBL347644 815 | COC(OC)C#C CHEMBL349188 816 | OCCN(CCO)N=O CHEMBL1334 817 | CC(C)(C)Cl CHEMBL346997 818 | O=NN1CCCC1 CHEMBL351175 819 | O=NN1CCOC1 CHEMBL351189 820 | CC(Cl)CCCl CHEMBL352037 821 | CC1CCN=C1N CHEMBL359703 822 | CCCCC(N)CS CHEMBL357138 823 | C=CCSSCC=C CHEMBL366603 824 | CCc1ccccc1 CHEMBL371561 825 | CNCC(C)CNC CHEMBL367458 826 | O=Cc1cocn1 CHEMBL444137 827 | CC(C)C(C)O CHEMBL443470 828 | CN(C)CCCCl CHEMBL449411 829 | BrC(=C)C=O CHEMBL447065 830 | NCCSC(N)=N CHEMBL454761 831 | CCCCCC(C)O CHEMBL449522 832 | CCCCC(O)CC CHEMBL452729 833 | C1CCCCCCC1 CHEMBL452651 834 | CC(C)=CC=O CHEMBL453815 835 | NNc1ccccc1 CHEMBL456807 836 | CC(C)(C)CO CHEMBL458630 837 | NC1CCCCC1 CHEMBL1794762 838 | CCCCCCCCl CHEMBL1797136 839 | OC(CCl)CCl CHEMBL468581 840 | CCCSC(N)=N CHEMBL483092 841 | CCCCC(C)C CHEMBL1797267 842 | OC1CCCCCC1 CHEMBL503332 843 | CCNC(N)=NN CHEMBL507240 844 | OCCC(O)=O CHEMBL1205969 845 | Clc1ccccn1 CHEMBL509579 846 | CSSc1cccs1 CHEMBL554538 847 | CC(C)(C)NO CHEMBL555486 848 | OCCn1ccnc1 CHEMBL555293 849 | ClC(=C)C=C CHEMBL555660 850 | O=C1NCCO1 CHEMBL1867161 851 | CCCCN=C=S CHEMBL1814588 852 | NCCc1cccs1 CHEMBL252803 853 | CN(C)CCCO CHEMBL1209424 854 | CC(=O)CBr CHEMBL1085947 855 | OC(=O)C=C CHEMBL1213529 856 | OC(=O)C#C CHEMBL1213530 857 | CSCC(N)CS CHEMBL1159811 858 | CCCCCCCCN CHEMBL1160509 859 | Fc1ccccn1 CHEMBL1162360 860 | OC(=O)C=O CHEMBL1162545 861 | FC1CCCNC1 CHEMBL1162289 862 | Fc1cccnc1 CHEMBL1162361 863 | Nc1cnccn1 CHEMBL1834089 864 | COC(=O)CN CHEMBL1193103 865 | CNC(=N)NC CHEMBL1193979 866 | CSCCNCCCN CHEMBL1191798 867 | CCSCCNCCN CHEMBL1195713 868 | NCCC(N)=O CHEMBL1229081 869 | O=C1CCCCO1 CHEMBL452383 870 | NC(=N)NN=O CHEMBL447467 871 | CCCCCCCCCCO CHEMBL25363 872 | OCCCCCCBr CHEMBL1231334 873 | CCCC(N)=O CHEMBL1231396 874 | CCOCCOCCO CHEMBL1230841 875 | OCC(=O)CO CHEMBL1229937 876 | [C-]#[O+] CHEMBL1231840 877 | CCCCCCC#N CHEMBL1231869 878 | O=CCCCC=O CHEMBL1235482 879 | CCOP(O)=O CHEMBL1231082 880 | CC(=O)CCl CHEMBL1231084 881 | CN(C)CCCN CHEMBL1232234 882 | CNC(=O)NC CHEMBL1234380 883 | COCCOCCOC CHEMBL1234162 884 | OCCC(O)CO CHEMBL1356759 885 | COC(=O)CS CHEMBL1341329 886 | CC(=O)NBr CHEMBL1256514 887 | NCCS(O)=O CHEMBL1256480 888 | C1CCCNCC1 CHEMBL1375444 889 | CCCCCCC=C CHEMBL1376677 890 | CC(=C)C#N CHEMBL1529759 891 | N#CSCSC#N CHEMBL1524617 892 | CCCC(C)CO CHEMBL1569610 893 | CC(=C)C=C CHEMBL1566132 894 | CC(=O)C=C CHEMBL1600824 895 | ClCCOCCCl CHEMBL1613350 896 | ONCC(O)=O CHEMBL1645222 897 | COCC(O)=O CHEMBL1697714 898 | OCCNCNC=O CHEMBL1984734 899 | CONC(C)=O CHEMBL1990145 900 | O=S1CSCS1 CHEMBL1971386 901 | OCC#CCNCl CHEMBL1998044 902 | N=C1SCCS1 CHEMBL2009648 903 | ICC1CCCO1 CHEMBL1999579 904 | NC(=N)CCl CHEMBL2365371 905 | S=C1NCCS1 CHEMBL2398099 906 | CCOC(S)=S CHEMBL3039661 907 | CC(C)CCCO CHEMBL2260955 908 | OCCCCCCCl CHEMBL2260959 909 | CN(C)CC#N CHEMBL2448941 910 | CCCCN(C)C CHEMBL2448977 911 | COCCN(C)C CHEMBL2448835 912 | CN1CCOCC1 CHEMBL2448839 913 | O=COC1CC1 CHEMBL2924223 914 | OCC1CCOC1 CHEMBL2287525 915 | CCC(Cl)Cl CHEMBL2287704 916 | OCC1CCCO1 CHEMBL2287521 917 | CCc1ccco1 CHEMBL2269084 918 | CCC(O)C=C CHEMBL2269086 919 | CCC(C)C=O CHEMBL2270060 920 | CCCSSSCCC CHEMBL3222023 921 | NNC(=S)NN CHEMBL3181818 922 | CC(C)(C)S CHEMBL3182458 923 | CNC(=S)NN CHEMBL3182946 924 | CCC(=O)OC CHEMBL3183973 925 | COC(=O)NN CHEMBL3183780 
926 | CCC(CC)CO CHEMBL3181836 927 | BrCCCCCBr CHEMBL3182198 928 | COC(Cl)=O CHEMBL3182300 929 | CCC(C)=NO CHEMBL3181847 930 | CC(C)=CCO CHEMBL3184952 931 | OCCCCOC=C CHEMBL3182123 932 | ClCC=CCCl CHEMBL3182125 933 | CCCCCCCCS CHEMBL3182056 934 | COP(=O)OC CHEMBL3183964 935 | COC(=O)OC CHEMBL3185216 936 | C1COC=CC1 CHEMBL3184439 937 | CC(C)OC=O CHEMBL3184082 938 | O=C1OCCO1 CHEMBL3181803 939 | CCCCNCCCC CHEMBL3184528 940 | COP(OC)OC CHEMBL3186364 941 | CC(C)CC#N CHEMBL3186839 942 | COC(C)CCO CHEMBL3186019 943 | CCOCC(C)O CHEMBL3188294 944 | OCC(O)CCl CHEMBL3185949 945 | CCCSSCC=C CHEMBL3187351 946 | COC(OC)OC CHEMBL3187679 947 | COCC(O)CO CHEMBL3187682 948 | CCCCCON=O CHEMBL3188202 949 | SC1CCCCC1 CHEMBL3187982 950 | C1C=CC=C1 CHEMBL3188826 951 | C=CCNCC=C CHEMBL3186706 952 | CCC(CO)CO CHEMBL3187400 953 | N#CCNCC#N CHEMBL3186090 954 | CC(C)OCCO CHEMBL3187409 955 | CNC(=S)NC CHEMBL3189044 956 | CSCCCCCCN CHEMBL3247585 957 | OCC(=O)CF CHEMBL3276492 958 | OCC(O)CBr CHEMBL3276497 959 | Ic1ccccn1 CHEMBL3274303 960 | OCC(CS)CS CHEMBL3425833 961 | CC(=C)CCO CHEMBL3561140 962 | SCc1ccco1 CHEMBL3560314 963 | O=S1OCCO1 CHEMBL3561007 964 | NCc1ccco1 CHEMBL3561633 965 | CCCCSCCCC CHEMBL3561568 966 | CN(C)CCCl CHEMBL3580424 967 | CCCCNCC=C CHEMBL3558346 968 | CN1CCCNC1=O CHEMBL12319 969 | NCc1ccccc1F CHEMBL12892 970 | COc1ccccc1O CHEMBL13766 971 | ON=Cc1cnsn1 CHEMBL19205 972 | ON=Cc1cnns1 CHEMBL19953 973 | NCC=CC(O)=O CHEMBL32307 974 | NCC=CC(O)=O CHEMBL33086 975 | CCC1CCOC1=O CHEMBL35976 976 | CCC1CCSC1=O CHEMBL36472 977 | CCC1CCNC1=O CHEMBL57505 978 | CSc1nccnc1C CHEMBL97593 979 | CN1CC=NC1Br CHEMBL99153 980 | CC(=C)C=NO CHEMBL104456 981 | Nc1ccccc1S CHEMBL116835 982 | Cc1ncccc1O CHEMBL134348 983 | CCCCCCC=NO CHEMBL137790 984 | CN1CCSC1=N CHEMBL167256 985 | Nc1ccccc1F CHEMBL195328 986 | Oc1ccccc1F CHEMBL224144 987 | Oc1ccccc1I CHEMBL225564 988 | Oc1ccccc1O CHEMBL280998 989 | CC1CCSC1=O CHEMBL287617 990 | Cc1cccnc1N CHEMBL291544 991 | CN1CC=NC1F CHEMBL327284 992 | CC(Cl)=CCl CHEMBL346519 993 | Cc1ccccc1F CHEMBL352215 994 | Nc1ccncc1N CHEMBL354077 995 | CN1CCSC1=S CHEMBL397404 996 | CCCCC=CC=O CHEMBL454759 997 | BrCC=CCBr CHEMBL1324439 998 | N#CC=CC#N CHEMBL1451833 999 | COCC=CCOC CHEMBL1994550 1000 | -------------------------------------------------------------------------------- /examples/qed_property_example.txt: -------------------------------------------------------------------------------- 1 | 0.4068|CCO 2 | 0.3598|C 3 | 0.3853|CO 4 | 0.419|NCCS 5 | 0.4035|NCCN 6 | 0.3847|CN 7 | 0.3606|C=O 8 | 0.4062|CCN 9 | 0.4028|CSC 10 | 0.3936|CBr 11 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 88 3 | skip-string-normalization = false 4 | target-version = ['py37'] 5 | 6 | [tool.isort] 7 | multi_line_output = 3 8 | include_trailing_comma = true 9 | force_grid_wrap = 0 10 | use_parentheses = true 11 | ensure_newline_before_comments = true 12 | line_length = 88 13 | force_to_top = ["rdkit"] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==v3.1.0 2 | torch==1.7.1 3 | tqdm 4 | numpy 5 | modlamp>=4.3.0 6 | selfies==1.0.4 7 | psutil -------------------------------------------------------------------------------- /scripts/create_vocabulary.py: 
-------------------------------------------------------------------------------- 1 | """Create a vocabulary.""" 2 | import argparse 3 | import os 4 | from collections import Counter 5 | 6 | from tqdm import tqdm 7 | 8 | from terminator.tokenization import ExpressionTokenizer 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument( 12 | "input_filepath", type=str, help="data used to create a vocabulary." 13 | ) 14 | parser.add_argument( 15 | "output_filepath", type=str, help="output where to store the vocabulary." 16 | ) 17 | parser.add_argument( 18 | "--max_exponent", type=int, default=5, help="maximum exponent for num-tokens." 19 | ) 20 | 21 | 22 | def main() -> None: 23 | """Create a vocabulary using an ExpressionTokenizer.""" 24 | args = parser.parse_args() 25 | input_filepath = args.input_filepath 26 | output_filepath = args.output_filepath 27 | max_exponent = args.max_exponent 28 | 29 | vocabulary_counter = Counter() 30 | tokenizer = ExpressionTokenizer() 31 | 32 | # tokens for properties 33 | vocabulary_counter.update( 34 | [ 35 | "", 36 | "", 37 | "", 38 | "", 39 | "", 40 | "", 41 | "", 42 | "", 43 | "", 44 | "", 45 | ] 46 | ) 47 | # tokens for property numerical values 48 | digits = list(range(10)) 49 | vocabulary_counter.update( 50 | [ 51 | f"_{digit}_{exponent}_" 52 | for exponent in range(max_exponent + 1) 53 | for digit in digits 54 | ] 55 | + [ 56 | f"_{digit}_-{exponent}_" 57 | for exponent in range(max_exponent + 1) 58 | for digit in digits 59 | ] 60 | ) 61 | with open(input_filepath, "rt") as fp: 62 | for line in tqdm(fp): 63 | vocabulary_counter.update(tokenizer.tokenize(line.strip())) 64 | 65 | # special tokens for the model training and keeping the possibility to extend the vocabulart 66 | special_tokens = [ 67 | "[PAD]", 68 | "[unused1]", 69 | "[unused2]", 70 | "[unused3]", 71 | "[unused4]", 72 | "[unused5]", 73 | "[unused6]", 74 | "[unused7]", 75 | "[unused8]", 76 | "[unused9]", 77 | "[unused10]", 78 | "[UNK]", 79 | "[CLS]", 80 | "[SEP]", 81 | "[MASK]", 82 | ] 83 | 84 | with open(output_filepath, "wt") as fp: 85 | tokens = special_tokens + [ 86 | token for token, _ in vocabulary_counter.most_common() 87 | ] 88 | fp.write(os.linesep.join(tokens)) 89 | 90 | 91 | if __name__ == "__main__": 92 | main() 93 | -------------------------------------------------------------------------------- /scripts/eval_language_modeling.py: -------------------------------------------------------------------------------- 1 | """ 2 | Language modeling evaluation script 3 | """ 4 | import json 5 | import logging 6 | import math 7 | import os 8 | import sys 9 | from time import time 10 | 11 | import pandas as pd 12 | from transformers import ( 13 | AutoConfig, 14 | AutoModelWithLMHead, 15 | DataCollatorForPermutationLanguageModeling, 16 | HfArgumentParser, 17 | set_seed, 18 | ) 19 | 20 | from terminator.args import CustomTrainingArguments, EvalArguments 21 | from terminator.collators import ( 22 | ConditionalGenerationEvaluationCollator, 23 | PropertyCollator, 24 | ) 25 | from terminator.datasets import get_dataset 26 | from terminator.evaluator import Evaluator 27 | from terminator.property_predictors import PREDICT_FACTORY 28 | from terminator.tokenization import ExpressionBertTokenizer 29 | from terminator.trainer import get_trainer_dict 30 | from terminator.utils import ( 31 | disable_rdkit_logging, 32 | find_safe_path, 33 | get_latest_checkpoint, 34 | get_equispaced_ranges, 35 | ) 36 | 37 | logger = logging.getLogger(__name__) 38 | 39 | 40 | # setup logging 41 | 
logging.basicConfig(stream=sys.stdout, level=logging.INFO) 42 | 43 | 44 | def main(): 45 | 46 | parser = HfArgumentParser((CustomTrainingArguments, EvalArguments)) 47 | training_args, eval_args = parser.parse_args_into_dataclasses() 48 | 49 | # Setup logging 50 | logging.basicConfig( 51 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 52 | datefmt="%m/%d/%Y %H:%M:%S", 53 | level=logging.INFO, 54 | ) 55 | 56 | with open(eval_args.param_path, "r") as f: 57 | eval_params = json.load(f) 58 | 59 | param_filename = eval_args.param_path.split("/")[-1].split(".json")[0] 60 | 61 | # Wrap into args to be safe 62 | eval_args.__dict__.update(eval_params) 63 | 64 | # NOTE: Results will be stored in model folder 65 | model_dir = training_args.output_dir 66 | if "checkpoint" not in model_dir: 67 | model_dir = get_latest_checkpoint( 68 | model_dir, must_contain=eval_params.get("checkpoint-str", "best") 69 | ) 70 | 71 | config_name = os.path.join(model_dir, "config.json") 72 | with open(config_name, "r") as f: 73 | model_params = json.load(f) 74 | 75 | config = AutoConfig.from_pretrained( 76 | config_name, mem_len=model_params.get("mem_len", 1024) 77 | ) 78 | 79 | tokenizer = ExpressionBertTokenizer.from_pretrained(model_dir) 80 | sep = tokenizer.expression_separator 81 | 82 | model = AutoModelWithLMHead.from_pretrained( 83 | model_dir, from_tf=bool(".ckpt" in model_dir), config=config 84 | ) 85 | logger.info(f"Model restored from {model_dir}") 86 | 87 | model.resize_token_embeddings(len(tokenizer)) 88 | 89 | if eval_params.get("block_size", -1) <= 0: 90 | eval_params["block_size"] = tokenizer.max_len 91 | # Our input block size will be the max possible for the model 92 | else: 93 | eval_params["block_size"] = min(training_args.block_size, tokenizer.max_len) 94 | 95 | # Get datasets 96 | eval_dataset = get_dataset( 97 | eval_args.eval_file, 98 | block_size=eval_params["block_size"], 99 | tokenizer=tokenizer, 100 | line_by_line=eval_params.get("line_by_line", True), 101 | ) 102 | 103 | logger.info(f"Dataset size {len(eval_dataset)}.") 104 | num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 105 | logger.info(f"Number of parameters {num_params} of type {type(model)}") 106 | 107 | plm_prob = eval_params["plm_probability"] 108 | perplexity_plm_prob = eval_params.get("perplexity_plm_prob", 0.2) 109 | # NOTE: This collator does not provide an attention mask (unlike the refined training 110 | # collators which prevent attention on padding), however, the model will largely 111 | # ignore the paddings. 
112 | vanilla_collator = DataCollatorForPermutationLanguageModeling( 113 | tokenizer=tokenizer, 114 | plm_probability=perplexity_plm_prob, 115 | max_span_length=eval_params["max_span_length"], 116 | ) 117 | 118 | custom_trainer_params = get_trainer_dict(model_params) 119 | 120 | # Initialize our Evaluator 121 | evaluator = Evaluator( 122 | model=model, 123 | args=training_args, 124 | eval_params=eval_params, 125 | data_collator=vanilla_collator, 126 | eval_dataset=eval_dataset, 127 | tokenizer=tokenizer, 128 | prediction_loss_only=False, 129 | **custom_trainer_params, 130 | ) 131 | 132 | # Evaluation 133 | result_dir = os.path.join(model_dir, "results") 134 | os.makedirs(result_dir, exist_ok=True) 135 | eval_filename = eval_args.eval_file.split("/")[-1].split("_")[-1].split(".")[0] 136 | logger.info("*** Evaluate perplexity ***") 137 | 138 | with open(eval_args.eval_file, "r") as f: 139 | prefix = sep.join(f.readline().split(sep)[:-1]) + sep 140 | 141 | # Set seed 142 | if eval_params.get("set_seed", True): 143 | set_seed(eval_params.get("seed", int(time()))) 144 | 145 | eval_output = evaluator.evaluate() 146 | perplexity = math.exp(eval_output["eval_loss"]) 147 | results = {"perplexity": perplexity} 148 | path = os.path.join( 149 | result_dir, f"{eval_filename}_perplexity_plm_{perplexity_plm_prob}.txt" 150 | ) 151 | 152 | with open(find_safe_path(path), "w") as writer: 153 | logger.info("***** Eval results *****") 154 | for key in sorted(results.keys()): 155 | logger.info(" %s = %s", key, str(results[key])) 156 | writer.write("%s = %s\n" % (key, str(results[key]))) 157 | 158 | disable_rdkit_logging() 159 | property_results = [] 160 | properties = eval_params["property_tokens"] 161 | orders = eval_params.get("property_token_masking_order", None) 162 | tokens_to_mask = eval_params.get("property_tokens_to_mask", None) 163 | conditioning_ranges = eval_params.get( 164 | "conditioning_range", 165 | get_equispaced_ranges( 166 | eval_args.eval_file, 167 | properties, 168 | precisions=eval_params.get("property_precisions", [2] * len(properties)), 169 | ), 170 | ) 171 | logger.info(f"Conditioning range is {conditioning_ranges}") 172 | 173 | # If the token masking orders is not specified we just evaluate all properties together 174 | if not orders: 175 | property_collator = PropertyCollator( 176 | tokenizer=tokenizer, 177 | property_tokens=properties, 178 | num_tokens_to_mask=tokens_to_mask, 179 | mask_token_order=orders, 180 | ) 181 | ps, rs = evaluator.multi_property_prediction( 182 | property_collator, 183 | save_path=os.path.join(result_dir, eval_filename), 184 | rmse_factor=eval_params.get("rmse_factor", 1), 185 | ) 186 | else: 187 | 188 | for prop, order, mask in zip(properties, orders, tokens_to_mask): 189 | logger.info(f"*** Evaluate property {prop} ***") 190 | 191 | for to_mask in mask: 192 | 193 | # We iteratively make the task harder by masking 1-4 tokens. 194 | # The order of this is determined by `property_token_masking_order`. 
195 | property_collator = PropertyCollator( 196 | tokenizer=tokenizer, 197 | property_tokens=[prop], 198 | num_tokens_to_mask=[to_mask], 199 | mask_token_order=[order], 200 | ) 201 | print(f"Masking {to_mask} in order {order}") 202 | ps, rs, ss = evaluator.property_prediction( 203 | property_collator, 204 | save_path=os.path.join( 205 | result_dir, f"{prop[1:-1]}_{eval_filename}_mask_{to_mask}.csv" 206 | ), 207 | rmse_factor=eval_params.get("rmse_factor", 1), 208 | ) 209 | for p, r, s, n in zip(ps, rs, ss, ["Greedy", "Sampling", "Beam"]): 210 | prop_res_dict = { 211 | "prop": prop[1:-1], 212 | "pearson": p, 213 | "spearman": s, 214 | "rmse": r, 215 | "search": n, 216 | "num_masked": to_mask, 217 | } 218 | property_results.append(prop_res_dict) 219 | 220 | pd.DataFrame(property_results).to_csv( 221 | os.path.join(result_dir, f"property_prediction_{eval_filename}.csv") 222 | ) 223 | for prop, cr in zip(properties, conditioning_ranges): 224 | logger.info(f"Evaluating conditional generation for {prop} with {cr}") 225 | conditional_generation_collator = ConditionalGenerationEvaluationCollator( 226 | tokenizer=tokenizer, 227 | property_token=prop, 228 | conditioning_range=cr, 229 | plm_probability=plm_prob, 230 | max_span_length=eval_params["max_span_length"], 231 | entity_to_mask=eval_params.get("entity_to_mask", None), 232 | entity_separator_token=eval_params.get("entity_separator_token", None), 233 | ) 234 | 235 | # Retrieve the property prediction function from dictionary 236 | if prop[1:-1] in PREDICT_FACTORY.keys(): 237 | evaluate_fn = PREDICT_FACTORY[prop[1:-1]] 238 | logger.info(f"Found property predictor for {prop}") 239 | property_collator = None 240 | else: 241 | # If unavailable property is predicted 242 | evaluate_fn = None 243 | 244 | if orders: 245 | # In single property prediction mode we just mask the property 246 | property_collator = PropertyCollator( 247 | tokenizer=tokenizer, 248 | property_tokens=[prop], 249 | num_tokens_to_mask=[-1], 250 | mask_token_order=None, 251 | ) 252 | else: 253 | # in this case, we use the property predictor from above where all tokens are masked 254 | pass 255 | 256 | logger.info( 257 | f"No property predictor for {prop}, using model itself for evaluation" 258 | ) 259 | 260 | evaluator.conditional_generation( 261 | conditional_generation_collator, 262 | save_path=os.path.join( 263 | result_dir, 264 | f"{prop[1:-1]}_conditional_generation_{param_filename}_{eval_filename}.csv", 265 | ), 266 | passed_eval_fn=evaluate_fn, 267 | property_collator=property_collator, 268 | denormalize_params=eval_params.get("denormalize", {}).get(prop, None), 269 | prefix=prefix, 270 | ) 271 | 272 | print("Done, shutting down.") 273 | 274 | 275 | if __name__ == "__main__": 276 | main() 277 | -------------------------------------------------------------------------------- /scripts/eval_lm_nlp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Language modeling adapted from Huggingface transformers. 
4 | 5 | The file is an adaptation of https://github.com/huggingface/transformers/blob/v3.1.0/examples/language-modeling/run_language_modeling.py 6 | 7 | """ 8 | 9 | import json 10 | import logging 11 | import math 12 | import os 13 | import warnings 14 | from dataclasses import dataclass, field 15 | from typing import Optional 16 | 17 | import pandas as pd 18 | import torch 19 | import transformers 20 | from transformers import ( 21 | CONFIG_MAPPING, 22 | MODEL_WITH_LM_HEAD_MAPPING, 23 | AutoConfig, 24 | AutoModelWithLMHead, 25 | DataCollatorForLanguageModeling, 26 | DataCollatorForPermutationLanguageModeling, 27 | HfArgumentParser, 28 | LineByLineTextDataset, 29 | PreTrainedTokenizer, 30 | TextDataset, 31 | XLNetConfig, 32 | XLNetLMHeadModel, 33 | set_seed, 34 | ) 35 | 36 | from terminator.args import CustomTrainingArguments, EvalArguments, ModelArguments 37 | from terminator.collators import TRAIN_COLLATORS, PropertyCollator 38 | from terminator.datasets import get_dataset 39 | from terminator.evaluator import Evaluator 40 | from terminator.tokenization import PropertyTokenizerSquare, XLNetRTTokenizer 41 | from terminator.trainer import CustomTrainer, get_trainer_dict 42 | from terminator.utils import get_latest_checkpoint 43 | 44 | logger = logging.getLogger(__name__) 45 | 46 | 47 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 48 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 49 | 50 | 51 | @dataclass 52 | class DataTrainingArguments: 53 | """ 54 | Arguments pertaining to what data we are going to input our model for training and eval. 55 | """ 56 | 57 | eval_data_file: Optional[str] = field( 58 | default=None, 59 | metadata={ 60 | "help": "An optional input evaluation data file to evaluate the perplexity on (a text file)." 61 | }, 62 | ) 63 | line_by_line: bool = field( 64 | default=False, 65 | metadata={ 66 | "help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences." 67 | }, 68 | ) 69 | 70 | mlm: bool = field( 71 | default=False, 72 | metadata={ 73 | "help": "Train with masked-language modeling loss instead of language modeling." 74 | }, 75 | ) 76 | mlm_probability: float = field( 77 | default=0.15, 78 | metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}, 79 | ) 80 | plm_probability: float = field( 81 | default=1 / 6, 82 | metadata={ 83 | "help": "Ratio of length of a span of masked tokens to surrounding context length for permutation language modeling." 84 | }, 85 | ) 86 | max_span_length: int = field( 87 | default=5, 88 | metadata={ 89 | "help": "Maximum length of a span of masked tokens for permutation language modeling." 90 | }, 91 | ) 92 | 93 | block_size: int = field( 94 | default=-1, 95 | metadata={ 96 | "help": "Optional input sequence length after tokenization." 97 | "The training dataset will be truncated in block of this size for training." 98 | "Default to the model max input length for single sentence inputs (take into account special tokens)." 99 | }, 100 | ) 101 | overwrite_cache: bool = field( 102 | default=False, 103 | metadata={"help": "Overwrite the cached training and evaluation sets"}, 104 | ) 105 | 106 | 107 | def main(): 108 | # See all possible arguments in src/transformers/training_args.py 109 | # or by passing the --help flag to this script. 110 | # We now keep distinct sets of args, for a cleaner separation of concerns. 
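Because the arguments are assembled from dataclasses by `HfArgumentParser`, each field name becomes a command-line flag (for example `--output_dir`, `--param_path`, `--eval_file`). The short sketch below illustrates that mapping with a toy dataclass and explicit argument strings; the dataclass and the `data/` path are placeholders for illustration, while `configs/qed_eval.json` is a file that ships with the repository.

```python
from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser


@dataclass
class ToyEvalArguments:
    """Illustrative stand-in for the EvalArguments dataclass used by these scripts."""

    param_path: Optional[str] = field(
        default=None, metadata={"help": "JSON file with evaluation parameters."}
    )
    eval_file: Optional[str] = field(
        default=None, metadata={"help": "Evaluation data file."}
    )


parser = HfArgumentParser(ToyEvalArguments)
# Field names become flags; passing explicit strings mimics a command-line call.
(toy_args,) = parser.parse_args_into_dataclasses(
    args=["--param_path", "configs/qed_eval.json", "--eval_file", "data/my_eval_set.txt"]
)
print(toy_args.param_path, toy_args.eval_file)
```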
111 | 112 | parser = HfArgumentParser((CustomTrainingArguments, EvalArguments)) 113 | training_args, eval_args = parser.parse_args_into_dataclasses() 114 | 115 | # Setup logging 116 | logging.basicConfig( 117 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 118 | datefmt="%m/%d/%Y %H:%M:%S", 119 | level=logging.INFO, 120 | ) 121 | 122 | with open(eval_args.param_path, "r") as f: 123 | eval_params = json.load(f) 124 | 125 | # Wrap into args to be safe 126 | eval_args.__dict__.update(eval_params) 127 | 128 | if not os.path.exists(training_args.output_dir): 129 | raise ValueError( 130 | f"Output directory ({training_args.output_dir}) does not exist" 131 | ) 132 | # Setup logging 133 | logging.basicConfig( 134 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 135 | datefmt="%m/%d/%Y %H:%M:%S", 136 | level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, 137 | ) 138 | 139 | # Set seed 140 | set_seed(training_args.seed) 141 | 142 | model_dir = training_args.output_dir 143 | if "checkpoint" not in model_dir: 144 | model_dir = get_latest_checkpoint( 145 | model_dir, must_contain=eval_params.get("checkpoint-str", "best") 146 | ) 147 | 148 | config_name = os.path.join(model_dir, "config.json") 149 | with open(config_name, "r") as f: 150 | model_params = json.load(f) 151 | 152 | config = AutoConfig.from_pretrained(config_name) 153 | 154 | model = XLNetLMHeadModel.from_pretrained(model_dir, config=config) 155 | logger.info(f"Model restored from {model_dir}") 156 | 157 | tokenizer = XLNetRTTokenizer.from_pretrained(model_dir) 158 | property_tokenizer = PropertyTokenizerSquare() 159 | tokenizer.set_property_tokenizer(property_tokenizer) 160 | tokenizer.set_vocab() 161 | # Otherwise the freshly added tokens are added as special tokens. 
162 | # tokenizer.unique_no_split_tokens = tokenizer.unique_no_split_tokens[:9] 163 | 164 | logger.info(f"PyTorch version: {torch.__version__}") 165 | # model.resize_token_embeddings(len(tokenizer)) 166 | 167 | if eval_params.get("block_size", -1) <= 0: 168 | eval_params["block_size"] = tokenizer.max_len 169 | # Our input block size will be the max possible for the model 170 | else: 171 | eval_params["block_size"] = min(training_args.block_size, tokenizer.max_len) 172 | 173 | eval_dataset = get_dataset( 174 | eval_args.eval_file, 175 | tokenizer=tokenizer, 176 | block_size=eval_params["block_size"], 177 | line_by_line=eval_params.get("line_by_line", True), 178 | ) 179 | 180 | logger.info(f"Dataset sizes, {len(eval_dataset)}.") 181 | num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 182 | logger.info(f"Number of parameters {num_params} of type {type(model)}") 183 | 184 | custom_trainer_params = get_trainer_dict(model_params) 185 | 186 | _unused_collator = DataCollatorForPermutationLanguageModeling( 187 | tokenizer=tokenizer, plm_probability=0.1, max_span_length=2 188 | ) 189 | 190 | # Initialize our Evaluator 191 | evaluator = Evaluator( 192 | model=model, 193 | args=training_args, 194 | eval_params=eval_params, 195 | data_collator=_unused_collator, 196 | eval_dataset=eval_dataset, 197 | tokenizer=tokenizer, 198 | prediction_loss_only=False, 199 | **custom_trainer_params, 200 | ) 201 | 202 | # Evaluation 203 | result_dir = os.path.join(model_dir, "results") 204 | os.makedirs(result_dir, exist_ok=True) 205 | eval_filename = eval_args.eval_file.split("/")[-1].split("_")[-1].split(".")[0] 206 | logger.info("*** Evaluate perplexity ***") 207 | 208 | property_results = [] 209 | properties = eval_params["property_token"] 210 | orders = eval_params.get("property_token_masking_order", None) 211 | tokens_to_mask = eval_params.get("property_tokens_to_mask", None) 212 | 213 | for prop, order, mask in zip(properties, orders, tokens_to_mask): 214 | logger.info(f"*** Evaluate property {prop} ***") 215 | 216 | for to_mask in mask: 217 | 218 | # We iteratively make the task harder by masking 1-4 tokens. 219 | # The order of this is determined by `property_token_masking_order`. 220 | property_collator = PropertyCollator( 221 | tokenizer=tokenizer, 222 | property_tokens=[prop], 223 | num_tokens_to_mask=[to_mask], 224 | mask_token_order=[order], 225 | ) 226 | print(f"Masking {to_mask} in order {order}") 227 | ps, rs, ss = evaluator.property_prediction( 228 | property_collator, 229 | save_path=os.path.join( 230 | result_dir, f"{prop[1:-1]}_{eval_filename}_mask_{to_mask}.csv" 231 | ), 232 | ) 233 | for p, r, s, n in zip(ps, rs, ss, ["Greedy", "Sampling", "Beam"]): 234 | prop_res_dict = { 235 | "prop": prop[1:-1], 236 | "pearson": p, 237 | "spearman": s, 238 | "rmse": r, 239 | "search": n, 240 | "num_masked": to_mask, 241 | } 242 | property_results.append(prop_res_dict) 243 | 244 | pd.DataFrame(property_results).to_csv( 245 | os.path.join(result_dir, f"property_prediction_{eval_filename}.csv") 246 | ) 247 | 248 | 249 | if __name__ == "__main__": 250 | main() 251 | -------------------------------------------------------------------------------- /scripts/eval_regressionhead.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Language modeling adapted from Huggingface transformers. 
4 | """ 5 | import json 6 | import logging 7 | import os 8 | from dataclasses import dataclass, field 9 | from typing import Dict, List, Optional, Tuple, Union 10 | 11 | import numpy as np 12 | import pandas as pd 13 | import torch 14 | import torch.nn as nn 15 | import transformers 16 | from scipy.stats import pearsonr, spearmanr 17 | from selfies import decoder, encoder 18 | from sklearn.metrics import mean_squared_error 19 | from torch.optim import AdamW 20 | from torch.utils.data import DataLoader, Dataset 21 | from tqdm import tqdm 22 | from transformers import ( 23 | CONFIG_MAPPING, 24 | MODEL_WITH_LM_HEAD_MAPPING, 25 | AutoConfig, 26 | AutoModelWithLMHead, 27 | DataCollatorForLanguageModeling, 28 | DataCollatorForPermutationLanguageModeling, 29 | HfArgumentParser, 30 | LineByLineTextDataset, 31 | PreTrainedTokenizer, 32 | TextDataset, 33 | XLNetConfig, 34 | XLNetForSequenceClassification, 35 | XLNetLMHeadModel, 36 | get_linear_schedule_with_warmup, 37 | set_seed, 38 | ) 39 | from transformers.tokenization_utils_base import BatchEncoding 40 | 41 | from terminator.args import CustomTrainingArguments, ModelArguments 42 | from terminator.collators import TRAIN_COLLATORS 43 | from terminator.datasets import get_dataset 44 | from terminator.tokenization import ExpressionBertTokenizer 45 | from terminator.trainer import CustomTrainer, get_trainer_dict 46 | from terminator.utils import get_latest_checkpoint 47 | 48 | transformers.logging.set_verbosity_info() 49 | logger = logging.getLogger(__name__) 50 | # logger.setLevel(level=logging.DEBUG) 51 | 52 | 53 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 54 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 55 | 56 | 57 | @dataclass 58 | class DataTrainingArguments: 59 | """ 60 | Arguments pertaining to what data we are going to input our model for training and eval. 61 | """ 62 | 63 | eval_data_file: Optional[str] = field( 64 | default=None, 65 | metadata={ 66 | "help": "An optional input evaluation data file to evaluate the perplexity on (a text file)." 67 | }, 68 | ) 69 | line_by_line: bool = field( 70 | default=False, 71 | metadata={ 72 | "help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences." 73 | }, 74 | ) 75 | 76 | block_size: int = field( 77 | default=-1, 78 | metadata={ 79 | "help": "Optional input sequence length after tokenization." 80 | "The training dataset will be truncated in block of this size for training." 81 | "Default to the model max input length for single sentence inputs (take into account special tokens)." 
82 | }, 83 | ) 84 | overwrite_cache: bool = field( 85 | default=False, 86 | metadata={"help": "Overwrite the cached training and evaluation sets"}, 87 | ) 88 | batch_size: Optional[int] = field(default=16, metadata={"help": "Batch size"}) 89 | 90 | 91 | class XLNetRegressionDataset(Dataset): 92 | def __init__(self, tokenizer, data_path): 93 | 94 | self.tokenizer = tokenizer 95 | 96 | # Lazy data loading 97 | with open(data_path, "r") as f: 98 | self.examples = [line.strip() for line in f.readlines()] 99 | 100 | def __len__(self): 101 | return len(self.examples) 102 | 103 | def __getitem__(self, i): 104 | prop, molecules = self.examples[i].split("|") 105 | label = float(prop.split(">")[-1]) 106 | model_input = self.tokenizer(molecules) 107 | return model_input, label 108 | 109 | 110 | @dataclass 111 | class Collator(DataCollatorForPermutationLanguageModeling): 112 | def finalize(self, batch: torch.Tensor, val: int = 0) -> torch.Tensor: 113 | """Sequence length has to be even for PLM collator, see: 114 | https://github.com/huggingface/transformers/issues/7341 115 | 116 | Args: 117 | batch (torch.Tensor): 2D Tensor (batch_size x seq_len) 118 | val (float): Value to fill with. 119 | 120 | Returns: 121 | torch.Tensor: 2D Tensor (batch_size x seq_len) 122 | """ 123 | if batch.size(1) % 2 != 0: 124 | return torch.cat([batch, torch.ones(batch.size(0), 1).long() * val], axis=1) 125 | return batch.long() 126 | 127 | def attention_mask(self, batch: torch.Tensor, dropout: float = 0.0) -> torch.Tensor: 128 | attention_mask = (~(batch == 0)).to(float) 129 | return attention_mask 130 | 131 | def __call__( 132 | self, examples: List[Tuple[Dict[str, List[int]], float]] 133 | ) -> Dict[str, torch.Tensor]: 134 | device = "cuda" if torch.cuda.is_available() else "cpu" 135 | model_inputs = [e[0]["input_ids"] for e in examples] 136 | inputs = self._tensorize_batch(model_inputs) 137 | inputs = self.finalize(inputs) 138 | 139 | attention_mask = self.attention_mask(inputs) 140 | 141 | labels = torch.Tensor([e[-1] for e in examples]) 142 | return labels.to(device), { 143 | "input_ids": inputs.to(device), 144 | "attention_mask": attention_mask.to(device), 145 | } 146 | 147 | 148 | def main(): 149 | 150 | # Switch off comet 151 | os.environ["COMET_MODE"] = "DISABLED" 152 | 153 | parser = HfArgumentParser( 154 | (ModelArguments, DataTrainingArguments, CustomTrainingArguments) 155 | ) 156 | model_args, data_args, train_args = parser.parse_args_into_dataclasses() 157 | print(model_args) 158 | print(data_args) 159 | 160 | if not os.path.exists(train_args.output_dir): 161 | raise ValueError(f"Output directory ({train_args.output_dir}) does not exists!") 162 | 163 | # Setup logging 164 | logging.basicConfig( 165 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 166 | datefmt="%m/%d/%Y %H:%M:%S", 167 | level=logging.INFO if train_args.local_rank in [-1, 0] else logging.WARN, 168 | ) 169 | logger.warning( 170 | "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 171 | train_args.local_rank, 172 | train_args.device, 173 | train_args.n_gpu, 174 | bool(train_args.local_rank != -1), 175 | train_args.fp16, 176 | ) 177 | logger.info("Training/evaluation parameters %s", train_args) 178 | 179 | # Set seed 180 | set_seed(train_args.seed) 181 | 182 | output_dir = train_args.output_dir 183 | model = XLNetForSequenceClassification.from_pretrained( 184 | output_dir, 185 | cache_dir=model_args.cache_dir, 186 | mem_len=1024, 187 | return_dict=True, 188 | ) 189 | 190 | 
logger.info(f"Model restored from {output_dir}") 191 | 192 | tokenizer = ExpressionBertTokenizer.from_pretrained(model_args.tokenizer_name) 193 | 194 | logger.info(f"PyTorch version: {torch.__version__}") 195 | # model.resize_token_embeddings(len(tokenizer)) 196 | 197 | if data_args.block_size <= 0: 198 | data_args.block_size = tokenizer.max_len 199 | # Our input block size will be the max possible for the model 200 | else: 201 | data_args.block_size = min(data_args.block_size, tokenizer.max_len) 202 | 203 | # Get datasets 204 | device = "cuda" if torch.cuda.is_available() else "cpu" 205 | fileprefix = data_args.eval_data_file.split("/")[-1].split(".")[0] 206 | logger.info(f"Results will be saved in {output_dir} with prefix {fileprefix}") 207 | # WHY ARE THE CORRELATIONS NEGATIVE? YEST WITH VALIDATIAON DATA 208 | dataset = XLNetRegressionDataset( 209 | tokenizer=tokenizer, data_path=data_args.eval_data_file 210 | ) 211 | model = model.to(device) 212 | collator = Collator(tokenizer=tokenizer) 213 | logger.info(f"Evaluation dataset size: {len(dataset)}.") 214 | num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 215 | total_params = sum(p.numel() for p in model.parameters()) 216 | logger.info( 217 | f"{total_params} parameters, {num_params} trainable. Model: {type(model)}" 218 | ) 219 | 220 | loader = DataLoader( 221 | dataset, 222 | batch_size=data_args.batch_size, 223 | drop_last=False, 224 | shuffle=False, 225 | collate_fn=collator, 226 | ) 227 | 228 | eval_seqs = [dataset.examples[i].split("|")[-1] for i in range(len(dataset))] 229 | 230 | model.eval() 231 | labels, predictions = [], [] 232 | with torch.no_grad(): 233 | for idx, (labs, inputs) in enumerate(loader): 234 | output = model(**inputs, labels=labs) 235 | prediction = output.logits.cpu().detach().squeeze().numpy() 236 | 237 | labels.extend(list(labs.cpu().detach().numpy())) 238 | predictions.extend(list(prediction)) 239 | 240 | rmse = np.sqrt(mean_squared_error(predictions, labels)) 241 | pearson = pearsonr(predictions, labels)[0] 242 | spearman = spearmanr(predictions, labels)[0] 243 | 244 | logger.info( 245 | f"Eval: RMSE:{rmse:.5f}, pearson:{pearson:.5f}, spearman:{spearman:.5f}" 246 | ) 247 | 248 | with open(os.path.join(output_dir, f"{fileprefix}_results.json"), "w") as f: 249 | json.dump( 250 | {"RMSE": str(rmse), "Pearson": str(pearson), "Spearman": str(spearman)}, 251 | f, 252 | indent=4, 253 | ) 254 | pd.DataFrame( 255 | { 256 | "sequence": eval_seqs, 257 | "predictions": list(predictions), 258 | "labels": list(labels), 259 | } 260 | ).to_csv(os.path.join(output_dir, f"{fileprefix}_predictions.csv")) 261 | 262 | 263 | if __name__ == "__main__": 264 | main() 265 | -------------------------------------------------------------------------------- /scripts/generate_example_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generate example data starting from a .smi file. 3 | 4 | We use QED of molecules as an example. 
5 | """ 6 | 7 | import argparse 8 | import os 9 | 10 | from rdkit import Chem 11 | from rdkit.Chem import QED 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("input_filepath", type=str, help="path to the .smi file.") 15 | parser.add_argument("output_filepath", type=str, help="output where to store the data.") 16 | 17 | 18 | def main() -> None: 19 | """Generate example data.""" 20 | args = parser.parse_args() 21 | input_filepath = args.input_filepath 22 | output_filepath = args.output_filepath 23 | 24 | with open(input_filepath, "rt") as fpr: 25 | with open(output_filepath, "wt") as fpw: 26 | smiles_generator = (line.strip().split("\t")[0] for line in fpr) 27 | for smiles in smiles_generator: 28 | try: 29 | fpw.write( 30 | f"{QED.qed(Chem.MolFromSmiles(smiles)):.4}|{smiles}{os.linesep}" 31 | ) 32 | except Exception: 33 | print(f"Problem processing SMILES={smiles}") 34 | 35 | 36 | if __name__ == "__main__": 37 | main() 38 | -------------------------------------------------------------------------------- /scripts/run_language_modeling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Language modeling adapted from Huggingface transformers. 4 | 5 | The file is an adaptation of https://github.com/huggingface/transformers/blob/v3.1.0/examples/language-modeling/run_language_modeling.py 6 | 7 | """ 8 | 9 | import json 10 | import logging 11 | import math 12 | import os 13 | import warnings 14 | from dataclasses import dataclass, field 15 | from typing import Optional 16 | 17 | import pandas as pd 18 | import torch 19 | import transformers 20 | from transformers import ( 21 | CONFIG_MAPPING, 22 | MODEL_WITH_LM_HEAD_MAPPING, 23 | AutoConfig, 24 | AutoModelWithLMHead, 25 | DataCollatorForLanguageModeling, 26 | DataCollatorForPermutationLanguageModeling, 27 | HfArgumentParser, 28 | LineByLineTextDataset, 29 | PreTrainedTokenizer, 30 | TextDataset, 31 | XLNetLMHeadModel, 32 | set_seed, 33 | ) 34 | 35 | from terminator.args import CustomTrainingArguments, ModelArguments 36 | from terminator.collators import TRAIN_COLLATORS 37 | from terminator.datasets import get_dataset 38 | from terminator.tokenization import ExpressionBertTokenizer 39 | from terminator.trainer import CustomTrainer, get_trainer_dict 40 | from terminator.utils import get_latest_checkpoint 41 | 42 | transformers.logging.set_verbosity_info() 43 | logger = logging.getLogger(__name__) 44 | # logger.setLevel(level=logging.DEBUG) 45 | 46 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 47 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 48 | 49 | 50 | @dataclass 51 | class DataTrainingArguments: 52 | """ 53 | Arguments pertaining to what data we are going to input our model for training and eval. 54 | """ 55 | 56 | train_data_file: Optional[str] = field( 57 | default=None, metadata={"help": "The input training data file (a text file)."} 58 | ) 59 | eval_data_file: Optional[str] = field( 60 | default=None, 61 | metadata={ 62 | "help": "Input evaluation data file to evaluate the perplexity on (a text file)." 63 | }, 64 | ) 65 | line_by_line: bool = field( 66 | default=False, 67 | metadata={ 68 | "help": "Whether lines of text in the dataset are to be handled as distinct samples." 69 | }, 70 | ) 71 | plm_probability: float = field( 72 | default=1 / 6, 73 | metadata={ 74 | "help": "Ratio of length of a span of masked tokens to surrounding context length for PLM." 
75 | }, 76 | ) 77 | max_span_length: int = field( 78 | default=5, metadata={"help": "Max length of a span of masked tokens for PLM."} 79 | ) 80 | 81 | block_size: int = field( 82 | default=-1, 83 | metadata={"help": "Optional input sequence length after tokenization."}, 84 | ) 85 | overwrite_cache: bool = field( 86 | default=False, 87 | metadata={"help": "Overwrite the cached training and evaluation sets"}, 88 | ) 89 | 90 | 91 | def main(): 92 | # See all possible arguments in src/transformers/training_args.py 93 | # or by passing the --help flag to this script. 94 | # We now keep distinct sets of args, for a cleaner separation of concerns. 95 | 96 | # Switch off comet 97 | os.environ["COMET_MODE"] = "DISABLED" 98 | 99 | parser = HfArgumentParser( 100 | (ModelArguments, DataTrainingArguments, CustomTrainingArguments) 101 | ) 102 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 103 | 104 | if data_args.eval_data_file is None and training_args.do_eval: 105 | raise ValueError( 106 | "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " 107 | "or remove the --do_eval argument." 108 | ) 109 | 110 | if ( 111 | os.path.exists(training_args.output_dir) 112 | and os.listdir(training_args.output_dir) 113 | and training_args.do_train 114 | and not training_args.overwrite_output_dir 115 | ): 116 | raise ValueError( 117 | f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." 118 | ) 119 | os.makedirs(training_args.output_dir, exist_ok=True) 120 | 121 | # Setup logging 122 | logging.basicConfig( 123 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 124 | datefmt="%m/%d/%Y %H:%M:%S", 125 | level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, 126 | ) 127 | logger.warning( 128 | "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s", 129 | training_args.local_rank, 130 | training_args.device, 131 | training_args.n_gpu, 132 | bool(training_args.local_rank != -1), 133 | ) 134 | logger.info("Training/evaluation parameters %s", training_args) 135 | 136 | # Set seed 137 | set_seed(training_args.seed) 138 | # Load the training configuration file 139 | if training_args.training_config_path is not None: 140 | with open(training_args.training_config_path, "r") as f: 141 | train_config = json.load(f) 142 | 143 | # Store training config file in model directory 144 | with open( 145 | os.path.join(training_args.output_dir, "training_configs.json"), "w" 146 | ) as f: 147 | json.dump(train_config, f, indent="\t") 148 | else: 149 | train_config = {} 150 | 151 | if model_args.config_name: 152 | with open(model_args.config_name, "r") as f: 153 | model_params = json.load(f) 154 | 155 | config = AutoConfig.from_pretrained( 156 | model_args.config_name, 157 | cache_dir=model_args.cache_dir, 158 | mem_len=model_params.get("mem_len", 1024), 159 | ) 160 | 161 | elif model_args.model_name_or_path: 162 | if "checkpoint" not in model_args.model_name_or_path: 163 | model_args.model_name_or_path = get_latest_checkpoint( 164 | model_args.model_name_or_path, 165 | must_contain=train_config.get("checkpoint-str", "best"), 166 | ) 167 | 168 | config = AutoConfig.from_pretrained( 169 | model_args.model_name_or_path, 170 | cache_dir=model_args.cache_dir, 171 | ) 172 | model_params = config.__dict__ 173 | 174 | else: 175 | config = CONFIG_MAPPING[model_args.model_type]() 176 | model_params = config.__dict__ 177 | logger.warning("You are instantiating a 
new config instance from scratch.") 178 | 179 | if model_args.tokenizer_name: 180 | tokenizer = ExpressionBertTokenizer.from_pretrained( 181 | model_args.tokenizer_name, cache_dir=model_args.cache_dir 182 | ) 183 | 184 | elif model_args.model_name_or_path: 185 | tokenizer = ExpressionBertTokenizer.from_pretrained( 186 | model_args.model_name_or_path, cache_dir=model_args.cache_dir 187 | ) 188 | else: 189 | raise ValueError( 190 | "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it," 191 | "and load it from here, using --tokenizer_name" 192 | ) 193 | 194 | if model_args.model_name_or_path: 195 | 196 | # Restore checkpoint if available 197 | if "checkpoint" not in model_args.model_name_or_path: 198 | model_args.model_name_or_path = get_latest_checkpoint( 199 | model_args.model_name_or_path, 200 | must_contain=train_config.get("checkpoint-str", "best"), 201 | ) 202 | 203 | model = AutoModelWithLMHead.from_pretrained( 204 | model_args.model_name_or_path, 205 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 206 | config=config, 207 | cache_dir=model_args.cache_dir, 208 | ) 209 | logger.info("Model restored") 210 | 211 | # Get min loss so far 212 | try: 213 | loss_df = pd.read_csv( 214 | os.path.join(model_args.model_name_or_path, "training_log.csv"), 215 | index_col=0, 216 | ) 217 | model_params.update({"training_logs": list(loss_df.T.to_dict().values())}) 218 | logger.info("Restored training loss history.") 219 | except Exception: 220 | logger.warning( 221 | "Could not find loss history, might overwrite good checkpoints." 222 | ) 223 | 224 | else: 225 | logger.info("Training new model from scratch") 226 | model = AutoModelWithLMHead.from_config(config) 227 | 228 | logger.info(f"PyTorch version: {torch.__version__}") 229 | model.resize_token_embeddings(len(tokenizer)) 230 | 231 | if data_args.block_size <= 0: 232 | data_args.block_size = tokenizer.max_len 233 | # Our input block size will be the max possible for the model 234 | else: 235 | data_args.block_size = min(data_args.block_size, tokenizer.max_len) 236 | 237 | # Get datasets 238 | train_dataset = ( 239 | get_dataset( 240 | data_args.train_data_file, 241 | tokenizer=tokenizer, 242 | block_size=data_args.block_size, 243 | ) 244 | if training_args.do_train 245 | else None 246 | ) 247 | eval_dataset = ( 248 | get_dataset( 249 | data_args.eval_data_file, 250 | tokenizer=tokenizer, 251 | block_size=data_args.block_size, 252 | line_by_line=data_args.line_by_line, 253 | ) 254 | if training_args.do_eval 255 | else None 256 | ) 257 | if training_args.do_eval: 258 | logger.info(f"Dataset sizes {len(train_dataset)}, {len(eval_dataset)}.") 259 | num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 260 | logger.info(f"Number of parameters {num_params} of type {type(model)}") 261 | 262 | if config.model_type != "xlnet": 263 | warnings.warn(f"Full functionality only with XLNet; not {config.model_type}") 264 | 265 | # Set up the training strategy (PLM vs. 
alternating tasks) + loss function 266 | if train_config.get("alternate_tasks", False): 267 | logger.info("Training with alternate tasks") 268 | # The main collator is the one for property prediction 269 | data_collator = TRAIN_COLLATORS["property"]( 270 | tokenizer=tokenizer, 271 | property_tokens=train_config["property_tokens"], 272 | num_tokens_to_mask=train_config.get("num_tokens_to_mask", None), 273 | mask_token_order=train_config.get("mask_token_order", None), 274 | ) 275 | alternating_collator = TRAIN_COLLATORS[train_config["cg_collator"]]( 276 | tokenizer=tokenizer, **train_config["cg_collator_params"] 277 | ) 278 | 279 | else: 280 | if train_config["task"] == "proponly": 281 | data_collator = TRAIN_COLLATORS["property"]( 282 | tokenizer=tokenizer, 283 | property_tokens=train_config["property_tokens"], 284 | num_tokens_to_mask=train_config.get("num_tokens_to_mask", None), 285 | mask_token_order=train_config.get("mask_token_order", None), 286 | ) 287 | logger.warning("Training only on property predict") 288 | elif train_config["task"] == "gen_only": 289 | 290 | data_collator = TRAIN_COLLATORS[train_config["cg_collator"]]( 291 | tokenizer=tokenizer, **train_config["cg_collator_params"] 292 | ) 293 | logger.warning("Training ONLY on conditional generation") 294 | 295 | elif train_config["task"] == "plm": 296 | 297 | logger.info("Training with PLM") 298 | # Only vanilla PLM training 299 | data_collator = DataCollatorForPermutationLanguageModeling( 300 | tokenizer=tokenizer, 301 | plm_probability=data_args.plm_probability, 302 | max_span_length=data_args.max_span_length, 303 | ) 304 | alternating_collator = None 305 | 306 | custom_trainer_params = get_trainer_dict(model_params) 307 | 308 | # Initialize our Trainer 309 | trainer = CustomTrainer( 310 | model=model, 311 | args=training_args, 312 | data_collator=data_collator, 313 | train_dataset=train_dataset, 314 | eval_dataset=eval_dataset, 315 | tokenizer=tokenizer, 316 | prediction_loss_only=False, 317 | alternating_collator=alternating_collator, 318 | train_config=train_config, 319 | **custom_trainer_params, 320 | ) 321 | 322 | # Training 323 | if training_args.do_train: 324 | model_path = ( 325 | model_args.model_name_or_path 326 | if model_args.model_name_or_path is not None 327 | and os.path.isdir(model_args.model_name_or_path) 328 | else None 329 | ) 330 | trainer.train(model_path=model_path) 331 | trainer.save_model() 332 | # For convenience, we also re-save the tokenizer to the same directory, 333 | # so that you can share your model easily on huggingface.co/models =) 334 | if trainer.is_world_master(): 335 | tokenizer.save_pretrained(training_args.output_dir) 336 | 337 | # Evaluation 338 | results = {} 339 | if training_args.do_eval: 340 | logger.info("*** Evaluate ***") 341 | 342 | eval_output = trainer.evaluate() 343 | 344 | perplexity = math.exp(eval_output["eval_loss"]) 345 | result = {"perplexity": perplexity} 346 | 347 | output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt") 348 | if trainer.is_world_master(): 349 | with open(output_eval_file, "w") as writer: 350 | logger.info("***** Eval results *****") 351 | for key in sorted(result.keys()): 352 | logger.info(" %s = %s", key, str(result[key])) 353 | writer.write("%s = %s\n" % (key, str(result[key]))) 354 | 355 | results.update(result) 356 | 357 | return results 358 | 359 | 360 | if __name__ == "__main__": 361 | main() 362 | -------------------------------------------------------------------------------- /scripts/run_lm_nlp.py: 
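The training-strategy branches in `run_language_modeling.py` above (and the analogous block in `run_lm_nlp.py` that follows) are driven entirely by the JSON file passed via `--training_config_path`, with `task` taking values such as `proponly`, `gen_only` or `plm` when tasks are not alternated. As a rough orientation, the dictionary below lists the keys read by that logic with placeholder values; it is a hedged sketch, not a copy of any file in `training_configs/`.

```python
# Hypothetical training configuration for the alternating-task objective.
# Keys are those consumed by the scripts; values are placeholders for illustration.
train_config_sketch = {
    "alternate_tasks": True,        # alternate property prediction and conditional generation
    "property_tokens": ["<qed>"],   # property tokens handled by the property collator
    "num_tokens_to_mask": None,     # optional; collator defaults apply when None
    "mask_token_order": None,       # optional; collator defaults apply when None
    "cg_collator": "vanilla_cg",    # or "bimodal_cg", per the collator registry
    "cg_collator_params": {},       # keyword arguments forwarded to the chosen collator
    "checkpoint-str": "best",       # substring used when resolving the latest checkpoint
}
```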
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Language modeling adapted from Huggingface transformers. 4 | 5 | The file is an adaptation of https://github.com/huggingface/transformers/blob/v3.1.0/examples/language-modeling/run_language_modeling.py 6 | """ 7 | 8 | 9 | import json 10 | import logging 11 | import math 12 | import os 13 | import warnings 14 | from dataclasses import dataclass, field 15 | from typing import Optional 16 | 17 | import pandas as pd 18 | import torch 19 | import transformers 20 | from transformers import ( 21 | CONFIG_MAPPING, 22 | MODEL_WITH_LM_HEAD_MAPPING, 23 | AutoConfig, 24 | AutoModelWithLMHead, 25 | DataCollatorForLanguageModeling, 26 | DataCollatorForPermutationLanguageModeling, 27 | HfArgumentParser, 28 | LineByLineTextDataset, 29 | PreTrainedTokenizer, 30 | TextDataset, 31 | XLNetConfig, 32 | XLNetLMHeadModel, 33 | set_seed, 34 | ) 35 | 36 | from terminator.args import CustomTrainingArguments, ModelArguments 37 | from terminator.collators import TRAIN_COLLATORS 38 | from terminator.datasets import get_dataset 39 | from terminator.tokenization import PropertyTokenizerSquare, XLNetRTTokenizer 40 | from terminator.trainer import CustomTrainer, get_trainer_dict 41 | from terminator.utils import get_latest_checkpoint 42 | 43 | logger = logging.getLogger(__name__) 44 | 45 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 46 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 47 | 48 | 49 | @dataclass 50 | class DataTrainingArguments: 51 | """ 52 | Arguments pertaining to what data we are going to input our model for training and eval. 53 | """ 54 | 55 | train_data_file: Optional[str] = field( 56 | default=None, metadata={"help": "The input training data file (a text file)."} 57 | ) 58 | eval_data_file: Optional[str] = field( 59 | default=None, 60 | metadata={ 61 | "help": "An optional input evaluation data file to evaluate the perplexity on (a text file)." 62 | }, 63 | ) 64 | line_by_line: bool = field( 65 | default=False, 66 | metadata={ 67 | "help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences." 68 | }, 69 | ) 70 | 71 | mlm: bool = field( 72 | default=False, 73 | metadata={ 74 | "help": "Train with masked-language modeling loss instead of language modeling." 75 | }, 76 | ) 77 | mlm_probability: float = field( 78 | default=0.15, 79 | metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}, 80 | ) 81 | plm_probability: float = field( 82 | default=1 / 6, 83 | metadata={ 84 | "help": "Ratio of length of a span of masked tokens to surrounding context length for permutation language modeling." 85 | }, 86 | ) 87 | max_span_length: int = field( 88 | default=5, 89 | metadata={ 90 | "help": "Maximum length of a span of masked tokens for permutation language modeling." 91 | }, 92 | ) 93 | 94 | block_size: int = field( 95 | default=-1, 96 | metadata={ 97 | "help": "Optional input sequence length after tokenization." 98 | "The training dataset will be truncated in block of this size for training." 99 | "Default to the model max input length for single sentence inputs (take into account special tokens)." 
100 | }, 101 | ) 102 | overwrite_cache: bool = field( 103 | default=False, 104 | metadata={"help": "Overwrite the cached training and evaluation sets"}, 105 | ) 106 | 107 | 108 | def main(): 109 | # See all possible arguments in src/transformers/training_args.py 110 | # or by passing the --help flag to this script. 111 | # We now keep distinct sets of args, for a cleaner separation of concerns. 112 | 113 | # Switch off comet 114 | os.environ["COMET_MODE"] = "DISABLED" 115 | 116 | parser = HfArgumentParser( 117 | (ModelArguments, DataTrainingArguments, CustomTrainingArguments) 118 | ) 119 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 120 | 121 | if data_args.eval_data_file is None and training_args.do_eval: 122 | raise ValueError( 123 | "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " 124 | "or remove the --do_eval argument." 125 | ) 126 | 127 | if ( 128 | os.path.exists(training_args.output_dir) 129 | and os.listdir(training_args.output_dir) 130 | and training_args.do_train 131 | and not training_args.overwrite_output_dir 132 | ): 133 | raise ValueError( 134 | f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." 135 | ) 136 | 137 | # Setup logging 138 | logging.basicConfig( 139 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 140 | datefmt="%m/%d/%Y %H:%M:%S", 141 | level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, 142 | ) 143 | logger.warning( 144 | "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 145 | training_args.local_rank, 146 | training_args.device, 147 | training_args.n_gpu, 148 | bool(training_args.local_rank != -1), 149 | training_args.fp16, 150 | ) 151 | logger.info("Training/evaluation parameters %s", training_args) 152 | 153 | # Set seed 154 | set_seed(training_args.seed) 155 | 156 | configuration = XLNetConfig("xlnet-base-cased") 157 | config_dict = configuration.to_dict() 158 | 159 | model = XLNetLMHeadModel.from_pretrained( 160 | "xlnet-base-cased", 161 | cache_dir=model_args.cache_dir, 162 | mem_len=1024, 163 | return_dict=True, 164 | ) 165 | 166 | if not os.path.exists(training_args.output_dir): 167 | os.makedirs(training_args.output_dir) 168 | 169 | tokenizer = XLNetRTTokenizer.from_pretrained( 170 | model_args.tokenizer_name, cache_dir=model_args.cache_dir 171 | ) 172 | property_tokenizer = PropertyTokenizerSquare() 173 | tokenizer.set_property_tokenizer(property_tokenizer) 174 | tokenizer.set_vocab() 175 | # Otherwise the freshly added tokens are added as special tokens. 
176 | tokenizer.unique_no_split_tokens = tokenizer.unique_no_split_tokens[:9] 177 | 178 | if model_args.model_name_or_path: 179 | 180 | # Restore checkpoint if available 181 | if "checkpoint" not in model_args.model_name_or_path: 182 | ckpt_path = get_latest_checkpoint( 183 | model_args.model_name_or_path, must_contain="rmse" 184 | ) 185 | else: 186 | ckpt_path = model_args.model_name_or_path 187 | 188 | model = XLNetLMHeadModel.from_pretrained( 189 | ckpt_path, 190 | cache_dir=model_args.cache_dir, 191 | mem_len=1024, 192 | return_dict=True, 193 | ) 194 | logger.info(f"Model restored from {ckpt_path}") 195 | 196 | # Get min loss so far 197 | try: 198 | loss_df = pd.read_csv( 199 | os.path.join(ckpt_path, "training_log.csv"), 200 | index_col=0, 201 | ) 202 | configuration.update({"training_logs": list(loss_df.T.to_dict().values())}) 203 | logger.info("Restored training loss history.") 204 | except Exception: 205 | logger.warning( 206 | "Could not find loss history, might overwrite good checkpoints." 207 | ) 208 | 209 | logger.info(f"PyTorch version: {torch.__version__}") 210 | model.resize_token_embeddings(len(tokenizer)) 211 | 212 | if data_args.block_size <= 0: 213 | data_args.block_size = tokenizer.max_len 214 | # Our input block size will be the max possible for the model 215 | else: 216 | data_args.block_size = min(data_args.block_size, tokenizer.max_len) 217 | 218 | # Get datasets 219 | train_dataset = get_dataset( 220 | data_args.train_data_file, 221 | tokenizer=tokenizer, 222 | block_size=data_args.block_size, 223 | ) 224 | 225 | eval_dataset = get_dataset( 226 | data_args.eval_data_file, 227 | tokenizer=tokenizer, 228 | block_size=data_args.block_size, 229 | line_by_line=data_args.line_by_line, 230 | ) 231 | 232 | logger.info(f"Dataset sizes {len(train_dataset)}, {len(eval_dataset)}.") 233 | num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 234 | logger.info(f"Number of parameters {num_params} of type {type(model)}") 235 | 236 | # Set up the training strategy (PLM vs. 
alternating tasks) + loss function 237 | if training_args.training_config_path is not None: 238 | with open(training_args.training_config_path, "r") as f: 239 | train_config = json.load(f) 240 | 241 | # Store training config file in model directory 242 | with open( 243 | os.path.join(training_args.output_dir, "training_configs.json"), "w" 244 | ) as f: 245 | json.dump(train_config, f, indent="\t") 246 | else: 247 | train_config = {} 248 | 249 | if train_config.get("alternate_tasks", False): 250 | logger.info("Training with alternate tasks") 251 | # The main collator is the one for property prediction 252 | data_collator = TRAIN_COLLATORS["property"]( 253 | tokenizer=tokenizer, 254 | property_tokens=train_config["property_tokens"], 255 | num_tokens_to_mask=train_config.get("num_tokens_to_mask", None), 256 | mask_token_order=train_config.get("mask_token_order", None), 257 | ) 258 | alternating_collator = TRAIN_COLLATORS[train_config["cg_collator"]]( 259 | tokenizer=tokenizer, **train_config["cg_collator_params"] 260 | ) 261 | 262 | else: 263 | logger.info("Training with PLM") 264 | # Only vanilla PLM training 265 | data_collator = DataCollatorForPermutationLanguageModeling( 266 | tokenizer=tokenizer, 267 | plm_probability=data_args.plm_probability, 268 | max_span_length=data_args.max_span_length, 269 | ) 270 | alternating_collator = None 271 | 272 | custom_trainer_params = get_trainer_dict(config_dict) 273 | 274 | # Initialize our Trainer 275 | print("***DATA COLLATOR", data_collator) 276 | print("***ALTERNATING COLLATOR", alternating_collator) 277 | trainer = CustomTrainer( 278 | model=model, 279 | args=training_args, 280 | data_collator=data_collator, 281 | train_dataset=train_dataset, 282 | eval_dataset=eval_dataset, 283 | tokenizer=tokenizer, 284 | prediction_loss_only=False, 285 | alternating_collator=alternating_collator, 286 | train_config=train_config, 287 | **custom_trainer_params, 288 | ) 289 | 290 | # Training 291 | if training_args.do_train: 292 | model_path = ( 293 | model_args.model_name_or_path 294 | if model_args.model_name_or_path is not None 295 | and os.path.isdir(model_args.model_name_or_path) 296 | else None 297 | ) 298 | trainer.train(model_path=model_path) 299 | trainer.save_model() 300 | # For convenience, we also re-save the tokenizer to the same directory, 301 | # so that you can share your model easily on huggingface.co/models =) 302 | if trainer.is_world_master(): 303 | tokenizer.save_pretrained(training_args.output_dir) 304 | 305 | # Evaluation 306 | results = {} 307 | if training_args.do_eval: 308 | logger.info("*** Evaluate ***") 309 | 310 | eval_output = trainer.evaluate() 311 | 312 | perplexity = math.exp(eval_output["eval_loss"]) 313 | result = {"perplexity": perplexity} 314 | 315 | output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt") 316 | if trainer.is_world_master(): 317 | with open(output_eval_file, "w") as writer: 318 | logger.info("***** Eval results *****") 319 | for key in sorted(result.keys()): 320 | logger.info(" %s = %s", key, str(result[key])) 321 | writer.write("%s = %s\n" % (key, str(result[key]))) 322 | 323 | results.update(result) 324 | 325 | return results 326 | 327 | 328 | if __name__ == "__main__": 329 | main() 330 | -------------------------------------------------------------------------------- /scripts/run_regressionhead.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Language modeling adapted from Huggingface transformers. 
4 | """ 5 | import json 6 | import logging 7 | import os 8 | from dataclasses import dataclass, field 9 | from typing import Dict, List, Optional, Tuple, Union 10 | 11 | import numpy as np 12 | import pandas as pd 13 | import torch 14 | import torch.nn as nn 15 | import transformers 16 | from scipy.stats import pearsonr, spearmanr 17 | from selfies import decoder, encoder 18 | from sklearn.metrics import mean_squared_error 19 | from torch.optim import AdamW 20 | from torch.utils.data import DataLoader, Dataset 21 | from tqdm import tqdm 22 | from transformers import ( 23 | CONFIG_MAPPING, 24 | MODEL_WITH_LM_HEAD_MAPPING, 25 | AutoConfig, 26 | AutoModelWithLMHead, 27 | DataCollatorForLanguageModeling, 28 | DataCollatorForPermutationLanguageModeling, 29 | HfArgumentParser, 30 | LineByLineTextDataset, 31 | PreTrainedTokenizer, 32 | TextDataset, 33 | XLNetConfig, 34 | XLNetForSequenceClassification, 35 | XLNetLMHeadModel, 36 | get_linear_schedule_with_warmup, 37 | set_seed, 38 | ) 39 | from transformers.tokenization_utils_base import BatchEncoding 40 | 41 | from terminator.args import CustomTrainingArguments, ModelArguments 42 | from terminator.collators import TRAIN_COLLATORS 43 | from terminator.datasets import get_dataset 44 | from terminator.tokenization import ExpressionBertTokenizer 45 | from terminator.trainer import CustomTrainer, get_trainer_dict 46 | from terminator.utils import get_latest_checkpoint 47 | 48 | transformers.logging.set_verbosity_info() 49 | logger = logging.getLogger(__name__) 50 | # logger.setLevel(level=logging.DEBUG) 51 | 52 | 53 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 54 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 55 | 56 | 57 | @dataclass 58 | class DataTrainingArguments: 59 | """ 60 | Arguments pertaining to what data we are going to input our model for training and eval. 61 | """ 62 | 63 | train_data_file: Optional[str] = field( 64 | default=None, metadata={"help": "The input training data file (a text file)."} 65 | ) 66 | eval_data_file: Optional[str] = field( 67 | default=None, 68 | metadata={ 69 | "help": "An optional input evaluation data file to evaluate the perplexity on (a text file)." 70 | }, 71 | ) 72 | line_by_line: bool = field( 73 | default=False, 74 | metadata={ 75 | "help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences." 76 | }, 77 | ) 78 | 79 | block_size: int = field( 80 | default=-1, 81 | metadata={ 82 | "help": "Optional input sequence length after tokenization." 83 | "The training dataset will be truncated in block of this size for training." 84 | "Default to the model max input length for single sentence inputs (take into account special tokens)." 
85 | }, 86 | ) 87 | overwrite_cache: bool = field( 88 | default=False, 89 | metadata={"help": "Overwrite the cached training and evaluation sets"}, 90 | ) 91 | batch_size: Optional[int] = field(default=16, metadata={"help": "Batch size"}) 92 | 93 | 94 | class XLNetRegressionDataset(Dataset): 95 | def __init__(self, tokenizer, data_path): 96 | 97 | self.tokenizer = tokenizer 98 | 99 | # Lazy data loading 100 | with open(data_path, "r") as f: 101 | self.examples = [line.strip() for line in f.readlines()] 102 | 103 | def __len__(self): 104 | return len(self.examples) 105 | 106 | def __getitem__(self, i): 107 | prop, molecules = self.examples[i].split("|") 108 | label = float(prop.split(">")[-1]) 109 | model_input = self.tokenizer(molecules) 110 | return model_input, label 111 | 112 | 113 | @dataclass 114 | class Collator(DataCollatorForPermutationLanguageModeling): 115 | def finalize(self, batch: torch.Tensor, val: int = 0) -> torch.Tensor: 116 | """Sequence length has to be even for PLM collator, see: 117 | https://github.com/huggingface/transformers/issues/7341 118 | 119 | Args: 120 | batch (torch.Tensor): 2D Tensor (batch_size x seq_len) 121 | val (float): Value to fill with. 122 | 123 | Returns: 124 | torch.Tensor: 2D Tensor (batch_size x seq_len) 125 | """ 126 | if batch.size(1) % 2 != 0: 127 | return torch.cat([batch, torch.ones(batch.size(0), 1).long() * val], axis=1) 128 | return batch.long() 129 | 130 | def attention_mask(self, batch: torch.Tensor, dropout: float = 0.0) -> torch.Tensor: 131 | attention_mask = (~(batch == 0)).to(float) 132 | return attention_mask 133 | 134 | def __call__( 135 | self, examples: List[Tuple[Dict[str, List[int]], float]] 136 | ) -> Dict[str, torch.Tensor]: 137 | device = "cuda" if torch.cuda.is_available() else "cpu" 138 | model_inputs = [e[0]["input_ids"] for e in examples] 139 | inputs = self._tensorize_batch(model_inputs) 140 | inputs = self.finalize(inputs) 141 | 142 | attention_mask = self.attention_mask(inputs) 143 | 144 | labels = torch.Tensor([e[-1] for e in examples]) 145 | return labels.to(device), { 146 | "input_ids": inputs.to(device), 147 | "attention_mask": attention_mask.to(device), 148 | } 149 | 150 | 151 | def main(): 152 | 153 | # Switch off comet 154 | os.environ["COMET_MODE"] = "DISABLED" 155 | 156 | parser = HfArgumentParser( 157 | (ModelArguments, DataTrainingArguments, CustomTrainingArguments) 158 | ) 159 | model_args, data_args, train_args = parser.parse_args_into_dataclasses() 160 | 161 | if ( 162 | os.path.exists(train_args.output_dir) 163 | and os.listdir(train_args.output_dir) 164 | and train_args.do_train 165 | and not train_args.overwrite_output_dir 166 | ): 167 | raise ValueError( 168 | f"Output directory ({train_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." 
169 | ) 170 | 171 | # Setup logging 172 | logging.basicConfig( 173 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 174 | datefmt="%m/%d/%Y %H:%M:%S", 175 | level=logging.INFO if train_args.local_rank in [-1, 0] else logging.WARN, 176 | ) 177 | logger.warning( 178 | "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 179 | train_args.local_rank, 180 | train_args.device, 181 | train_args.n_gpu, 182 | bool(train_args.local_rank != -1), 183 | train_args.fp16, 184 | ) 185 | logger.info("Training/evaluation parameters %s", train_args) 186 | 187 | # Set seed 188 | set_seed(train_args.seed) 189 | 190 | # Load pretrained model and tokenizer 191 | # 192 | # Distributed training: 193 | # The .from_pretrained methods guarantee that only one local process can concurrently 194 | # download model & vocab. 195 | 196 | model = XLNetForSequenceClassification.from_pretrained( 197 | "xlnet-base-cased", 198 | cache_dir=model_args.cache_dir, 199 | mem_len=1024, 200 | return_dict=True, 201 | ) 202 | # Do Regression 203 | model.num_labels = 1 204 | model.logits_proj = nn.Linear(768, 1) 205 | 206 | if not os.path.exists(train_args.output_dir): 207 | os.makedirs(train_args.output_dir) 208 | print(model_args.tokenizer_name) 209 | tokenizer = ExpressionBertTokenizer.from_pretrained(model_args.tokenizer_name) 210 | 211 | logger.info(f"PyTorch version: {torch.__version__}") 212 | model.resize_token_embeddings(len(tokenizer)) 213 | 214 | if data_args.block_size <= 0: 215 | data_args.block_size = tokenizer.max_len 216 | # Our input block size will be the max possible for the model 217 | else: 218 | data_args.block_size = min(data_args.block_size, tokenizer.max_len) 219 | 220 | # Get datasets 221 | device = "cuda" if torch.cuda.is_available() else "cpu" 222 | train_dataset = XLNetRegressionDataset( 223 | tokenizer=tokenizer, data_path=data_args.train_data_file 224 | ) 225 | eval_dataset = XLNetRegressionDataset( 226 | tokenizer=tokenizer, data_path=data_args.eval_data_file 227 | ) 228 | model = model.to(device) 229 | collator = Collator(tokenizer=tokenizer) 230 | logger.info(f"Dataset sizes {len(train_dataset)}, {len(eval_dataset)}.") 231 | num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 232 | total_params = sum(p.numel() for p in model.parameters()) 233 | logger.info( 234 | f"{total_params} parameters, {num_params} trainable. 
Model: {type(model)}" 235 | ) 236 | 237 | train_loader = DataLoader( 238 | train_dataset, 239 | batch_size=data_args.batch_size, 240 | drop_last=True, 241 | shuffle=True, 242 | collate_fn=collator, 243 | ) 244 | eval_loader = DataLoader( 245 | eval_dataset, 246 | batch_size=data_args.batch_size, 247 | drop_last=False, 248 | shuffle=False, 249 | collate_fn=collator, 250 | ) 251 | lr = train_args.learning_rate 252 | 253 | # Set up the optimizer 254 | no_decay = ["bias", "LayerNorm.weight"] 255 | optimizer_grouped_parameters = [ 256 | { 257 | "params": [ 258 | p 259 | for n, p in model.named_parameters() 260 | if not any(nd in n for nd in no_decay) 261 | ], 262 | "weight_decay": 0, 263 | }, 264 | { 265 | "params": [ 266 | p 267 | for n, p in model.named_parameters() 268 | if any(nd in n for nd in no_decay) 269 | ], 270 | "weight_decay": 0.0, 271 | }, 272 | ] 273 | optimizer = AdamW(optimizer_grouped_parameters, lr=lr, betas=(0.9, 0.999), eps=1e-7) 274 | total_steps = len(train_loader) * train_args.num_train_epochs 275 | scheduler = get_linear_schedule_with_warmup( 276 | optimizer, num_warmup_steps=0, num_training_steps=total_steps 277 | ) 278 | best_perf = [100, 0, 0] 279 | epochs = int(train_args.num_train_epochs) 280 | logger.info(f"Batch size: {data_args.batch_size}, LR={lr}") 281 | 282 | output_dir = train_args.output_dir 283 | eval_seqs = [ 284 | eval_dataset.examples[i].split("|")[-1] for i in range(len(eval_dataset)) 285 | ] 286 | 287 | for epoch in range(epochs): 288 | logger.info(f"Starting epoch {epoch}/{epochs}.") 289 | 290 | model.train() 291 | for idx, (labels, inputs) in tqdm( 292 | enumerate(train_loader), total=len(train_loader) 293 | ): 294 | output = model(**inputs, labels=labels) 295 | loss = output.loss 296 | optimizer.zero_grad() 297 | loss.backward() 298 | optimizer.step() 299 | scheduler.step() 300 | 301 | model.eval() 302 | labels, predictions = [], [] 303 | with torch.no_grad(): 304 | for idx, (labs, inputs) in enumerate(eval_loader): 305 | output = model(**inputs, labels=labs) 306 | prediction = output.logits.cpu().detach().squeeze().numpy() 307 | 308 | labels.extend(list(labs.cpu().detach().numpy())) 309 | predictions.extend(list(prediction)) 310 | 311 | rmse = np.sqrt(mean_squared_error(predictions, labels)) 312 | pearson = pearsonr(predictions, labels)[0] 313 | spearman = spearmanr(predictions, labels)[0] 314 | 315 | logger.info( 316 | f"Epoch {epoch}: RMSE:{rmse:.8f}, pearson:{pearson:.3f}, spearman:{spearman:.3f}" 317 | ) 318 | if pearson > best_perf[1]: 319 | best_perf[1] = pearson 320 | logger.info(f"New best Pearson: {pearson}") 321 | with open( 322 | os.path.join(output_dir, "best_eval_perf_pearson.json"), "w" 323 | ) as f: 324 | json.dump( 325 | { 326 | "RMSE": str(best_perf[0]), 327 | "Pearson": str(best_perf[1]), 328 | "Spearman": str(best_perf[2]), 329 | "Epoch": str(epoch), 330 | }, 331 | f, 332 | indent=4, 333 | ) 334 | pd.DataFrame( 335 | { 336 | "sequence": eval_seqs, 337 | "predictions": list(predictions), 338 | "labels": list(labels), 339 | } 340 | ).to_csv(os.path.join(output_dir, "best_pearson_preds.csv")) 341 | if rmse < best_perf[0]: 342 | best_perf[0] = rmse 343 | logger.info(f"New best RMSE: {rmse}") 344 | torch.save(model.state_dict(), os.path.join(output_dir, "rmse.bin")) 345 | with open(os.path.join(output_dir, "best_eval_perf_rmse.json"), "w") as f: 346 | json.dump( 347 | { 348 | "RMSE": str(best_perf[0]), 349 | "Pearson": str(best_perf[1]), 350 | "Spearman": str(best_perf[2]), 351 | "Epoch": epoch, 352 | }, 353 | f, 354 | indent=4, 355 | 
) 356 | pd.DataFrame( 357 | { 358 | "sequence": eval_seqs, 359 | "predictions": list(predictions), 360 | "labels": list(labels), 361 | } 362 | ).to_csv(os.path.join(output_dir, "best_rmse_preds.csv")) 363 | if spearman > best_perf[2]: 364 | best_perf[2] = spearman 365 | logger.info(f"New best Spearman: {spearman}") 366 | torch.save(model.state_dict(), os.path.join(output_dir, "spearman.bin")) 367 | with open( 368 | os.path.join(output_dir, "best_eval_perf_spearman.json"), "w" 369 | ) as f: 370 | json.dump( 371 | { 372 | "RMSE": str(best_perf[0]), 373 | "Pearson": str(best_perf[1]), 374 | "Spearman": str(best_perf[2]), 375 | "Epoch": epoch, 376 | }, 377 | f, 378 | indent=4, 379 | ) 380 | pd.DataFrame( 381 | { 382 | "sequence": eval_seqs, 383 | "predictions": list(predictions), 384 | "labels": list(labels), 385 | } 386 | ).to_csv(os.path.join(output_dir, "best_spearman_preds.csv")) 387 | 388 | 389 | if __name__ == "__main__": 390 | main() 391 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 80 3 | select = C,E,F,W,B,B950 4 | ignore = E203, E501, W503 5 | 6 | [mypy] 7 | check_untyped_defs = True 8 | 9 | [mypy-pytest.*] 10 | ignore_missing_imports = True 11 | 12 | [mypy-rdkit.*] 13 | ignore_missing_imports = True 14 | 15 | [mypy-setuptools.*] 16 | ignore_missing_imports = True 17 | 18 | [mypy-transformers.*] 19 | ignore_missing_imports = True 20 | 21 | [mypy-numpy.*] 22 | ignore_missing_imports = True 23 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """Package setup.""" 2 | import io 3 | import re 4 | 5 | from setuptools import find_packages, setup 6 | 7 | match = re.search( 8 | r'__version__\s*=\s*[\'"]([^\'"]*)[\'"]', 9 | io.open("terminator/__init__.py", encoding="utf_8_sig").read(), 10 | ) 11 | if match is None: 12 | raise SystemExit("Version number not found.") 13 | __version__ = match.group(1) 14 | 15 | setup( 16 | name="terminator", 17 | version=__version__, 18 | author="IBM Resarch team", 19 | author_email=["jannis.born@gmx.de, drugilsberg@gmail.com"], 20 | packages=find_packages(), 21 | long_description=open("README.md").read(), 22 | long_description_content_type="text/markdown", 23 | package_data={"terminator": ["py.typed"]}, 24 | install_requires=["transformers", "numpy", "tqdm", "selfies==1.0.4", "modlamp"], 25 | ) 26 | -------------------------------------------------------------------------------- /terminator/__init__.py: -------------------------------------------------------------------------------- 1 | """Utiltities for transformer-based conditional molecule generation.""" 2 | __version__ = "0.0.1" 3 | __name__ = "terminator" 4 | -------------------------------------------------------------------------------- /terminator/args.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | from transformers import MODEL_WITH_LM_HEAD_MAPPING 5 | from transformers.training_args import TrainingArguments 6 | 7 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 8 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 9 | 10 | 11 | @dataclass 12 | class CustomTrainingArguments(TrainingArguments): 13 | """ 14 | NOTE: Expanding TrainingArguments class from 
transformers with custom arguments. 15 | 16 | eval_accumulation_steps (:obj:`int`, `optional`): 17 | Number of predictions steps to accumulate the output tensors for, before moving the results to the CPU. If 18 | left unset, the whole predictions are accumulated on GPU/TPU before being moved to the CPU (faster but 19 | requires more memory). 20 | """ 21 | 22 | # Was introduced only in transformers 3.4.0 23 | eval_accumulation_steps: Optional[int] = field( 24 | default=None, 25 | metadata={ 26 | "help": "Number of predictions steps to accumulate before moving the tensors to the CPU." 27 | }, 28 | ) 29 | training_config_path: Optional[str] = field( 30 | default=None, 31 | metadata={ 32 | "help": """ 33 | Path to a file specifying the training objective hyperparameter. 34 | 35 | Defaults to None, meaning the vanilla PLM objective is used. 36 | 37 | 38 | Optional keys include: 39 | - 'alternate_tasks' (bool): Whether the model is trained specifically on 40 | property prediction and conditional generation task or not. 41 | NOTE: If False, then all other keys are ignored and we fall back to the 42 | PLM objective (identical to not providing a path). Default: False. 43 | - 'cc_loss' (bool): Whether the model is trained with the cycle-consistency 44 | loss in the CG task or with a regular BCE between logits of generated 45 | tokens and the real molecule. Default: False. 46 | - 'cg_collator' (str): Name of collator to use for conditional generation. 47 | Should be either `vanilla_cg` or `bimodal_cg`. 48 | - 'generation_token' (str): Token which should be masked for CC loss. Only 49 | required if cc_loss is True. 50 | 51 | - 'cg_collator_params' (dict): Parameters to pass to the collator. Keys e.g. 52 | 'do_sample' (bool): Whether property is sampled. 53 | 'property_value_ranges' (Iterable[float]): 54 | 'property_value_thresholds' (Iterable[float]): 55 | 'prob_near_sampling' (float): Probability of sampling nearby values. 56 | """ 57 | }, 58 | ) 59 | 60 | 61 | @dataclass 62 | class ModelArguments: 63 | """ 64 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 65 | """ 66 | 67 | model_name_or_path: Optional[str] = field( 68 | default=None, 69 | metadata={ 70 | "help": "The model checkpoint for weights initialization. Leave None if you want to train a model from scratch." 71 | }, 72 | ) 73 | model_type: Optional[str] = field( 74 | default=None, 75 | metadata={ 76 | "help": "If training from scratch, pass a model type from the list: " 77 | + ", ".join(MODEL_TYPES) 78 | }, 79 | ) 80 | config_name: Optional[str] = field( 81 | default=None, 82 | metadata={ 83 | "help": "Pretrained config name or path if not the same as model_name" 84 | }, 85 | ) 86 | tokenizer_name: Optional[str] = field( 87 | default=None, 88 | metadata={ 89 | "help": "Pretrained tokenizer name or path if not the same as model_name" 90 | }, 91 | ) 92 | cache_dir: Optional[str] = field( 93 | default=None, 94 | metadata={ 95 | "help": "Where do you want to store the pretrained models downloaded from s3" 96 | }, 97 | ) 98 | 99 | 100 | @dataclass 101 | class EvalArguments: 102 | """ 103 | Argumnts for model evaluation. 104 | 105 | eval_accumulation_steps (:obj:`int`, `optional`): 106 | Number of predictions steps to accumulate the output tensors for, before moving the results to the CPU. If 107 | left unset, the whole predictions are accumulated on GPU/TPU before being moved to the CPU (faster but 108 | requires more memory). 
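# Hedged illustrative aside: the `training_config_path` help text above lists the
# recognised keys. A sketch of what such a JSON config might contain; the key
# names come from that docstring, the values are made-up placeholders (the files
# shipped under training_configs/ may differ):
EXAMPLE_TRAINING_CONFIG = {
    "alternate_tasks": True,       # alternate property prediction and generation
    "cc_loss": True,               # use the cycle-consistency loss for generation
    "cg_collator": "vanilla_cg",   # or "bimodal_cg"
    "generation_token": "[MASK]",  # placeholder; only needed when cc_loss is True
    "cg_collator_params": {
        "do_sample": False,
        "property_value_ranges": [0.0, 1.0],
        "prob_near_sampling": 0.5,
    },
}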
109 | """ 110 | 111 | eval_file: str = field(metadata={"help": "Path to the data used for evaluation"}) 112 | param_path: str = field( 113 | metadata={"help": "Path to the .json file with evaluation parameter"} 114 | ) 115 | -------------------------------------------------------------------------------- /terminator/collator_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | import transformers 5 | 6 | 7 | def get_mask( 8 | labels: torch.Tensor, 9 | max_span_length: int, 10 | plm_probability: float, 11 | mask_start_idxs: Optional[torch.Tensor] = None, 12 | mask_end_idxs: Optional[torch.Tensor] = None, 13 | ) -> (torch.Tensor, torch.Tensor): 14 | """Receives a tensor of labels and computes the masked_indices and the target 15 | mapping. 16 | 17 | Args: 18 | labels (torch.Tensor): Input tensor (2D) 19 | max_span_length (int): Maximal length for the span of masked tokens 20 | plm_probability (float): Probability for each token to be masked. 21 | mask_start_idxs (torch.Tensor, Optional): Tensor of length labels with indices 22 | for first possible token to mask. 23 | mask_end_idxs (torch.Tensor, Optional): Tensor of length labels with indices 24 | for last possible token to mask. 25 | 26 | Returns: 27 | masked_indices: 2D Tensor of masked indices. 28 | target_mapping: 3D tensor of diagonal matrices for each sample. 29 | """ 30 | 31 | # Creating the mask and target_mapping tensors 32 | masked_indices = torch.full(labels.shape, 0, dtype=torch.bool) 33 | target_mapping = torch.zeros( 34 | (labels.size(0), labels.size(1), labels.size(1)), dtype=torch.float32 35 | ) 36 | # If on-/offset for masking are not provided we can mask from start to end 37 | if mask_start_idxs is None: 38 | mask_start_idxs = [0] * labels.size(0) 39 | if mask_end_idxs is None: 40 | mask_end_idxs = [1 * labels.size(1)] * labels.size(0) 41 | 42 | for i in range(labels.size(0)): 43 | # Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far). 44 | cur_len = mask_start_idxs[i] 45 | max_len = mask_end_idxs[i] 46 | 47 | # If the masking range is just a single token, we always mask it 48 | if cur_len == max_len: 49 | masked_indices[i, cur_len] = 1 50 | 51 | while cur_len < max_len: 52 | # Sample (length of span of tokens to be masked), take the minimum to avoid 53 | # that the span length is longer than the molecule length 54 | span_length = min( 55 | torch.randint(1, max_span_length + 1, (1,)).item(), max_len - cur_len 56 | ) 57 | # Reserve a context of length `context_length = span_length / plm_probability` to surround the span to be masked 58 | context_length = int(span_length / plm_probability) 59 | # Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length` 60 | # the min is needed to avoid that the span extends over max_len 61 | # the max is needed to avoid that the span starts before cur_len 62 | start_index = max( 63 | min( 64 | cur_len 65 | + torch.randint(context_length - span_length + 1, (1,)).item(), 66 | max_len - span_length, 67 | ), 68 | cur_len, 69 | ) 70 | masked_indices[i, start_index : start_index + span_length] = 1 71 | # Set `cur_len = cur_len + context_length` 72 | cur_len += context_length 73 | 74 | # Since we're replacing non-masked tokens with -100 in the labels tensor instead of skipping them altogether, 75 | # the i-th predict corresponds to the i-th token. 
76 | target_mapping[i] = torch.eye(labels.size(1)) 77 | 78 | return masked_indices, target_mapping 79 | 80 | 81 | def get_permutation_order( 82 | labels: torch.Tensor, 83 | masked_indices: torch.Tensor, 84 | non_func_mask: torch.Tensor, 85 | device: str = "cpu", 86 | ) -> torch.Tensor: 87 | 88 | perm_mask = torch.zeros( 89 | (labels.size(0), labels.size(1), labels.size(1)), 90 | dtype=torch.float32, 91 | device=device, 92 | ) 93 | 94 | for i in range(labels.size(0)): 95 | # Generate permutation indices i.e. sample a random factorisation order for the sequence. This will 96 | # determine which tokens a given token can attend to (encoded in `perm_mask`). 97 | # Note: Length of token sequence being permuted has to be less than or equal to reused sequence length 98 | # (see documentation for `mems`), otherwise information may leak through due to reuse. In this implementation, 99 | # we assume that reused length is half of sequence length and permutation length is equal to reused length. 100 | # This requires that the sequence length be even. 101 | 102 | # Create a linear factorisation order 103 | perm_index = torch.arange(labels.size(1), device=device) 104 | # Split this into two halves, assuming that half the sequence is reused each time 105 | perm_index = perm_index.reshape((-1, labels.size(1) // 2)).transpose(0, 1) 106 | # Permute the two halves such that they do not cross over 107 | perm_index = perm_index[torch.randperm(labels.size(1) // 2)] 108 | # Flatten this out into the desired permuted factorisation order 109 | perm_index = torch.flatten(perm_index.transpose(0, 1)) 110 | # Set the permutation indices of non-masked (non-functional) tokens to the 111 | # smallest index (-1) so that: 112 | # (1) They can be seen by all other positions 113 | # (2) They cannot see masked positions, so there won't be information leak 114 | perm_index.masked_fill_(~masked_indices[i] & non_func_mask[i], -1) 115 | # The logic for whether the i-th token can attend on the j-th token based on the factorisation order: 116 | # 0 (can attend): If perm_index[i] > perm_index[j] or j is neither masked nor a functional token 117 | # 1 (cannot attend): If perm_index[i] <= perm_index[j] and j is either masked or a functional token 118 | perm_mask[i] = ( 119 | perm_index.reshape((labels.size(1), 1)) 120 | <= perm_index.reshape((1, labels.size(1))) 121 | ) & masked_indices[i] 122 | 123 | return perm_mask 124 | -------------------------------------------------------------------------------- /terminator/datasets.py: -------------------------------------------------------------------------------- 1 | from transformers import LineByLineTextDataset, PreTrainedTokenizer, TextDataset 2 | 3 | 4 | def get_dataset( 5 | filepath: str, 6 | tokenizer: PreTrainedTokenizer, 7 | block_size: int, 8 | line_by_line: bool = True, 9 | ): 10 | if line_by_line: 11 | return LineByLineTextDataset( 12 | tokenizer=tokenizer, file_path=filepath, block_size=block_size 13 | ) 14 | else: 15 | return TextDataset( 16 | tokenizer=tokenizer, 17 | file_path=filepath, 18 | block_size=block_size, 19 | ) 20 | -------------------------------------------------------------------------------- /terminator/factories.py: -------------------------------------------------------------------------------- 1 | from .numerical_encodings import FloatEncoding, IntEncoding 2 | 3 | NUM_ENCODING_FACTORY = {"float": FloatEncoding, "int": IntEncoding} 4 | 5 | MODEL_TO_EMBEDDING_FN = { 6 | "albert": "model.albert.embeddings", 7 | "xlnet": "self.model.transformer.word_embedding", 8 
| } 9 | -------------------------------------------------------------------------------- /terminator/functional_groups.py: -------------------------------------------------------------------------------- 1 | # 2 | # Original authors: Richard Hall and Guillaume Godin 3 | # This file is part of the RDKit. 4 | # The contents are covered by the terms of the BSD license 5 | # which is included in the file license.txt, found at the root 6 | # of the RDKit source tree. 7 | 8 | from collections import namedtuple 9 | 10 | # 11 | # 12 | # Richard hall 2017 13 | # IFG main code 14 | # Guillaume Godin 2017 15 | # refine output function 16 | # astex_ifg: identify functional groups a la Ertl, J. Cheminform (2017) 9:36 17 | from rdkit import Chem 18 | 19 | 20 | def merge(mol, marked, aset): 21 | bset = set() 22 | for idx in aset: 23 | atom = mol.GetAtomWithIdx(idx) 24 | for nbr in atom.GetNeighbors(): 25 | jdx = nbr.GetIdx() 26 | if jdx in marked: 27 | marked.remove(jdx) 28 | bset.add(jdx) 29 | if not bset: 30 | return 31 | merge(mol, marked, bset) 32 | aset.update(bset) 33 | 34 | 35 | # atoms connected by non-aromatic double or triple bond to any heteroatom 36 | # c=O should not match (see fig1, box 15). I think using A instead of * should sort that out? 37 | PATT_DOUBLE_TRIPLE = Chem.MolFromSmarts("A=,#[!#6]") 38 | # atoms in non aromatic carbon-carbon double or triple bonds 39 | PATT_CC_DOUBLE_TRIPLE = Chem.MolFromSmarts("C=,#C") 40 | # acetal carbons, i.e. sp3 carbons connected to tow or more oxygens, nitrogens or sulfurs; these O, N or S atoms must have only single bonds 41 | PATT_ACETAL = Chem.MolFromSmarts("[CX4](-[O,N,S])-[O,N,S]") 42 | # all atoms in oxirane, aziridine and thiirane rings 43 | PATT_OXIRANE_ETC = Chem.MolFromSmarts("[O,N,S]1CC1") 44 | 45 | PATT_TUPLE = (PATT_DOUBLE_TRIPLE, PATT_CC_DOUBLE_TRIPLE, PATT_ACETAL, PATT_OXIRANE_ETC) 46 | 47 | 48 | def identify_functional_groups(mol): 49 | marked = set() 50 | # mark all heteroatoms in a molecule, including halogens 51 | for atom in mol.GetAtoms(): 52 | if atom.GetAtomicNum() not in (6, 1): # would we ever have hydrogen? 53 | marked.add(atom.GetIdx()) 54 | 55 | # mark the four specific types of carbon atom 56 | for patt in PATT_TUPLE: 57 | for path in mol.GetSubstructMatches(patt): 58 | for atomindex in path: 59 | marked.add(atomindex) 60 | 61 | # merge all connected marked atoms to a single FG 62 | groups = [] 63 | while marked: 64 | grp = set([marked.pop()]) 65 | merge(mol, marked, grp) 66 | groups.append(grp) 67 | 68 | # extract also connected unmarked carbon atoms 69 | ifg = namedtuple("IFG", ["atomIds", "atoms", "type"]) 70 | ifgs = [] 71 | for g in groups: 72 | uca = set() 73 | for atomidx in g: 74 | for n in mol.GetAtomWithIdx(atomidx).GetNeighbors(): 75 | if n.GetAtomicNum() == 6: 76 | uca.add(n.GetIdx()) 77 | ifgs.append( 78 | ifg( 79 | atomIds=tuple(list(g)), 80 | atoms=Chem.MolFragmentToSmiles(mol, g, canonical=True), 81 | type=Chem.MolFragmentToSmiles(mol, g.union(uca), canonical=True), 82 | ) 83 | ) 84 | return ifgs 85 | -------------------------------------------------------------------------------- /terminator/nlp.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy as np 4 | from transformers import XLNetTokenizer 5 | 6 | 7 | def parse_humicroedit( 8 | dataset, expression_separator: str = "{", expression_end: str = "}" 9 | ) -> List[str]: 10 | """ 11 | Parse the humicrocredit dataset in an appropriate format. 
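# Hedged illustrative aside (usage sketch only): `identify_functional_groups`
# from terminator/functional_groups.py marks heteroatoms and special carbons,
# merges connected marked atoms and returns IFG namedtuples (Ertl-style
# functional groups). Acetic acid is an arbitrary example input.
from rdkit import Chem
from terminator.functional_groups import identify_functional_groups

mol = Chem.MolFromSmiles("CC(=O)O")  # acetic acid
for group in identify_functional_groups(mol):
    # each entry exposes .atomIds, .atoms (bare FG) and .type (FG with carbon context)
    print(group.atomIds, group.atoms, group.type)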
12 | - token separating numbers from text: { 13 | - oken separating text items: } 14 | 15 | Args: 16 | dataset: The respective chunkk of the humicroedit dataset loaded via Huggingface. 17 | 18 | Raises: 19 | ValueError: If the joke cant be extracted uniquely 20 | 21 | Returns: 22 | _description_ 23 | """ 24 | 25 | lines = [] 26 | for sample in dataset: 27 | prop = "[funny]" + str(round(float(sample["meanGrade"]), 1)) 28 | text = sample["original"] 29 | if text.count("<") > 1 or text.count("/>") > 1: 30 | raise ValueError(text) 31 | if "{" in text or "}" in text: 32 | print(text) 33 | text = text.replace("<", "START ").replace("/>", " END") 34 | 35 | line = prop + expression_separator + sample["edit"] + expression_end + text 36 | lines.append(line) 37 | return lines 38 | 39 | 40 | def compute_topk(predictions: np.array) -> List[float]: 41 | """ 42 | Computes the topk accuracy of a boolean np array 43 | 44 | Args: 45 | predictions: boolean np.array of shape batch_size x k with correctness of each 46 | prediction 47 | 48 | Returns: 49 | List of floats denoting the top-k accuracies 50 | """ 51 | 52 | topk = [np.mean(predictions[:, 0])] 53 | for k in range(1, predictions.shape[1]): 54 | topk.append(topk[-1] + np.mean(predictions[:, k])) 55 | return topk 56 | -------------------------------------------------------------------------------- /terminator/numerical_encodings.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | import warnings 3 | from math import cos, inf, sin 4 | from typing import Dict, Optional 5 | 6 | import torch 7 | import torch.nn as nn 8 | import transformers 9 | from torch import Tensor 10 | 11 | from .utils import get_device 12 | 13 | 14 | def get_float_encoding( 15 | token: str, embedding_size: int, vmax: float = 1.0 16 | ) -> torch.Tensor: 17 | """Convert a token representing a float into a _fixed_ embedding vector. 18 | NOTE: This can be used for *any* range of numbers > 0. 19 | 20 | Args: 21 | token (str): A token representing a float. NOTE: Needs to follow notation 22 | _8_-1_ to represent 0.8 or _5_-2_ to represent 0.05. 23 | embedding_size (int): Size of the embedding. 24 | vmax (int, optional): Maximal value of float, defaults to 1. Normalizes 25 | values to be in the range ~ [-10, 10]. 26 | NOTE: If remaining nn.embeddings in model use `max_norm`, this might result 27 | in large range discrepancies. 28 | 29 | Returns: 30 | torch.Tensor: Tensor of length embedding_size containing the embedding. 31 | """ 32 | if embedding_size % 2 != 0: 33 | raise ValueError("Embedding size cant be odd.") 34 | 35 | vals = torch.zeros((embedding_size,)) 36 | if len(token) == 1 or not ( 37 | token.startswith("_") and token.endswith("_") and token.count("_") == 3 38 | ): 39 | return vals 40 | else: 41 | digit = int(token[1]) 42 | order = int(token.split("_")[-2]) 43 | val = digit * 10**order 44 | 45 | for i in range(0, embedding_size, 2): 46 | vals[i] = val / (i + 1) 47 | vals[i + 1] = -val / (i + 1) 48 | 49 | return vals / (vmax / 10) 50 | 51 | 52 | def get_full_float_encoding( 53 | value: float, embedding_size: int, vmax: float = 1.0 54 | ) -> Tensor: 55 | """ 56 | Convert a float value into a _fixed_ embedding vector. 57 | 58 | Args: 59 | value: The float value to be encoded. 60 | embedding_size: The size of the embedding. 61 | vmax: Maximal value the `value` variable can take. This normalizes values 62 | to be in the range ~ [-10, 10]. 
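# Hedged illustrative aside: the float-token convention used by
# `get_float_encoding` above encodes the digit 8 at decimal position -1 as
# `_8_-1_`, i.e. the value 0.8. With embedding_size=4 and vmax=1.0 the function
# fills alternating +/- entries val/(i+1) and rescales by vmax/10, giving
# roughly [8.0, -8.0, 2.67, -2.67].
from terminator.numerical_encodings import get_float_encoding

embedding = get_float_encoding("_8_-1_", embedding_size=4, vmax=1.0)
print(embedding)  # tensor([ 8.0000, -8.0000,  2.6667, -2.6667])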
NOTE: If remaining nn.embeddings in 63 | model use `max_norm`, this might result in large range discrepancies. 64 | 65 | Returns: 66 | torch.Tensor of shape (embedding_size, ) containing the embedding. 67 | """ 68 | if embedding_size % 2 != 0: 69 | raise ValueError(f"Embedding size {embedding_size} cant be odd.") 70 | integer = int(value) 71 | decimal = value - integer 72 | scalar = integer * 10**decimal 73 | embedding = torch.zeros((embedding_size,)) 74 | for i in range(0, embedding_size, 2): 75 | embedding[i] = scalar / (i + 1) 76 | embedding[i + 1] = -scalar / (i + 1) 77 | return embedding 78 | 79 | 80 | def get_int_encoding(token: str, embedding_size: int) -> torch.Tensor: 81 | """Convert a token representing an integer into a _fixed_ embedding vector. 82 | NOTE: This can be used only for positive integers - the generation of the 83 | encodings is *identical* to positional encodings. 84 | 85 | Args: 86 | token (str): A token representing an integer. NOTE: Needs to follow notation 87 | _8_2_ to represent 80 or _5_1_ to represent 5. 88 | embedding_size (int): Size of the embedding. 89 | 90 | Returns: 91 | torch.Tensor: Tensor of length embedding_size containing the embedding. 92 | """ 93 | ed = embedding_size 94 | vals = torch.zeros((ed,)) 95 | 96 | if len(token) == 1 or not ( 97 | token.startswith("_") and token.endswith("_") and token.count("_") == 3 98 | ): 99 | return vals 100 | else: 101 | digit = int(token[1]) 102 | order = int(token.split("_")[-2]) 103 | val = digit * 10**order 104 | 105 | if order < 0: 106 | raise ValueError( 107 | f"Found float encoding in {token}. Pass positive ints only." 108 | ) 109 | 110 | sine = lambda p, i: sin(p / (10000.0 ** (2 * i / ed))) 111 | cose = lambda p, i: cos(p / (10000.0 ** (2 * i / ed))) 112 | for i in range(0, ed, 2): 113 | vals[i] = sine(val, i) 114 | vals[i + 1] = cose(val, i) 115 | return vals 116 | 117 | 118 | class FloatEncoding(nn.Embedding): 119 | """ 120 | A nn.Embedding inspired class to generate fixed embedding vectors that represent 121 | numbers passed as tokens. 122 | NOTE: Tokens representing numbers need to follow notation _8_-1_ to represent 0.8. 123 | """ 124 | 125 | def __init__( 126 | self, 127 | num_embeddings: int, 128 | embedding_dim: int, 129 | vocab: Dict, 130 | vmax: Optional[float] = None, 131 | *args, 132 | **kwargs, 133 | ) -> None: 134 | """ 135 | Constructor for FloatEmbedding; sets up the fixed embedding matrix. 136 | 137 | Args: 138 | num_embeddings (int): size of the dictionary of embeddings. 139 | embedding_dim (int): the size of each embedding vector 140 | vocab (Dict): the language dictionary with tokens as keys and indexes as 141 | values. Length needs to match num_embeddings 142 | vmax (Optional[float]): Maximal value of float, defaults to None. 143 | 144 | Raises: 145 | ValueError: if num_embeddings does not match len(vocab). 146 | TypeError: if neither None nor a number is passed as vmax 147 | ValueError: if vmax is negative. 
148 | """ 149 | 150 | super(FloatEncoding, self).__init__( 151 | num_embeddings, embedding_dim, *args, **kwargs 152 | ) 153 | 154 | if not len(vocab) == num_embeddings: 155 | raise ValueError( 156 | f"num_embeddings needs to match size of vocabulary ({num_embeddings}!={len(vocab)})" 157 | ) 158 | if not (vmax is None or isinstance(3, numbers.Number)): 159 | raise TypeError(f"vmax needs to be a number or None, not {vmax}.") 160 | 161 | if vmax is None: 162 | # Infer the highest number in the dictionary (for normalization) 163 | test = lambda t: len(t) == 1 or not ( 164 | t.startswith("_") and t.endswith("_") and t.count("_") == 3 165 | ) 166 | vmax = max( 167 | [ 168 | -inf 169 | if test(token) 170 | else int(token[1]) * 10 ** int(token.split("_")[-2]) 171 | for token in vocab.keys() 172 | ] 173 | ) 174 | warnings.warn( 175 | f"The inferred maximum float ({vmax}) is used for normalizing all float embeddings" 176 | " which might result in diminishing embeddings." 177 | ) 178 | 179 | if vmax < 0: 180 | raise ValueError(f"Can not work only with negative numbers (vmax = {vmax})") 181 | 182 | weights = torch.zeros(num_embeddings, embedding_dim) 183 | for idx, (token, index) in enumerate(vocab.items()): 184 | assert ( 185 | idx == index 186 | ), "Please sort vocab indexes in ascending order starting from 0" 187 | weights[idx, :] = get_float_encoding(token, embedding_dim, vmax) 188 | weights = weights.to(device=get_device()) 189 | self.embedding = nn.Embedding.from_pretrained(weights, freeze=True) 190 | self.vocab = vocab 191 | 192 | def forward(self, x: Tensor) -> Tensor: 193 | return self.embedding(x) 194 | 195 | 196 | class IntEncoding(nn.Embedding): 197 | """ 198 | A nn.Embedding inspired class to generate fixed embedding vectors that represent 199 | positive integers passed as tokens. 200 | NOTE: Tokens representing numbers need to follow notation _8_2_ to represent 80. 201 | """ 202 | 203 | def __init__( 204 | self, num_embeddings: int, embedding_dim: int, vocab: Dict, *args, **kwargs 205 | ) -> None: 206 | """ 207 | Constructor for FloatEmbedding; sets up the fixed embedding matrix. 208 | 209 | Args: 210 | num_embeddings (int): size of the dictionary of embeddings. 211 | embedding_dim (int): the size of each embedding vector 212 | vocab (Dict): the language dictionary with tokens as keys and indexes as 213 | values. Length needs to match num_embeddings 214 | 215 | Raises: 216 | ValueError: if num_embeddings does not match len(vocab). 217 | TypeError: if neither None nor a number is passed as vmax 218 | ValueError: if vmax is negative. 
219 | """ 220 | 221 | if "vmax" in kwargs.keys(): 222 | kwargs.pop("vmax") 223 | 224 | super(IntEncoding, self).__init__( 225 | num_embeddings, embedding_dim, *args, **kwargs 226 | ) 227 | 228 | if not len(vocab) == num_embeddings: 229 | raise ValueError( 230 | f"num_embeddings needs to match size of vocabulary ({num_embeddings}!={len(vocab)})" 231 | ) 232 | 233 | weights = torch.zeros(num_embeddings, embedding_dim) 234 | for idx, (token, index) in enumerate(vocab.items()): 235 | assert ( 236 | idx == index 237 | ), "Please sort vocab indexes in ascending order starting from 0" 238 | weights[idx, :] = get_int_encoding(token, embedding_dim) 239 | 240 | weights = weights.to(device=get_device()) 241 | self.embedding = nn.Embedding.from_pretrained(weights, freeze=True) 242 | self.vocab = vocab 243 | 244 | def forward(self, x: Tensor) -> Tensor: 245 | return self.embedding(x) 246 | -------------------------------------------------------------------------------- /terminator/property_predictors.py: -------------------------------------------------------------------------------- 1 | """Factory of property predictors based on strings""" 2 | from rdkit import Chem 3 | from modlamp.descriptors import GlobalDescriptor 4 | from rdkit.Chem.QED import qed 5 | 6 | 7 | def predict_qed(smiles: str) -> float: 8 | try: 9 | q = qed(Chem.MolFromSmiles(smiles, sanitize=False)) 10 | return q, {"qed": q} 11 | except Exception: 12 | return -1, {"qed": -1} 13 | 14 | 15 | def boman_index(sequence: str) -> float: 16 | """Calculate the Boman index of a protein. 17 | The Boman index is a measure of protein interactions (potential to bind to 18 | membranes or others proteins). It's the average solubility for all residues 19 | in the sequence. Above 2.48 is considered high binding potential. 20 | 21 | For details see: 22 | Boman, H. G. "Antibacterial peptides: basic facts and emerging concepts." 23 | Journal of internal medicine 254.3 (2003): 197-215. 24 | 25 | Args: 26 | sequence (str): An AA sequence 27 | 28 | Returns: 29 | float: The boman index. 30 | """ 31 | try: 32 | sequence = sequence.strip().upper() 33 | desc = GlobalDescriptor(sequence) 34 | desc.boman_index() 35 | b = float(desc.descriptor) 36 | return b, {"boman": b} 37 | except Exception: 38 | return -100, {"boman": -100} 39 | 40 | 41 | PREDICT_FACTORY = {"qed": predict_qed, "boman": boman_index} 42 | -------------------------------------------------------------------------------- /terminator/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/regression-transformer/6820b45e4548ee2648557fcf120b3361414a8cd9/terminator/py.typed -------------------------------------------------------------------------------- /terminator/search.py: -------------------------------------------------------------------------------- 1 | """Decoding utilities.""" 2 | from math import log 3 | from sys import float_info 4 | 5 | import numpy as np 6 | import torch 7 | import transformers 8 | from torch import nn 9 | 10 | from .utils import get_device 11 | 12 | 13 | class Search(nn.Module): 14 | """Base search class.""" 15 | 16 | def __init__(self, *args, **kwargs): 17 | super().__init__() 18 | self.device = get_device() 19 | 20 | def forward(self, logits: torch.Tensor) -> object: 21 | """ 22 | Error handling. 23 | 24 | Args: 25 | logits: torch.Tensor (Tensor): the model's 26 | logits. (batch_size, length, vocabulary_size) 27 | Returns: 28 | object: the search output. 
29 | """ 30 | if not len(logits.shape) == 3: 31 | raise ValueError(f"Logits need to be 3D Tensor, was: {logits.shape}") 32 | if not type(logits) == torch.Tensor: 33 | raise TypeError(f"Logits need to be torch.Tensor, was: {type(logits)}") 34 | 35 | def step(self, logits: torch.Tensor) -> object: 36 | """ 37 | Error handling. 38 | 39 | Args: 40 | logits: torch.Tensor (Tensor): the model's 41 | logits. (batch_size, vocabulary_size) 42 | Returns: 43 | object: the search output. 44 | """ 45 | if len(logits.shape) > 3: 46 | raise ValueError(f"Logits need to be 2D or 3D Tensor, was: {logits.shape}") 47 | if not type(logits) == torch.Tensor: 48 | raise TypeError(f"Logits need to be torch.Tensor, was: {type(logits)}") 49 | 50 | 51 | class GreedySearch(Search): 52 | """ "Greedy search.""" 53 | 54 | def __init__(self, *args, **kwargs): 55 | super().__init__(*args, **kwargs) 56 | 57 | def forward(self, logits: torch.Tensor) -> torch.Tensor: 58 | """ 59 | Perform the greedy search. 60 | 61 | Args: 62 | logits: torch.Tensor (Tensor): the model's 63 | logits. (batch_size, length, vocabulary_size) 64 | Returns: 65 | torch.Tensor: the token indexes selected. (batch_size, length) 66 | """ 67 | super().forward(logits) 68 | 69 | return torch.argmax(logits, 2) 70 | 71 | def step(self, logits: torch.Tensor) -> torch.Tensor: 72 | """ 73 | Perform a greedy search step. 74 | 75 | Args: 76 | logits (torch.Tensor): the model's 77 | logits. (batch_size, vocabulary_size) 78 | Returns: 79 | torch.Tensor: the token indexes for all the batch. (batch_size, 1). 80 | """ 81 | super().step(logits) 82 | return torch.argmax(logits, 1, keepdim=True) 83 | 84 | 85 | class SamplingSearch(Search): 86 | """ "Sampling search.""" 87 | 88 | def __init__(self, temperature: float = 1.0, *args, **kwargs): 89 | """ 90 | Initialize the sampling search. 91 | 92 | Args: 93 | temperature (float, optional): temperature parameter. Defaults to 94 | 1.0, a.k.a., no temperature. Temperature < 1 results in a more 95 | descriminative softmax, > 1 in a flatter distribution. 96 | """ 97 | super().__init__(*args, **kwargs) 98 | self.temperature = temperature 99 | 100 | def forward(self, logits: torch.Tensor) -> torch.Tensor: 101 | """ 102 | Perform the sampling search. 103 | 104 | Args: 105 | logits: torch.Tensor (Tensor): the model's 106 | logits. (batch_size, length, vocabulary_size) 107 | Returns: 108 | torch.Tensor: the token indexes selected. (batch_size, length) 109 | """ 110 | super().forward(logits) 111 | probabilities = torch.softmax(logits.div(self.temperature), 2) 112 | return torch.stack( 113 | [torch.multinomial(probability, 1) for probability in probabilities] 114 | ).squeeze(dim=-1) 115 | 116 | def step(self, logits: torch.Tensor) -> torch.Tensor: 117 | """ 118 | Perform a sampling search step. 119 | 120 | Args: 121 | logits (torch.Tensor): the model's 122 | logits. (batch_size, vocabulary_size) 123 | Returns: 124 | torch.Tensor: the token indexes for all the batch. (batch_size, 1). 125 | """ 126 | super().step(logits) 127 | probabilities = torch.softmax(logits.div(self.temperature), 1) 128 | return torch.stack( 129 | [torch.multinomial(probability, 1) for probability in probabilities] 130 | ) 131 | 132 | 133 | class BeamSearch(Search): 134 | """Beam search.""" 135 | 136 | def __init__( 137 | self, beam_width: int = 3, temperature: float = 1.0, top_tokens: int = 5 138 | ): 139 | """ 140 | Initialize the beam search. 141 | Args: 142 | beam_width (int, optional): top sequences returned. Defaults to 3. 
143 | temperature (float, optional): temperature parameter. Defaults to 144 | 1.0, a.k.a., no temperature. Temperature < 1 results in a more 145 | descriminative softmax, > 1 in a flatter distribution. 146 | top_tokens (int, optional): number of top dictionary tokens kept 147 | for the search, defaults to 5. 148 | """ 149 | super().__init__() 150 | self.beam_width = beam_width 151 | self.temperature = temperature 152 | self.top_tokens = top_tokens 153 | 154 | def _beam_step_per_sequence(self, probabilities: torch.Tensor, beams: list) -> list: 155 | """ 156 | Perform a beam search step. 157 | Args: 158 | probabilities (torch.Tensor): probabilities for the current step. 159 | (beam_width, vocabulary_size). 160 | beams (list): beams containg sequence and score. Length is equal 161 | to beam_width. 162 | Returns: 163 | list: updated beams. 164 | """ 165 | all_candidates = list() 166 | # expand each current candidate 167 | for probability, beam in zip(probabilities, beams): 168 | a_sequence, score = beam 169 | # Sort the probabilities over dict and select indices of top n 170 | top_token_indexes = np.argsort(-probability)[: self.top_tokens] 171 | for top_token in top_token_indexes: 172 | candidate = [ 173 | a_sequence + [top_token], 174 | score + log(probability[top_token] + float_info.epsilon), 175 | ] 176 | all_candidates.append(candidate) 177 | # order all candidates by score 178 | ordered = sorted(all_candidates, key=lambda pair: pair[1], reverse=True) 179 | # select best 180 | return ordered[: self.beam_width] 181 | 182 | def _beam_per_sequence(self, logits: torch.Tensor) -> tuple: 183 | """ 184 | Beam per sequence in the batch. 185 | Args: 186 | logits (torch.Tensor): logits. 187 | (length, vocabulary_size) 188 | Returns: 189 | tuple: a tuple containing: 190 | - a tensor with tokens. (length, beam_width) 191 | - score. (beam_width) 192 | """ 193 | beams = [[list(), 0.0]] 194 | probabilities = torch.softmax(logits.div(self.temperature), 1) 195 | # walk over each step in sequence 196 | for probability in probabilities: 197 | probability_beams = torch.stack( 198 | [probability] + [probability.clone() for _ in range(self.beam_width)] 199 | ) 200 | beams = self._beam_step_per_sequence(probability_beams, beams) 201 | sequences, scores = zip(*beams) 202 | return (torch.tensor(list(sequences)).T, torch.tensor(list(scores))) 203 | 204 | def forward(self, logits: torch.Tensor) -> tuple: 205 | """ 206 | Perform the beam search for a non-autoregressive generator. 207 | Args: 208 | logits (torch.Tensor): the model's 209 | logits. (batch_size, length, vocabulary_size) 210 | Returns: 211 | tuple: a tuple containing: 212 | - the token indexes for each top sequence. 213 | (batch_size, length, beam_width) 214 | - scores. (batch_size, beam_width) 215 | """ 216 | super().forward(logits) 217 | tokens, scores = zip( 218 | *[self._beam_per_sequence(sequence) for sequence in logits] 219 | ) 220 | return (torch.stack(tokens), torch.stack(scores)) 221 | 222 | def step(self, logits: torch.Tensor, beams: list) -> tuple: 223 | """ 224 | Perform a single beam search step for an autoregressive model. 225 | Args: 226 | logits (torch.Tensor): the model's 227 | logits. (beam_width, batch_size, vocabulary_size) 228 | beams (list): beams for all the batch. 229 | Returns: 230 | tuple: a tuple containing: 231 | - the token indexes for all the batch. 232 | (beam_width, batch_size) 233 | - updated beams for all the batch. 
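# Hedged illustrative aside (usage sketch on random logits): the shapes follow
# the docstrings above, (batch_size, length, vocabulary_size) in and token
# indices out; BeamSearch.forward additionally returns per-beam scores.
import torch
from terminator.search import BeamSearch, GreedySearch, SamplingSearch

logits = torch.randn(2, 5, 12)                       # toy batch: 2 x 5 x vocab of 12
greedy_tokens = GreedySearch()(logits)               # (2, 5)
sampled_tokens = SamplingSearch(temperature=0.7)(logits)     # (2, 5)
beam_tokens, beam_scores = BeamSearch(beam_width=3)(logits)  # (2, 5, 3), (2, 3)
print(greedy_tokens.shape, sampled_tokens.shape, beam_tokens.shape, beam_scores.shape)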
234 | """ 235 | super().step(logits) 236 | probabilities = torch.softmax(logits.div(self.temperature), 2) 237 | updated_beams = [ 238 | self._beam_step_per_sequence(sample_probability, sample_beams) 239 | for sample_probability, sample_beams in zip( 240 | probabilities.permute(1, 0, 2), beams 241 | ) 242 | ] 243 | token_beams = ( 244 | torch.stack( 245 | [ 246 | # get last token for each beam 247 | torch.tensor([beam[0][-1] for beam in sample_beams]) 248 | for sample_beams in updated_beams 249 | ] 250 | ) 251 | .permute(1, 0) 252 | .to(self.device) 253 | ) 254 | return (token_beams, updated_beams) 255 | 256 | 257 | SEARCH_FACTORY = {"greedy": GreedySearch, "beam": BeamSearch, "sample": SamplingSearch} 258 | -------------------------------------------------------------------------------- /terminator/tokenization.py: -------------------------------------------------------------------------------- 1 | """Tokenization utilties for exrepssions.""" 2 | import re 3 | from typing import Dict, List, Tuple 4 | 5 | import torch 6 | from selfies import decoder, split_selfies 7 | from transformers import BertTokenizer, XLNetTokenizer 8 | 9 | SMILES_TOKENIZER_PATTERN = r"(\%\([0-9]{3}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\||\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])" 10 | 11 | 12 | class RegexTokenizer: 13 | """Run regex tokenization""" 14 | 15 | def __init__(self, regex_pattern: str) -> None: 16 | """Constructs a RegexTokenizer. 17 | 18 | Args: 19 | regex_pattern: regex pattern used for tokenization 20 | """ 21 | self.regex_pattern = regex_pattern 22 | self.regex = re.compile(self.regex_pattern) 23 | 24 | def tokenize(self, text: str) -> List[str]: 25 | """Regex tokenization. 26 | 27 | Args: 28 | text: text to tokenize. 29 | 30 | Returns: 31 | extracted tokens. 32 | """ 33 | tokens = [token for token in self.regex.findall(text)] 34 | return tokens 35 | 36 | 37 | class PropertyTokenizer: 38 | """Run a property tokenization.""" 39 | 40 | def __init__(self) -> None: 41 | """Constructs a PropertyTokenizer.""" 42 | self.regex = re.compile(r"\s*(<\w+>)\s*?(\+|-)?(\d+)(\.)?(\d+)?\s*") 43 | 44 | def tokenize(self, text: str) -> List[str]: 45 | """Tokenization of a property. 46 | 47 | Args: 48 | text: text to tokenize. 49 | 50 | Returns: 51 | extracted tokens. 52 | """ 53 | tokens = [] 54 | matched = self.regex.match(text) 55 | if matched: 56 | property_name, sign, units, dot, decimals = matched.groups() 57 | tokens = [property_name] 58 | if sign: 59 | tokens += [f"_{sign}_"] 60 | tokens += [ 61 | f"_{number}_{position}_" for position, number in enumerate(units[::-1]) 62 | ][::-1] 63 | if dot: 64 | tokens += [f"_{dot}_"] 65 | if decimals: 66 | tokens += [ 67 | f"_{number}_-{position}_" 68 | for position, number in enumerate(decimals, 1) 69 | ] 70 | return tokens 71 | 72 | 73 | class PropertyTokenizerSquare(PropertyTokenizer): 74 | """Run a property tokenization.""" 75 | 76 | def __init__(self) -> None: 77 | """Constructs a PropertyTokenizer.""" 78 | self.regex = re.compile(r"\s*(\[\w+\])\s*?(\+|-)?(\d+)(\.)?(\d+)?\s*") 79 | 80 | 81 | class CharacterTokenizer: 82 | def __init__(self) -> None: 83 | """Constructs a tokenizer that simply splits each character""" 84 | self.tokenizer = lambda x: list(x) 85 | 86 | def tokenize(self, text: str) -> List[str]: 87 | """Tokenize an expression. 88 | 89 | Args: 90 | text: text to tokenize. 91 | 92 | Returns: 93 | extracted tokens. 
94 | """ 95 | return self.tokenizer(text) 96 | 97 | 98 | class SelfiesTokenizer(CharacterTokenizer): 99 | def __init__(self) -> None: 100 | """Constructs an expression tokenizer for SELFIES 101 | 102 | Args: 103 | expression_tokenizer: Separator token for properties and molecule. 104 | Defaults to '|'. 105 | """ 106 | self.tokenizer = lambda x: list(split_selfies(x)) 107 | 108 | 109 | class ExpressionTokenizer: 110 | def __init__( 111 | self, expression_tokenizer: str = "|", language: str = "SMILES" 112 | ) -> None: 113 | """Constructs an expression tokenizer. 114 | 115 | Args: 116 | expression_tokenizer (str): Token separating the property. Defaults to '|'. 117 | Must not occur in the language itself. 118 | language (str): Identifier for the (chemical) language. Should be either 119 | 'SMILES', 'SELFIES' or 'AAS'. 120 | """ 121 | self.language = language 122 | if language == "SMILES": 123 | self.text_tokenizer = RegexTokenizer(regex_pattern=SMILES_TOKENIZER_PATTERN) 124 | elif language == "SELFIES": 125 | self.text_tokenizer = SelfiesTokenizer() 126 | elif language == "AAS": 127 | self.text_tokenizer = CharacterTokenizer() 128 | else: 129 | raise ValueError( 130 | f"Unsupported language {language}, choose 'SMILES', 'SELFIES' or 'AAS'." 131 | ) 132 | self.property_tokenizer = PropertyTokenizer() 133 | self.expression_separator = expression_tokenizer 134 | 135 | def tokenize(self, text: str) -> List[str]: 136 | """Tokenize an expression. 137 | 138 | Args: 139 | text: text to tokenize. 140 | 141 | Returns: 142 | extracted tokens. 143 | """ 144 | splitted_expression = text.split(self.expression_separator) 145 | tokens = [] 146 | for property_expression in splitted_expression[:-1]: 147 | tokens.extend(self.property_tokenizer.tokenize(property_expression)) 148 | tokens.append(self.expression_separator) 149 | tokens.extend(self.text_tokenizer.tokenize(splitted_expression[-1])) 150 | return tokens 151 | 152 | 153 | class ExpressionBertTokenizer(BertTokenizer): 154 | """ 155 | Constructs a bert-based tokenizer used for the Regression Transformer. 156 | 157 | Args: 158 | vocab_file: path to a token per line vocabulary file. 159 | """ 160 | 161 | def __init__( 162 | self, 163 | vocab_file, 164 | unk_token="[UNK]", 165 | sep_token="[SEP]", 166 | pad_token="[PAD]", 167 | cls_token="[CLS]", 168 | mask_token="[MASK]", 169 | pad_even: bool = True, 170 | language: str = "SMILES", 171 | **kwargs, 172 | ) -> None: 173 | """Constructs an ExpressionTokenizer. 174 | 175 | Args: 176 | vocab_file: vocabulary file containing tokens. 177 | unk_token: unknown token. Defaults to "[UNK]". 178 | sep_token: separator token. Defaults to "[SEP]". 179 | pad_token: pad token. Defaults to "[PAD]". 180 | cls_token: cls token. Defaults to "[CLS]". 181 | mask_token: mask token. Defaults to "[MASK]". 182 | pad_even (bool): Boolean indicating whether sequences of odd length should 183 | be padded to have an even length. Neede for PLM in XLNet. Defaults to 184 | True. 185 | language (str): Identifier for the (chemical) language. Should be either 186 | 'SMILES', 'SELFIES' or 'AAS'. 
187 | """ 188 | super().__init__( 189 | vocab_file=vocab_file, 190 | do_lower_case=False, 191 | do_basic_tokenize=True, 192 | unk_token=unk_token, 193 | sep_token=sep_token, 194 | pad_token=pad_token, 195 | cls_token=cls_token, 196 | mask_token=mask_token, 197 | **kwargs, 198 | ) 199 | # define tokenization utilities 200 | self.language = language 201 | if language == "SMILES": 202 | self.text_tokenizer = RegexTokenizer(regex_pattern=SMILES_TOKENIZER_PATTERN) 203 | elif self.language == "SELFIES": 204 | self.text_tokenizer = SelfiesTokenizer() 205 | elif language == "AAS": 206 | self.text_tokenizer = CharacterTokenizer() 207 | else: 208 | raise ValueError( 209 | f"Unsupported language {language}, choose 'SMILES', 'SELFIES' or 'AAS'." 210 | ) 211 | 212 | self.property_tokenizer = PropertyTokenizer() 213 | self.expression_separator = "|" 214 | self.separator_idx = self.vocab[self.expression_separator] 215 | self.pad_even = pad_even 216 | 217 | # DEPRECATED 218 | if pad_even: 219 | self.pad_even_fn = lambda x: x if len(x) % 2 == 0 else x + [self.pad_token] 220 | else: 221 | self.pad_even_fn = lambda x: x 222 | 223 | @property 224 | def vocab_list(self) -> List[str]: 225 | """List vocabulary tokens. 226 | 227 | Returns: 228 | a list of vocabulary tokens. 229 | """ 230 | return list(self.vocab.keys()) 231 | 232 | def _tokenize(self, text: str) -> List[str]: 233 | """Tokenize a text representing an expression. 234 | 235 | Args: 236 | text: text to tokenize. 237 | 238 | Returns: 239 | extracted tokens. 240 | """ 241 | splitted_expression = text.split(self.expression_separator) 242 | tokens = [] 243 | for property_expression in splitted_expression[:-1]: 244 | tokens.extend(self.property_tokenizer.tokenize(property_expression)) 245 | tokens.append(self.expression_separator) 246 | tokens.extend(self.text_tokenizer.tokenize(splitted_expression[-1])) 247 | # TODO: remove this hack 248 | # This is a hack to get around DataCollatorForLanguageModeling requiring even 249 | # length sequences 250 | return self.pad_even_fn(tokens) 251 | 252 | def add_padding_tokens( 253 | self, token_ids: List[int], max_length: int, padding_right: bool = True 254 | ) -> List[int]: 255 | """Adds padding tokens to return a sequence of length max_length. 256 | 257 | By default padding tokens are added to the right of the sequence. 258 | 259 | Args: 260 | token_ids: token indexes. 261 | max_length: maximum length of the sequence. 262 | padding_right: whether the sequence is padded on the right. Defaults to True. 263 | 264 | Returns: 265 | padded sequence of token indexes. 266 | """ 267 | padding_ids = [self.pad_token_id] * (max_length - len(token_ids)) 268 | if padding_right: 269 | return token_ids + padding_ids 270 | else: 271 | return padding_ids + token_ids 272 | 273 | @staticmethod 274 | def get_sample_label(mlm_label: List[str], mlm_input: List[str]) -> List[str]: 275 | """MLM case: Retrieve true sample sequence from mlm label and mlm input. 276 | NOTE: Also works for PLM. 277 | 278 | Args: 279 | mlm_label (List[str]): Target sample used in MLM. 280 | mlm_input (List[str]): MLM input sample. 281 | 282 | Returns: 283 | List[str]: Sample sequence as part of the dataset 284 | """ 285 | 286 | return [i if el == "[UNK]" else el for el, i in zip(mlm_label, mlm_input)] 287 | 288 | @staticmethod 289 | def get_sample_prediction( 290 | mlm_prediction: List[str], mlm_input: List[str] 291 | ) -> List[str]: 292 | """MLM case: Retrieve predicted sequence from mlm prediction and mlm input 293 | NOTE: Also works for PLM. 
294 | 295 | Args: 296 | mlm_label (List[str]): Target sample used in MLM. 297 | mlm_input (List[str]): MLM input sample. 298 | 299 | Returns: 300 | List[str]: Sample sequence as part of the dataset 301 | """ 302 | return [ 303 | i if i not in ["[MASK]"] else o for o, i in zip(mlm_prediction, mlm_input) 304 | ] 305 | 306 | @staticmethod 307 | def floating_tokens_to_float(token_ids: List[str]) -> float: 308 | """Converts tokens representing a float value into a float. 309 | NOTE: Expects that non-floating tokens are strippped off 310 | 311 | Args: 312 | token_ids: List of tokens, each representing a float. 313 | E.g.: ['_0_0_', '_._', '_9_-1_', '_3_-2_', '_1_-3_'] 314 | 315 | Returns: 316 | float: Float representation for the list of tokens. 317 | """ 318 | try: 319 | float_string = "".join([token.split("_")[1] for token in token_ids]) 320 | float_value = float(float_string) 321 | except ValueError: 322 | float_value = -1 323 | return float_value 324 | 325 | def aggregate_tokens( 326 | self, token_ids: List[str], label_mode: bool, cls_first: bool = True 327 | ) -> Tuple[str, Dict]: 328 | """Receives tokens of one sample and returns sequence (e.g. SMILES) and 329 | a dict of properties. 330 | 331 | Args: 332 | token_ids (List[str]): List of tokens. 333 | label_mode (bool): Whether the token_ids are labels or predictions. 334 | cls_first (bool, optional): Whether CLS token occurres first, default: True 335 | 336 | Returns: 337 | Tuple[str, Dict]: 338 | str: SMILES/SELFIES sequence of sample. 339 | Dict: A dictionary with property names (e.g. 'qed') as key and 340 | properties as values. 341 | """ 342 | edx = min( 343 | token_ids.index("[SEP]") if "[SEP]" in token_ids else 1000, 344 | token_ids.index("[PAD]") if "[PAD]" in token_ids else 1000, 345 | ) 346 | 347 | edx = -1 if edx == 1000 else edx 348 | 349 | seq = ( 350 | "".join(token_ids[token_ids.index("|") + 1 : edx]) 351 | if "|" in token_ids 352 | else "".join(token_ids) 353 | ) 354 | property_dict = {} 355 | for idx, t in enumerate(token_ids): 356 | if t.startswith("<") and t.endswith(">"): 357 | key = t[1:-1] 358 | 359 | # Convert float 360 | end_floating_idx = idx + 1 361 | while token_ids[end_floating_idx].startswith("_"): 362 | end_floating_idx += 1 363 | 364 | prop = self.floating_tokens_to_float( 365 | token_ids[idx + 1 : end_floating_idx] 366 | ) 367 | 368 | property_dict[key] = prop 369 | 370 | return seq, property_dict 371 | 372 | def to_readable(self, sequence: str) -> str: 373 | """Safely returns a readable string irrespective of whether the language is 374 | SMILES, SELFIES or AAS. 375 | 376 | Args: 377 | sequence (str): A string representing a molecule (either SMILES or SELFIES) 378 | or amino acid sequence. 379 | 380 | Returns: 381 | str: A SMILES representing the same molecule. 382 | """ 383 | if self.language == "SMILES": 384 | return sequence 385 | elif self.language == "SELFIES": 386 | return decoder(sequence) 387 | elif self.language == "AAS": 388 | return sequence 389 | else: 390 | raise AttributeError(f"Unknown language {self.language}") 391 | 392 | 393 | class XLNetRTTokenizer(XLNetTokenizer): 394 | """ 395 | A XLNet-based tokenizer for the Regression Transformer, build for the 396 | humicroedit dataset 397 | """ 398 | 399 | def set_property_tokenizer( 400 | self, 401 | tokenizer: PropertyTokenizer, 402 | expression_separator: str = "{", 403 | expression_end: str = "}", 404 | property_token: str = "[funny]", 405 | ): 406 | """ 407 | Set the property tokenizer to be used by the main tokenizer. 
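# Hedged illustrative aside (usage sketch only): `floating_tokens_to_float` above
# inverts the digit-token notation back into a float. `aggregate_tokens` builds on
# it to split a decoded token list into the sequence after '|' and a
# {property_name: value} dict, but it needs a tokenizer instance (and hence a
# vocabulary file), so only the static helper is shown here.
from terminator.tokenization import ExpressionBertTokenizer

float_tokens = ["_0_0_", "_._", "_8_-1_", "_5_-2_"]
print(ExpressionBertTokenizer.floating_tokens_to_float(float_tokens))  # 0.85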
408 | 409 | Args: 410 | tokenizer: a property tokenizer. 411 | expression_separator: a token that separates the property from the rest. 412 | expression_end: a token that ends the joke-token sequence. 413 | property_token: the property token. 414 | """ 415 | self.property_tokenizer = tokenizer 416 | # The start token indicating the joke tokens 417 | self.expression_separator = expression_separator 418 | self.expressiond_end = expression_end 419 | self.property_token = property_token 420 | 421 | def set_vocab(self): 422 | self.vocab = self.get_vocab() 423 | self.idx_to_token = dict(zip(self.vocab.values(), self.vocab.keys())) 424 | 425 | def _tokenize(self, text: str) -> List[str]: 426 | """ 427 | Core tokenization function. 428 | 429 | Args: 430 | text: A string to be tokenized. 431 | 432 | Returns: 433 | A list of tokens. 434 | """ 435 | prop, rest = text.split(self.expression_separator) 436 | tokens = self.property_tokenizer.tokenize(prop) + [self.expression_separator] 437 | 438 | entities = rest.split(self.expressiond_end) 439 | for idx, entity in enumerate(entities): 440 | tokens.extend(super()._tokenize(entity)) 441 | if idx < len(entities) - 1: 442 | tokens.extend([self.expressiond_end]) 443 | 444 | if len(tokens) % 2 != 0: 445 | tokens + [self.pad_token] 446 | return tokens 447 | 448 | @property 449 | def vocab_list(self): 450 | return list(self.vocab.keys()) 451 | 452 | @staticmethod 453 | def floating_tokens_to_float(token_ids: List[str]) -> float: 454 | """Converts tokens representing a float value into a float. 455 | NOTE: Expects that non-floating tokens are strippped off 456 | 457 | Args: 458 | token_ids: List of tokens, each representing a float. 459 | E.g.: ['_0_0_', '_._', '_9_-1_', '_3_-2_', '_1_-3_'] 460 | 461 | Returns: 462 | float: Float representation for the list of tokens. 463 | """ 464 | try: 465 | float_string = "".join([token.split("_")[1] for token in token_ids]) 466 | float_value = float(float_string) 467 | except ValueError: 468 | float_value = -1 469 | return float_value 470 | 471 | def decode_internal(self, token_ids: torch.Tensor, *args, **kwargs) -> str: 472 | tokens = "" 473 | for _id in token_ids.tolist(): 474 | token = self.idx_to_token[_id] if _id != -100 else "[UNK]" 475 | tokens += token + " " 476 | return tokens 477 | 478 | @staticmethod 479 | def get_sample_label(mlm_label: List[str], mlm_input: List[str]) -> List[str]: 480 | """MLM case: Retrieve true sample sequence from mlm label and mlm input. 481 | NOTE: Also works for PLM. 482 | 483 | Args: 484 | mlm_label (List[str]): Target sample used in MLM. 485 | mlm_input (List[str]): MLM input sample. 486 | 487 | Returns: 488 | List[str]: Sample sequence as part of the dataset 489 | """ 490 | 491 | return [i if el == "[UNK]" else el for el, i in zip(mlm_label, mlm_input)] 492 | 493 | @staticmethod 494 | def get_sample_prediction( 495 | mlm_prediction: List[str], mlm_input: List[str] 496 | ) -> List[str]: 497 | """MLM case: Retrieve predicted sequence from mlm prediction and mlm input 498 | NOTE: Also works for PLM. 499 | 500 | Args: 501 | mlm_label (List[str]): Target sample used in MLM. 502 | mlm_input (List[str]): MLM input sample. 
503 | 504 | Returns: 505 | List[str]: Sample sequence as part of the dataset 506 | """ 507 | return [ 508 | i if i not in ["[MASK]", ""] else o 509 | for o, i in zip(mlm_prediction, mlm_input) 510 | ] 511 | 512 | def aggregate_tokens( 513 | self, token_ids: List[str], label_mode: bool, cls_first: bool = True 514 | ) -> Tuple[str, Dict]: 515 | """Receives tokens of one sample and returns sequence (e.g. SMILES) and 516 | a dict of properties. 517 | 518 | Args: 519 | token_ids (List[str]): List of tokens. 520 | label_mode (bool): Whether the token_ids are labels or predictions. 521 | cls_first (bool, optional): Whether CLS token occurres first, default: True 522 | 523 | Returns: 524 | Tuple[str, Dict]: 525 | str: SMILES/SELFIES sequence of sample. 526 | Dict: A dictionary with property names (e.g. 'qed') as key and 527 | properties as values. 528 | """ 529 | edx = min( 530 | token_ids.index("[SEP]") if "[SEP]" in token_ids else 1000, 531 | token_ids.index("[PAD]") if "[PAD]" in token_ids else 1000, 532 | ) 533 | 534 | edx = -1 if edx == 1000 else edx 535 | 536 | seq = ( 537 | "".join(token_ids[token_ids.index("|") + 1 : edx]) 538 | if "|" in token_ids 539 | else "".join(token_ids) 540 | ) 541 | property_dict = {} 542 | for idx, t in enumerate(token_ids): 543 | if t == self.property_token: 544 | key = t[1:-1] 545 | 546 | # Convert float 547 | end_floating_idx = idx + 1 548 | while token_ids[end_floating_idx].startswith("_"): 549 | end_floating_idx += 1 550 | 551 | prop = self.floating_tokens_to_float( 552 | token_ids[idx + 1 : end_floating_idx] 553 | ) 554 | 555 | property_dict[key] = prop 556 | 557 | return seq, property_dict 558 | -------------------------------------------------------------------------------- /terminator/trainer_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Any, Dict, Optional 3 | 4 | import numpy as np 5 | import torch 6 | import transformers 7 | from torch import Tensor 8 | from transformers.utils import logging 9 | 10 | logger = logging.get_logger(__name__) 11 | 12 | 13 | def get_trainer_dict(dictionary: Dict[str, Any]) -> Dict[str, Any]: 14 | """Helper function to take out a subset of a dictionary with keys that are 15 | important for `CustomTrainer` but cant be passed down to `Trainer`. 16 | 17 | Args: 18 | dictionary (dict): Dict with keyword arguments for `CustomTrainer` constructor. 19 | 20 | Returns: 21 | dict: Dict with keyword arguments for `CustomTrainer` that cant be passed to 22 | childclass constructor (`Trainer`). 23 | """ 24 | keys_to_keep = [ 25 | "verbose_evaluation", 26 | "numerical", 27 | "d_model", 28 | "vocab_size", 29 | "vmax", 30 | "model_type", 31 | "mem_len", 32 | "training_logs", 33 | "train_config", 34 | "alternating_collator", 35 | ] 36 | keep_dict = {} 37 | for keep_key in keys_to_keep: 38 | for key, val in dictionary.items(): 39 | if re.search(keep_key, key) is not None: 40 | keep_dict[key] = val 41 | return keep_dict 42 | 43 | 44 | """ 45 | All below code is taken from transformers==3.5.0 to remedy issues with tensor stacking. 46 | NOTE: 3.4.0 introduces accumulation steps in evaluation, but only 3.5.0 allows the 47 | Trainer to handle dynamic sequence lengths. 
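# Hedged illustrative aside (usage sketch, toy dict is made up): `get_trainer_dict`
# above keeps only the keyword arguments whose names match one of the
# CustomTrainer-specific keys, so the remainder can be forwarded to the parent
# `Trainer`.
from terminator.trainer_utils import get_trainer_dict

kwargs = {"model_type": "xlnet", "vmax": 1.0, "learning_rate": 5e-5}
print(get_trainer_dict(kwargs))
# {'vmax': 1.0, 'model_type': 'xlnet'}   (learning_rate is left for Trainer itself)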
48 | """ 49 | 50 | 51 | def nested_new_like(arrays, num_samples, padding_index=-100): 52 | """Create the same nested structure as `arrays` with a first dimension always at `num_samples`.""" 53 | if isinstance(arrays, (list, tuple)): 54 | return type(arrays)(nested_new_like(x, num_samples) for x in arrays) 55 | return np.full_like(arrays, padding_index, shape=(num_samples, *arrays.shape[1:])) 56 | 57 | 58 | def nested_truncate(tensors, limit): 59 | "Truncate `tensors` at `limit` (even if it's a nested list/tuple of tensors)." 60 | if isinstance(tensors, (list, tuple)): 61 | return type(tensors)(nested_truncate(t, limit) for t in tensors) 62 | return tensors[:limit] 63 | 64 | 65 | def nested_expand_like(arrays, new_seq_length, padding_index=-100): 66 | """Expand the `arrays` so that the second dimension grows to `new_seq_length`. 67 | Uses `padding_index` for padding.""" 68 | if isinstance(arrays, (list, tuple)): 69 | return type(arrays)( 70 | nested_expand_like(x, new_seq_length, padding_index=padding_index) 71 | for x in arrays 72 | ) 73 | 74 | result = np.full_like( 75 | arrays, 76 | padding_index, 77 | shape=(arrays.shape[0], new_seq_length) + arrays.shape[2:], 78 | ) 79 | result[:, : arrays.shape[1]] = arrays 80 | return result 81 | 82 | 83 | def _get_first_shape(arrays): 84 | """Return the shape of the first array found in the nested struct `arrays`.""" 85 | if isinstance(arrays, (list, tuple)): 86 | return _get_first_shape(arrays[0]) 87 | return arrays.shape 88 | 89 | 90 | class DistributedTensorGatherer: 91 | """ 92 | A class responsible for properly gathering tensors (or nested list/tuple of tensors) on the CPU 93 | by chunks. 94 | If our dataset has 16 samples with a batch size of 2 on 3 processes and we gather then transfer on 95 | CPU at every step, our sampler will generate the following indices: 96 | :obj:`[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1]` 97 | to get something of size a multiple of 3 (so that each process gets the same dataset length). Then 98 | process 0, 1 and 2 will be responsible of making predictions for the following samples: 99 | - P0: :obj:`[0, 1, 2, 3, 4, 5]` 100 | - P1: :obj:`[6, 7, 8, 9, 10, 11]` 101 | - P2: :obj:`[12, 13, 14, 15, 0, 1]` 102 | The first batch treated on each process will be 103 | - P0: :obj:`[0, 1]` 104 | - P1: :obj:`[6, 7]` 105 | - P2: :obj:`[12, 13]` 106 | So if we gather at the end of the first batch, we will get a tensor (nested list/tuple of tensor) 107 | corresponding to the following indices: 108 | :obj:`[0, 1, 6, 7, 12, 13]` 109 | If we directly concatenate our results without taking any precautions, the user will then get 110 | the predictions for the indices in this order at the end of the prediction loop: 111 | :obj:`[0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1]` 112 | For some reason, that's not going to roll their boat. This class is there to solve that problem. 113 | Args: 114 | world_size (:obj:`int`): 115 | The number of processes used in the distributed training. 116 | num_samples (:obj:`int`): 117 | The number of samples in our dataset. 118 | make_multiple_of (:obj:`int`, `optional`): 119 | If passed, the class assumes the datasets passed to each process are made to be a multiple of this argument 120 | (by adding samples). 
121 | """ 122 | 123 | def __init__( 124 | self, world_size, num_samples, make_multiple_of=None, padding_index=-100 125 | ): 126 | self.world_size = world_size 127 | self.num_samples = num_samples 128 | total_size = ( 129 | world_size if make_multiple_of is None else world_size * make_multiple_of 130 | ) 131 | self.total_samples = int(np.ceil(num_samples / total_size)) * total_size 132 | self.process_length = self.total_samples // world_size 133 | self._storage = None 134 | self._offsets = None 135 | self.padding_index = padding_index 136 | 137 | def add_arrays(self, arrays): 138 | """ 139 | Add :obj:`arrays` to the internal storage, Will initialize the storage to the full size at the first arrays 140 | passed so that if we're bound to get an OOM, it happens at the beginning. 141 | """ 142 | if arrays is None: 143 | return 144 | if self._storage is None: 145 | self._storage = nested_new_like( 146 | arrays, self.total_samples, padding_index=self.padding_index 147 | ) 148 | self._offsets = list(range(0, self.total_samples, self.process_length)) 149 | else: 150 | storage_shape = _get_first_shape(self._storage) 151 | arrays_shape = _get_first_shape(arrays) 152 | if len(storage_shape) > 1 and storage_shape[1] < arrays_shape[1]: 153 | # If we get new arrays that are too big too fit, we expand the shape fo the storage 154 | self._storage = nested_expand_like( 155 | self._storage, arrays_shape[1], padding_index=self.padding_index 156 | ) 157 | slice_len = self._nested_set_tensors(self._storage, arrays) 158 | for i in range(self.world_size): 159 | self._offsets[i] += slice_len 160 | 161 | def _nested_set_tensors(self, storage, arrays): 162 | if isinstance(arrays, (list, tuple)): 163 | for x, y in zip(storage, arrays): 164 | slice_len = self._nested_set_tensors(x, y) 165 | return slice_len 166 | assert ( 167 | arrays.shape[0] % self.world_size == 0 168 | ), f"Arrays passed should all have a first dimension multiple of {self.world_size}, found {arrays.shape[0]}." 169 | 170 | slice_len = arrays.shape[0] // self.world_size 171 | for i in range(self.world_size): 172 | if len(arrays.shape) == 1: 173 | storage[self._offsets[i] : self._offsets[i] + slice_len] = arrays[ 174 | i * slice_len : (i + 1) * slice_len 175 | ] 176 | else: 177 | storage[ 178 | self._offsets[i] : self._offsets[i] + slice_len, : arrays.shape[1] 179 | ] = arrays[i * slice_len : (i + 1) * slice_len] 180 | return slice_len 181 | 182 | def finalize(self): 183 | """ 184 | Return the properly gathered arrays and truncate to the number of samples (since the sampler added some extras 185 | to get each process a dataset of the same length). 186 | """ 187 | if self._storage is None: 188 | return 189 | if self._offsets[0] != self.process_length: 190 | logger.warn( 191 | "Not all data has been set. Are you sure you passed all values?" 
192 | ) 193 | return nested_truncate(self._storage, self.num_samples) 194 | 195 | 196 | def torch_pad_and_concatenate( 197 | tensor1: Tensor, tensor2: Tensor, padding_index: int = -100 198 | ) -> Tensor: 199 | """Concatenates `tensor1` and `tensor2` on first axis, applying padding on the second if necessary.""" 200 | if len(tensor1.shape) == 1 or tensor1.shape[1] == tensor2.shape[1]: 201 | return torch.cat((tensor1, tensor2), dim=0) 202 | 203 | # Let's figure out the new shape 204 | new_shape = ( 205 | tensor1.shape[0] + tensor2.shape[0], 206 | max(tensor1.shape[1], tensor2.shape[1]), 207 | ) + tensor1.shape[2:] 208 | 209 | # Now let's fill the result tensor 210 | result = tensor1.new_full(new_shape, padding_index) 211 | result[: tensor1.shape[0], : tensor1.shape[1]] = tensor1 212 | result[tensor1.shape[0] :, : tensor2.shape[1]] = tensor2 213 | return result.detach() 214 | 215 | 216 | def numpy_pad_and_concatenate( 217 | array1: np.ndarray, array2: np.ndarray, padding_index: int = -100 218 | ) -> np.ndarray: 219 | """Concatenates `array1` and `array2` on first axis, applying padding on the second if necessary.""" 220 | if len(array1.shape) == 1 or array1.shape[1] == array2.shape[1]: 221 | return np.concatenate((array1, array2), axis=0) 222 | 223 | # Let's figure out the new shape 224 | new_shape = ( 225 | array1.shape[0] + array2.shape[0], 226 | max(array1.shape[1], array2.shape[1]), 227 | ) + array1.shape[2:] 228 | 229 | # Now let's fill the result tensor 230 | result = np.full_like(array1, padding_index, shape=new_shape) 231 | result[: array1.shape[0], : array1.shape[1]] = array1 232 | result[array1.shape[0] :, : array2.shape[1]] = array2 233 | return result 234 | 235 | 236 | def nested_concat(tensors, new_tensors, padding_index=-100): 237 | """ 238 | Concat the `new_tensors` to `tensors` on the first dim and pad them on the second if needed. Works for tensors or 239 | nested list/tuples of tensors. 240 | """ 241 | assert type(tensors) == type( 242 | new_tensors 243 | ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}."
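As a small illustration of the padding behaviour of the two helpers defined above (a sketch for reference only, not part of the file):

import torch

a = torch.ones(2, 3, dtype=torch.long)
b = torch.zeros(1, 5, dtype=torch.long)
out = torch_pad_and_concatenate(a, b)
assert out.shape == (3, 5)            # ragged second dimensions are reconciled
assert (out[:2, 3:] == -100).all()    # rows coming from `a` are right-padded with -100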
244 | if isinstance(tensors, (list, tuple)): 245 | return type(tensors)( 246 | nested_concat(t, n, padding_index=padding_index) 247 | for t, n in zip(tensors, new_tensors) 248 | ) 249 | elif isinstance(tensors, torch.Tensor): 250 | return torch_pad_and_concatenate( 251 | tensors, new_tensors, padding_index=padding_index 252 | ) 253 | elif isinstance(tensors, np.ndarray): 254 | return numpy_pad_and_concatenate( 255 | tensors, new_tensors, padding_index=padding_index 256 | ) 257 | else: 258 | raise TypeError(f"Unsupported type for concatenation: got {type(tensors)}") 259 | 260 | 261 | def distributed_concat( 262 | tensor: "torch.Tensor", num_total_examples: Optional[int] = None 263 | ) -> torch.Tensor: 264 | try: 265 | if isinstance(tensor, (tuple, list)): 266 | return type(tensor)( 267 | distributed_concat(t, num_total_examples) for t in tensor 268 | ) 269 | output_tensors = [ 270 | tensor.clone() for _ in range(torch.distributed.get_world_size()) 271 | ] 272 | torch.distributed.all_gather(output_tensors, tensor) 273 | concat = torch.cat(output_tensors, dim=0) 274 | 275 | # truncate the dummy elements added by SequentialDistributedSampler 276 | if num_total_examples is not None: 277 | concat = concat[:num_total_examples] 278 | return concat 279 | except AssertionError: 280 | raise AssertionError("Not currently using distributed training") 281 | 282 | 283 | def nested_numpify(tensors): 284 | "Numpify `tensors` (even if it's a nested list/tuple of tensors)." 285 | if isinstance(tensors, (list, tuple)): 286 | return type(tensors)(nested_numpify(t) for t in tensors) 287 | return tensors.cpu().numpy() 288 | -------------------------------------------------------------------------------- /terminator/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import subprocess as sp 4 | import sys 5 | from typing import List 6 | 7 | import numpy as np 8 | import psutil 9 | import rdkit.rdBase as rkrb 10 | import rdkit.RDLogger as rkl 11 | import torch 12 | 13 | logger = logging.getLogger(__name__) 14 | logging.basicConfig(stream=sys.stdout, level=logging.INFO) 15 | 16 | 17 | def get_gpu_memory(): 18 | if not cuda(): 19 | return 0, 0, 0 20 | command = "nvidia-smi --query-gpu=memory.free --format=csv" 21 | memory_free_info = ( 22 | sp.check_output(command.split()).decode("ascii").split("\n")[:-1][1:] 23 | ) 24 | memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)] 25 | 26 | tot_m, used_m, free_m = map(int, os.popen("free -t -m").readlines()[-1].split()[1:]) 27 | return memory_free_values, used_m, tot_m 28 | 29 | 30 | def get_cpu_memory(): 31 | mem = psutil.virtual_memory() 32 | return mem.total / 1000**3, mem.percent, psutil.cpu_percent() 33 | 34 | 35 | def get_process_mmeory(): 36 | process = psutil.Process(os.getpid()) 37 | return process.memory_percent() 38 | 39 | 40 | def get_device(): 41 | return torch.device("cuda" if cuda() else "cpu") 42 | 43 | 44 | def cuda(): 45 | return torch.cuda.is_available() 46 | 47 | 48 | def get_latest_checkpoint(model_path: str, must_contain: str = "best") -> str: 49 | """ 50 | Given a path to the model folder it searches the latest saved checkpoint 51 | and returns the path to it. 52 | Args: 53 | model_path (str): Path to model folder. Has to contain folders called 54 | 'checkpoint-best-STEP' and 'checkpoint-latest-STEP' where STEP is 55 | a positive integer. 56 | must_contain (str, optional): Subselect checkpoints that contain a 57 | certain query. Defaults to 'best'. 
58 | Returns: 59 | str: Path to latest checkpoint 60 | """ 61 | 62 | # Finding checkpoints 63 | checkpoints = [f for f in os.listdir(model_path) if f.startswith("checkpoint")] 64 | if must_contain is not None: 65 | checkpoints = list(filter(lambda x: must_contain in x, checkpoints)) 66 | 67 | if len(checkpoints) == 0: 68 | logger.warning( 69 | f"No checkpoints found that contain {must_contain} in {model_path}." 70 | ) 71 | # Relax criteria and retry 72 | next_try = "checkpoint" if must_contain != "checkpoint" else "" 73 | return get_latest_checkpoint(model_path, must_contain=next_try) 74 | 75 | # Sorting 76 | try: 77 | idx = np.argsort([int(c.split("-")[-1]) for c in checkpoints])[-1] 78 | except ValueError: 79 | raise ValueError(f"Checkpoints dont seem to follow format: {checkpoints}.") 80 | 81 | return os.path.join(model_path, checkpoints[idx]) 82 | 83 | 84 | def disable_rdkit_logging(): 85 | """ 86 | Disables RDKit whiny logging. 87 | """ 88 | logger = rkl.logger() 89 | logger.setLevel(rkl.ERROR) 90 | rkrb.DisableLog("rdApp.error") 91 | 92 | 93 | def find_safe_path(path: str) -> str: 94 | """Method to find a safe path that does not exist yet. 95 | Args: 96 | path (str): Desired path. 97 | Returns: 98 | str: Non existing path. 99 | """ 100 | safe_path = path 101 | c = 0 102 | while os.path.exists(safe_path): 103 | c += 1 104 | safe_path = ".".join( 105 | [ 106 | s if i != path.count(".") - 1 else f"{s}_v{c}" 107 | for i, s in enumerate(path.split(".")) 108 | ] 109 | ) 110 | return safe_path 111 | 112 | 113 | def get_equispaced_ranges( 114 | data_path: str, properties: List[str], n: int = 10, precisions: List[int] = [2] 115 | ) -> List[List[float]]: 116 | """ 117 | Given a path to a data file it returns the ranges of the properties. 118 | Args: 119 | data_path : Path to data file. 120 | properties: List of properties to consider. 121 | n: number of points per property (will be equally spaced). 122 | precisions: number of decimal places to round to (one per property). 123 | Returns: 124 | List of ranges for each property. 
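The docstring above leaves the data format implicit; judging from the parsing in the function body that follows, each line pairs a property token, its value, and the sequence. A hedged sketch (the property name and line layout are assumptions, in the spirit of examples/qed_property_example.txt):

# Assumed line layout: <property>value|sequence
line = "<qed>0.543|CC(C)CC1=CC=C(C=C1)C(C)C(=O)O"
value = float(line.split("<qed>")[-1].split("|")[0])  # -> 0.543, the same parse used below
# ranges = get_equispaced_ranges("examples/qed_property_example.txt", ["<qed>"], n=10, precisions=[2])
# would then yield 10 equispaced, 2-decimal query values spanning the observed <qed> range.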
125 | """ 126 | with open(data_path, "r") as f: 127 | data = f.readlines() 128 | 129 | ranges = [] 130 | 131 | for prop, pre in zip(properties, precisions): 132 | 133 | values = [float(line.split(prop)[-1].split("|")[0]) for line in data] 134 | _range = [] 135 | for x in np.linspace(np.min(values), np.max(values), n): 136 | if pre == 1: 137 | _range.append(f"{x:.1f}") 138 | elif pre == 2: 139 | _range.append(f"{x:.2f}") 140 | elif pre == 3: 141 | _range.append(f"{x:.3f}") 142 | elif pre == 4: 143 | _range.append(f"{x:.4f}") 144 | ranges.append(_range) 145 | return ranges 146 | -------------------------------------------------------------------------------- /training_configs/qed_alternated_cc.json: -------------------------------------------------------------------------------- 1 | { 2 | "reset_training_loss": true, 3 | "alternate_tasks": true, 4 | "cc_loss": true, 5 | "property_tokens": [ 6 | "" 7 | ], 8 | "alternate_steps": 50, 9 | "checkpoint-str": "best", 10 | "cg_collator": "vanilla_cg", 11 | "cg_collator_params": { 12 | "do_sample": false, 13 | "property_tokens": [ 14 | "" 15 | ], 16 | "plm_probability": 0.4, 17 | "max_span_length": 12 18 | } 19 | } -------------------------------------------------------------------------------- /training_configs/qed_proponly.json: -------------------------------------------------------------------------------- 1 | { 2 | "reset_training_loss": true, 3 | "alternate_tasks": false, 4 | "task": "proponly", 5 | "checkpoint-str": "pearson", 6 | "property_tokens": [ 7 | "" 8 | ] 9 | } -------------------------------------------------------------------------------- /training_configs/reactions_alternating_cc.json: -------------------------------------------------------------------------------- 1 | { 2 | "reset_training_loss": true, 3 | "alternate_tasks": true, 4 | "cg_collator": "multientity_cg", 5 | "cc_loss": true, 6 | "property_tokens": [ 7 | "" 8 | ], 9 | "alternate_steps": 50, 10 | "cg_collator_params": { 11 | "do_sample": false, 12 | "property_tokens": [ 13 | "" 14 | ], 15 | "plm_probability": 1.0, 16 | "max_span_length": 7, 17 | "entity_to_mask": -1, 18 | "entity_separator_token": "" 19 | } 20 | } -------------------------------------------------------------------------------- /vocabs/proteins.txt: -------------------------------------------------------------------------------- 1 | 2 | [PAD] 3 | [unused1] 4 | [unused2] 5 | [unused3] 6 | [unused4] 7 | [unused5] 8 | [unused6] 9 | [unused7] 10 | [unused8] 11 | [unused9] 12 | 13 | [UNK] 14 | [CLS] 15 | [SEP] 16 | [MASK] 17 | | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | _._ 28 | _0_-0_ 29 | _0_-1_ 30 | _0_-2_ 31 | _0_-3_ 32 | _0_-4_ 33 | _0_-5_ 34 | _0_0_ 35 | _0_1_ 36 | _0_2_ 37 | _0_3_ 38 | _0_4_ 39 | _0_5_ 40 | _1_-0_ 41 | _1_-1_ 42 | _1_-2_ 43 | _1_-3_ 44 | _1_-4_ 45 | _1_-5_ 46 | _1_-6_ 47 | _1_0_ 48 | _1_1_ 49 | _1_2_ 50 | _1_3_ 51 | _1_4_ 52 | _1_5_ 53 | _2_-0_ 54 | _2_-1_ 55 | _2_-2_ 56 | _2_-3_ 57 | _2_-4_ 58 | _2_-5_ 59 | _2_-6_ 60 | _2_0_ 61 | _2_1_ 62 | _2_2_ 63 | _2_3_ 64 | _2_4_ 65 | _2_5_ 66 | _3_-0_ 67 | _3_-1_ 68 | _3_-2_ 69 | _3_-3_ 70 | _3_-4_ 71 | _3_-5_ 72 | _3_-6_ 73 | _3_0_ 74 | _3_1_ 75 | _3_2_ 76 | _3_3_ 77 | _3_4_ 78 | _3_5_ 79 | _4_-0_ 80 | _4_-1_ 81 | _4_-2_ 82 | _4_-3_ 83 | _4_-4_ 84 | _4_-5_ 85 | _4_-6_ 86 | _4_0_ 87 | _4_1_ 88 | _4_2_ 89 | _4_3_ 90 | _4_4_ 91 | _4_5_ 92 | _5_-0_ 93 | _5_-1_ 94 | _5_-2_ 95 | _5_-3_ 96 | _5_-4_ 97 | _5_-5_ 98 | _5_-6_ 99 | _5_0_ 100 | _5_1_ 101 | _5_2_ 102 | _5_3_ 103 | _5_4_ 104 | _5_5_ 105 | _6_-0_ 106 | _6_-1_ 107 | _6_-2_ 108 | _6_-3_ 
109 | _6_-4_ 110 | _6_-5_ 111 | _6_-6_ 112 | _6_0_ 113 | _6_1_ 114 | _6_2_ 115 | _6_3_ 116 | _6_4_ 117 | _6_5_ 118 | _7_-0_ 119 | _7_-1_ 120 | _7_-2_ 121 | _7_-3_ 122 | _7_-4_ 123 | _7_-5_ 124 | _7_-6_ 125 | _7_0_ 126 | _7_1_ 127 | _7_2_ 128 | _7_3_ 129 | _7_4_ 130 | _7_5_ 131 | _8_-0_ 132 | _8_-1_ 133 | _8_-2_ 134 | _8_-3_ 135 | _8_-4_ 136 | _8_-5_ 137 | _8_-6_ 138 | _8_0_ 139 | _8_1_ 140 | _8_2_ 141 | _8_3_ 142 | _8_4_ 143 | _8_5_ 144 | _9_-0_ 145 | _9_-1_ 146 | _9_-2_ 147 | _9_-3_ 148 | _9_-4_ 149 | _9_-5_ 150 | _9_-6_ 151 | _9_0_ 152 | _9_1_ 153 | _9_2_ 154 | _9_3_ 155 | _9_4_ 156 | _9_5_ 157 | A 158 | B 159 | C 160 | D 161 | E 162 | F 163 | G 164 | H 165 | I 166 | J 167 | K 168 | L 169 | M 170 | N 171 | O 172 | P 173 | Q 174 | R 175 | S 176 | T 177 | U 178 | V 179 | W 180 | X 181 | Y 182 | Z 183 | a 184 | b 185 | c 186 | d 187 | e 188 | f 189 | g 190 | h 191 | i 192 | j 193 | k 194 | l 195 | m 196 | n 197 | o 198 | p 199 | q 200 | r 201 | s 202 | t 203 | u 204 | v 205 | w 206 | x 207 | y 208 | z 209 | [Branch2_1] 210 | [=O] 211 | [epsilon] 212 | [Ring1] 213 | [=C] 214 | [Ring2] 215 | [Branch1_3] 216 | [N] 217 | [Branch1_1] 218 | [C] 219 | [=N] 220 | [Branch2_3] 221 | [Branch1_2] 222 | [#N] 223 | [Br] 224 | [O] 225 | [Branch2_2] 226 | [F] 227 | [S] 228 | [=S] 229 | [#C] 230 | [Cl] 231 | [O-expl] 232 | [N+expl] 233 | [P] 234 | [.] 235 | [I] 236 | [c] 237 | [-c] 238 | [s] 239 | [nHexpl] 240 | [\c] 241 | [n] 242 | [\C] 243 | [o] 244 | [C@@Hexpl] 245 | [C@expl] 246 | [C@@expl] 247 | [C@Hexpl] 248 | [/C] 249 | [/c] 250 | [Ptexpl] 251 | [\N] 252 | [\C@@Hexpl] 253 | [/C@Hexpl] 254 | [\C@Hexpl] 255 | [-n] 256 | [=c] 257 | [B] 258 | [\S] 259 | [/n] 260 | [=N+expl] 261 | [Expl\Ring2] 262 | [Expl/Ring1] 263 | [n+expl] 264 | [Expl\Ring1] 265 | [Asexpl] 266 | [N@@expl] 267 | [S@@expl] 268 | [/O] 269 | [Expl-Ring1] 270 | [/N] 271 | [S+expl] 272 | [/S] 273 | [Pexpl] 274 | [=Nexpl] 275 | [#O+expl] 276 | [C-expl] 277 | [Iexpl] 278 | [O+expl] 279 | [Brexpl] 280 | [Clexpl] 281 | [=N-expl] 282 | [N-expl] 283 | [P+expl] 284 | [Oexpl] 285 | [#C-expl] 286 | [=Oexpl] 287 | [#N+expl] 288 | [=Iexpl] 289 | [CH-expl] 290 | [P-expl] 291 | [s+expl] 292 | [=P] 293 | [=I++expl] 294 | [o+expl] 295 | [=O+expl] 296 | [CH2expl] 297 | [=S+expl] 298 | [I+expl] 299 | [IHexpl] 300 | [CHexpl] 301 | [-n+expl] 302 | [=CHexpl] 303 | [=O-expl] 304 | [c-expl] 305 | [S-expl] 306 | [p] 307 | [Nexpl] 308 | [Cexpl] 309 | [=P+expl] 310 | [n-expl] 311 | [cH-expl] 312 | [B-expl] 313 | [Expl-Ring2] 314 | [C+expl] 315 | [c+expl] 316 | [=n+expl] 317 | [NH-expl] 318 | [NH2+expl] 319 | [Expl/Ring2] 320 | [Expl=Ring1] 321 | [Cl-expl] 322 | [Na+expl] 323 | [Hexpl] 324 | [NH4+expl] 325 | [Hgexpl] 326 | [\O] 327 | [Br-expl] 328 | [N@expl] 329 | [Ca++expl] 330 | [Snexpl] 331 | [I-expl] 332 | [Co+expl] 333 | [N@@+expl] 334 | [K+expl] 335 | [Fe--expl] 336 | [\Hexpl] 337 | [N@+expl] 338 | [Fe+3expl] 339 | [Gd+3expl] 340 | [/N+expl] 341 | [NH+expl] 342 | [=NH+expl] 343 | [Zn++expl] 344 | [/Br] 345 | [/Cl] 346 | [/C@@Hexpl] 347 | [\N+expl] 348 | [NH3+expl] 349 | [Alexpl] 350 | [Hg++expl] 351 | [Cu++expl] 352 | [Znexpl] 353 | [Au-expl] 354 | [Auexpl] 355 | [Crexpl] 356 | [Cd++expl] 357 | [Cdexpl] 358 | [Siexpl] 359 | [Sbexpl] 360 | [Seexpl] 361 | [=Seexpl] 362 | [Cuexpl] 363 | [Li+expl] 364 | [Tl+expl] 365 | [Biexpl] 366 | [Inexpl] 367 | [/Hexpl] 368 | [Caexpl] 369 | [Dyexpl] 370 | [Co++expl] 371 | [Cr+3expl] 372 | [Fe++expl] 373 | [Pt-2expl] 374 | [Sb+3expl] 375 | [Be++expl] 376 | [Mg++expl] 377 | [Tiexpl] 378 | [Fe-expl] 379 | [Ndexpl] 380 | [Pdexpl] 
381 | [#Inexpl] 382 | [Ba++expl] 383 | [H+expl] 384 | [Mn+expl] 385 | [Mn++expl] 386 | [SiHexpl] 387 | [\Cl] 388 | [Ni++expl] 389 | [Zrexpl] 390 | [Niexpl] 391 | [PbH2++expl] 392 | [Ybexpl] 393 | [Naexpl] 394 | [=Moexpl] 395 | [=Cdexpl] 396 | [Cu+expl] 397 | [Geexpl] 398 | [Baexpl] 399 | [=Crexpl] 400 | [Cr++expl] 401 | [OH-expl] 402 | [SnH2++expl] 403 | [Mg+2expl] 404 | [=Siexpl] 405 | [\Br] 406 | [\C@expl] 407 | [Vexpl] 408 | [Ag+expl] 409 | [\C@@expl] 410 | [Pt+2expl] 411 | [2Hexpl] 412 | [Ti++expl] 413 | [Sr++expl] 414 | [=Auexpl] 415 | [Ruexpl] 416 | [\O-expl] 417 | [P@expl] 418 | [Liexpl] 419 | [/C@@expl] 420 | [As+expl] 421 | [\Siexpl] 422 | [/Alexpl] 423 | [\O+expl] 424 | [/Crexpl] 425 | [/Feexpl] 426 | [Euexpl] 427 | [Scexpl] 428 | [Zn+2expl] 429 | [Ca+2expl] 430 | [Hg+2expl] 431 | [=Zrexpl] 432 | [nH+expl] 433 | [Cl+3expl] 434 | [Ba+2expl] 435 | [TlH2+expl] 436 | [Fe+2expl] 437 | [AlH3expl] 438 | [=PHexpl] 439 | [Co+2expl] 440 | [Cu+2expl] 441 | [PbH2+2expl] 442 | [\s] 443 | [Ni+2expl] 444 | [Cd+2expl] 445 | [SnH2+2expl] 446 | [Ti+2expl] 447 | [PHexpl] 448 | [Mn+2expl] 449 | [Sr+2expl] 450 | [Be+2expl] 451 | [seexpl] 452 | [Cr+2expl] 453 | [=Biexpl] 454 | [=C-expl] 455 | [SbH6+3expl] 456 | [\n] 457 | [Fe-2expl] 458 | [=OH+expl] 459 | [-c-expl] 460 | [/s] 461 | [=NH2+expl] 462 | [#S+expl] 463 | [/F] 464 | [F-expl] 465 | [SH-expl] 466 | [CH+expl] 467 | [\NH+expl] 468 | [\CH-expl] 469 | [\c-expl] 470 | [/o] 471 | [CH2-expl] 472 | [\N-expl] 473 | [/n-expl] 474 | [\C-expl] 475 | [/NH+expl] 476 | [/N-expl] 477 | [\F] 478 | [Gd-4expl] 479 | [Gd-5expl] 480 | [N@@H+expl] 481 | [SiH3expl] 482 | [Branch3_3] 483 | [Sexpl] 484 | [\I] 485 | [BiH3expl] 486 | [SeHexpl] 487 | [SiH2expl] 488 | [Feexpl] 489 | [S@expl] 490 | [\P] 491 | [/nHexpl] 492 | [SH+expl] 493 | [-oexpl] 494 | [-sexpl] 495 | [Kexpl] 496 | [=S@@expl] 497 | [*expl] 498 | [CH2+expl] 499 | [S@+expl] 500 | [S@@+expl] 501 | [-nexpl] 502 | [P@@expl] 503 | [/I] 504 | [Reexpl] 505 | [=SH+expl] 506 | [/CH-expl] 507 | [\nHexpl] 508 | [=C@@expl] 509 | [N@H+expl] -------------------------------------------------------------------------------- /vocabs/reactions.txt: -------------------------------------------------------------------------------- 1 | [PAD] 2 | _-_ 3 | _+_ 4 | 5 | 6 | 7 | 8 | [Agexpl] 9 | 10 | 11 | 12 | [UNK] 13 | [CLS] 14 | [SEP] 15 | [MASK] 16 | | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | _._ 27 | _0_-0_ 28 | _0_-1_ 29 | _0_-2_ 30 | _0_-3_ 31 | _0_-4_ 32 | _0_-5_ 33 | _0_0_ 34 | _0_1_ 35 | _0_2_ 36 | _0_3_ 37 | _0_4_ 38 | _0_5_ 39 | _1_-0_ 40 | _1_-1_ 41 | _1_-2_ 42 | _1_-3_ 43 | _1_-4_ 44 | _1_-5_ 45 | _1_-6_ 46 | _1_0_ 47 | _1_1_ 48 | _1_2_ 49 | _1_3_ 50 | _1_4_ 51 | _1_5_ 52 | _2_-0_ 53 | _2_-1_ 54 | _2_-2_ 55 | _2_-3_ 56 | _2_-4_ 57 | _2_-5_ 58 | _2_-6_ 59 | _2_0_ 60 | _2_1_ 61 | _2_2_ 62 | _2_3_ 63 | _2_4_ 64 | _2_5_ 65 | _3_-0_ 66 | _3_-1_ 67 | _3_-2_ 68 | _3_-3_ 69 | _3_-4_ 70 | _3_-5_ 71 | _3_-6_ 72 | _3_0_ 73 | _3_1_ 74 | _3_2_ 75 | _3_3_ 76 | _3_4_ 77 | _3_5_ 78 | _4_-0_ 79 | _4_-1_ 80 | _4_-2_ 81 | _4_-3_ 82 | _4_-4_ 83 | _4_-5_ 84 | _4_-6_ 85 | _4_0_ 86 | _4_1_ 87 | _4_2_ 88 | _4_3_ 89 | _4_4_ 90 | _4_5_ 91 | _5_-0_ 92 | _5_-1_ 93 | _5_-2_ 94 | _5_-3_ 95 | _5_-4_ 96 | _5_-5_ 97 | _5_-6_ 98 | _5_0_ 99 | _5_1_ 100 | _5_2_ 101 | _5_3_ 102 | _5_4_ 103 | _5_5_ 104 | _6_-0_ 105 | _6_-1_ 106 | _6_-2_ 107 | _6_-3_ 108 | _6_-4_ 109 | _6_-5_ 110 | _6_-6_ 111 | _6_0_ 112 | _6_1_ 113 | _6_2_ 114 | _6_3_ 115 | _6_4_ 116 | _6_5_ 117 | _7_-0_ 118 | _7_-1_ 119 | _7_-2_ 120 | _7_-3_ 121 | _7_-4_ 122 | _7_-5_ 123 | _7_-6_ 
124 | _7_0_ 125 | _7_1_ 126 | _7_2_ 127 | _7_3_ 128 | _7_4_ 129 | _7_5_ 130 | _8_-0_ 131 | _8_-1_ 132 | _8_-2_ 133 | _8_-3_ 134 | _8_-4_ 135 | _8_-5_ 136 | _8_-6_ 137 | _8_0_ 138 | _8_1_ 139 | _8_2_ 140 | _8_3_ 141 | _8_4_ 142 | _8_5_ 143 | _9_-0_ 144 | _9_-1_ 145 | _9_-2_ 146 | _9_-3_ 147 | _9_-4_ 148 | _9_-5_ 149 | _9_-6_ 150 | _9_0_ 151 | _9_1_ 152 | _9_2_ 153 | _9_3_ 154 | _9_4_ 155 | _9_5_ 156 | [Branch2_1] 157 | [=O] 158 | [epsilon] 159 | [Ring1] 160 | [=C] 161 | [Ring2] 162 | [Branch1_3] 163 | [N] 164 | [Branch1_1] 165 | [C] 166 | [=N] 167 | [Branch2_3] 168 | [Branch1_2] 169 | [#N] 170 | [Br] 171 | [O] 172 | [Branch2_2] 173 | [F] 174 | [S] 175 | [=S] 176 | [#C] 177 | [Cl] 178 | [O-expl] 179 | [N+expl] 180 | [P] 181 | [.] 182 | [I] 183 | [c] 184 | [-c] 185 | [s] 186 | [nHexpl] 187 | [\c] 188 | [n] 189 | [\C] 190 | [o] 191 | [C@@Hexpl] 192 | [C@expl] 193 | [C@@expl] 194 | [C@Hexpl] 195 | [/C] 196 | [/c] 197 | [Ptexpl] 198 | [\N] 199 | [\C@@Hexpl] 200 | [/C@Hexpl] 201 | [\C@Hexpl] 202 | [-n] 203 | [=c] 204 | [B] 205 | [\S] 206 | [/n] 207 | [=N+expl] 208 | [Expl\Ring2] 209 | [Expl/Ring1] 210 | [n+expl] 211 | [Expl\Ring1] 212 | [Asexpl] 213 | [N@@expl] 214 | [S@@expl] 215 | [/O] 216 | [Expl-Ring1] 217 | [/N] 218 | [S+expl] 219 | [/S] 220 | [Pexpl] 221 | [=Nexpl] 222 | [#O+expl] 223 | [C-expl] 224 | [Iexpl] 225 | [O+expl] 226 | [Brexpl] 227 | [Clexpl] 228 | [=N-expl] 229 | [N-expl] 230 | [P+expl] 231 | [Oexpl] 232 | [#C-expl] 233 | [=Oexpl] 234 | [#N+expl] 235 | [=Iexpl] 236 | [CH-expl] 237 | [P-expl] 238 | [s+expl] 239 | [=P] 240 | [=I++expl] 241 | [o+expl] 242 | [=O+expl] 243 | [CH2expl] 244 | [=S+expl] 245 | [I+expl] 246 | [IHexpl] 247 | [CHexpl] 248 | [-n+expl] 249 | [=CHexpl] 250 | [=O-expl] 251 | [c-expl] 252 | [S-expl] 253 | [p] 254 | [Nexpl] 255 | [Cexpl] 256 | [=P+expl] 257 | [n-expl] 258 | [cH-expl] 259 | [B-expl] 260 | [Expl-Ring2] 261 | [C+expl] 262 | [c+expl] 263 | [=n+expl] 264 | [NH-expl] 265 | [NH2+expl] 266 | [Expl/Ring2] 267 | [Expl=Ring1] 268 | [Cl-expl] 269 | [Na+expl] 270 | [Hexpl] 271 | [NH4+expl] 272 | [Hgexpl] 273 | [\O] 274 | [Br-expl] 275 | [N@expl] 276 | [Ca++expl] 277 | [Snexpl] 278 | [I-expl] 279 | [Co+expl] 280 | [N@@+expl] 281 | [K+expl] 282 | [Fe--expl] 283 | [\Hexpl] 284 | [N@+expl] 285 | [Fe+3expl] 286 | [Gd+3expl] 287 | [/N+expl] 288 | [NH+expl] 289 | [=NH+expl] 290 | [Zn++expl] 291 | [/Br] 292 | [/Cl] 293 | [/C@@Hexpl] 294 | [\N+expl] 295 | [NH3+expl] 296 | [Alexpl] 297 | [Hg++expl] 298 | [Cu++expl] 299 | [Znexpl] 300 | [Au-expl] 301 | [Auexpl] 302 | [Crexpl] 303 | [Cd++expl] 304 | [Cdexpl] 305 | [Siexpl] 306 | [Sbexpl] 307 | [Seexpl] 308 | [=Seexpl] 309 | [Cuexpl] 310 | [Li+expl] 311 | [Tl+expl] 312 | [Biexpl] 313 | [Inexpl] 314 | [/Hexpl] 315 | [Caexpl] 316 | [Dyexpl] 317 | [Co++expl] 318 | [Cr+3expl] 319 | [Fe++expl] 320 | [Pt-2expl] 321 | [Sb+3expl] 322 | [Be++expl] 323 | [Mg++expl] 324 | [Tiexpl] 325 | [Fe-expl] 326 | [Ndexpl] 327 | [Pdexpl] 328 | [#Inexpl] 329 | [Ba++expl] 330 | [H+expl] 331 | [Mn+expl] 332 | [Mn++expl] 333 | [SiHexpl] 334 | [\Cl] 335 | [Ni++expl] 336 | [Zrexpl] 337 | [Niexpl] 338 | [PbH2++expl] 339 | [Ybexpl] 340 | [Naexpl] 341 | [=Moexpl] 342 | [=Cdexpl] 343 | [Cu+expl] 344 | [Geexpl] 345 | [Baexpl] 346 | [=Crexpl] 347 | [Cr++expl] 348 | [OH-expl] 349 | [SnH2++expl] 350 | [Mg+2expl] 351 | [=Siexpl] 352 | [\Br] 353 | [\C@expl] 354 | [Vexpl] 355 | [Ag+expl] 356 | [\C@@expl] 357 | [Pt+2expl] 358 | [2Hexpl] 359 | [Ti++expl] 360 | [Sr++expl] 361 | [=Auexpl] 362 | [Ruexpl] 363 | [\O-expl] 364 | [P@expl] 365 | [Liexpl] 366 | [/C@@expl] 
367 | [As+expl] 368 | [\Siexpl] 369 | [/Alexpl] 370 | [\O+expl] 371 | [/Crexpl] 372 | [/Feexpl] 373 | [Euexpl] 374 | [Scexpl] 375 | [Zn+2expl] 376 | [Ca+2expl] 377 | [Hg+2expl] 378 | [=Zrexpl] 379 | [nH+expl] 380 | [Cl+3expl] 381 | [Ba+2expl] 382 | [TlH2+expl] 383 | [Fe+2expl] 384 | [AlH3expl] 385 | [=PHexpl] 386 | [Co+2expl] 387 | [Cu+2expl] 388 | [PbH2+2expl] 389 | [\s] 390 | [Ni+2expl] 391 | [Cd+2expl] 392 | [SnH2+2expl] 393 | [Ti+2expl] 394 | [PHexpl] 395 | [Mn+2expl] 396 | [Sr+2expl] 397 | [Be+2expl] 398 | [seexpl] 399 | [Cr+2expl] 400 | [=Biexpl] 401 | [=C-expl] 402 | [SbH6+3expl] 403 | [\n] 404 | [Fe-2expl] 405 | [=OH+expl] 406 | [-c-expl] 407 | [/s] 408 | [=NH2+expl] 409 | [#S+expl] 410 | [/F] 411 | [F-expl] 412 | [SH-expl] 413 | [CH+expl] 414 | [\NH+expl] 415 | [\CH-expl] 416 | [\c-expl] 417 | [/o] 418 | [CH2-expl] 419 | [\N-expl] 420 | [/n-expl] 421 | [\C-expl] 422 | [/NH+expl] 423 | [/N-expl] 424 | [\F] 425 | [Gd-4expl] 426 | [Gd-5expl] 427 | [N@@H+expl] 428 | [SiH3expl] 429 | [Branch3_3] 430 | [Sexpl] 431 | [\I] 432 | [BiH3expl] 433 | [SeHexpl] 434 | [SiH2expl] 435 | [Feexpl] 436 | [S@expl] 437 | [\P] 438 | [/nHexpl] 439 | [SH+expl] 440 | [-oexpl] 441 | [-sexpl] 442 | [Kexpl] 443 | [=S@@expl] 444 | [*expl] 445 | [CH2+expl] 446 | [S@+expl] 447 | [S@@+expl] 448 | [-nexpl] 449 | [P@@expl] 450 | [/I] 451 | [Reexpl] 452 | [=SH+expl] 453 | [/CH-expl] 454 | [\nHexpl] 455 | [=C@@expl] 456 | [N@H+expl] 457 | [Teexpl] 458 | [Osexpl] 459 | [=Ru-expl] 460 | [Re-expl] 461 | [Zn+expl] 462 | [nexpl] 463 | [Mnexpl] 464 | [BH-expl] 465 | [\NH-expl] 466 | [BH3-expl] 467 | [11CH3expl] 468 | [/O-expl] 469 | [PH+expl] 470 | [Wexpl] 471 | [OH+expl] 472 | [/P] 473 | [Nb--expl] 474 | [Pt--expl] 475 | [Fe-3expl] 476 | [Al-3expl] 477 | [Cu-expl] 478 | [Ag-expl] 479 | [As-expl] 480 | [Pd--expl] 481 | [Se-expl] 482 | [cexpl] 483 | [3Hexpl] 484 | [\B] 485 | [Ring3] 486 | [=B] 487 | [Coexpl] 488 | [BH2-expl] 489 | [125Iexpl] 490 | [18Fexpl] 491 | [=CH+expl] 492 | [/C@expl] 493 | [=Ruexpl] 494 | [\n+expl] 495 | [\CH+expl] 496 | [Moexpl] 497 | [cH+expl] 498 | 499 | 500 | 501 | 502 | [NHexpl] 503 | [Expl=Ring2] 504 | . 
505 | [Branch3_1] 506 | [Branch3_2] 507 | [Expl=Ring3] 508 | [Pr+3expl] 509 | [=Pbexpl] 510 | [\NHexpl] 511 | [=Ag+expl] 512 | [P+3expl] 513 | [=Reexpl] 514 | [Pt-expl] 515 | [=V+3expl] 516 | [SH2+expl] 517 | [=Tiexpl] 518 | [Ag+2expl] 519 | [U+6expl] 520 | [=SH2expl] 521 | [PH2+expl] 522 | [Sm+2expl] 523 | [Hf+3expl] 524 | [=PH3expl] 525 | [=Mo+2expl] 526 | [Y+3expl] 527 | [=V+2expl] 528 | [Ga+3expl] 529 | [=SiHexpl] 530 | [\S@@expl] 531 | [Fe+5expl] 532 | [Cr+4expl] 533 | [=Se+expl] 534 | [SeH+expl] 535 | [#Sbexpl] 536 | [Fe+4expl] 537 | [PH2-expl] 538 | [Ru+2expl] 539 | [=Al-expl] 540 | [AlH-expl] 541 | [Zr+4expl] 542 | [Ag+3expl] 543 | [#Si+expl] 544 | [=Zr+2expl] 545 | [Hf+4expl] 546 | [=Mnexpl] 547 | [Ceexpl] 548 | [=PH2expl] 549 | [SH2expl] 550 | [=As+3expl] 551 | [AsH2expl] 552 | [Ce+3expl] 553 | [I+3expl] 554 | [=Pd-2expl] 555 | [Taexpl] 556 | [131Iexpl] 557 | [32PH3expl] 558 | [Sn+2expl] 559 | [Nb+5expl] 560 | [=Agexpl] 561 | [=Sbexpl] 562 | [Ga+2expl] 563 | [=Bi+expl] 564 | [SnHexpl] 565 | [=Au-expl] 566 | [Bi+2expl] 567 | [Br+2expl] 568 | [=Niexpl] 569 | [229Thexpl] 570 | [P+2expl] 571 | [=Hfexpl] 572 | [Ti+6expl] 573 | [PH2expl] 574 | [11CH4expl] 575 | [V+5expl] 576 | [Ta+2expl] 577 | [Cd+expl] 578 | [Ir+3expl] 579 | [=Pd-3expl] 580 | [/Siexpl] 581 | [=SiH2expl] 582 | [BH4-expl] 583 | [=Ptexpl] 584 | [/S@expl] 585 | [=Geexpl] 586 | [GeH3expl] 587 | [=Cuexpl] 588 | [Al+3expl] 589 | [HeHexpl] 590 | [=Vexpl] 591 | [Ru+expl] 592 | [Fe+expl] 593 | [/S-expl] 594 | [Zr+3expl] 595 | [PH3expl] 596 | [Pb+2expl] 597 | [Gaexpl] 598 | [Sb+expl] 599 | [=Teexpl] 600 | [Rh+4expl] 601 | [AsH+expl] 602 | [=Pt+3expl] 603 | [=I] 604 | [#CH3+3expl] 605 | [\S@expl] 606 | [=Re+5expl] 607 | [Sb-expl] 608 | [=Pd-expl] 609 | [Mn+4expl] 610 | [=Znexpl] 611 | [IH2expl] 612 | [33PH3expl] 613 | [=AsH2expl] 614 | [=Hgexpl] 615 | [Pd-expl] 616 | [Se+expl] 617 | [=Wexpl] 618 | [Si@Hexpl] 619 | [=Irexpl] 620 | [AlH2-expl] 621 | [Sn+6expl] 622 | [Cu+4expl] 623 | [TaH3expl] 624 | [TeHexpl] 625 | [Er+3expl] 626 | [CH3+expl] 627 | [P+5expl] 628 | [=Sb+expl] 629 | [BrH+expl] 630 | [Ga+expl] 631 | [CuH2-expl] 632 | [=Pdexpl] 633 | [=Au-2expl] 634 | [=C+expl] 635 | [Fe-4expl] 636 | [S-2expl] 637 | [Cs+expl] 638 | [Ti+5expl] 639 | [Co+3expl] 640 | [Mn+6expl] 641 | [AsH3expl] 642 | [ClH+expl] 643 | [SnH4expl] 644 | [60Coexpl] 645 | [Pd-4expl] 646 | [Au+3expl] 647 | [#B] 648 | [=Hf+2expl] 649 | [GeHexpl] 650 | [=Cu-2expl] 651 | [Ni+4expl] 652 | [=SHexpl] 653 | [SiH4expl] 654 | [13CH4expl] 655 | [P-3expl] 656 | [Pd+2expl] 657 | [=Coexpl] 658 | [13NH3expl] 659 | [=Al+expl] 660 | [Au+expl] 661 | [Tc+6expl] 662 | [=Alexpl] 663 | [SH3+expl] 664 | [Mn+5expl] 665 | [PH4+expl] 666 | [\Mgexpl] 667 | [/B-expl] 668 | [Rh+expl] 669 | [Heexpl] 670 | [Ni+3expl] 671 | [SnH3expl] 672 | [Sm+3expl] 673 | [Tlexpl] 674 | [Smexpl] 675 | [Al-expl] 676 | [In+3expl] 677 | [Arexpl] 678 | [Sn+3expl] 679 | [SeH-expl] 680 | [V+expl] 681 | [Nd+expl] 682 | [=SnH2expl] 683 | [=Thexpl] 684 | [Pt+expl] 685 | [Cu+3expl] 686 | [I+2expl] 687 | [=Re+expl] 688 | [CH3-expl] 689 | [=Pt-expl] 690 | [Xeexpl] 691 | [GeH2expl] 692 | [18FHexpl] 693 | [IH2+expl] 694 | [Cl+expl] 695 | [Br+expl] 696 | [=Ti+2expl] 697 | [Sn+5expl] 698 | [=S@expl] 699 | [=99Tcexpl] 700 | [Si@@Hexpl] 701 | [Sc+3expl] 702 | [Cr-expl] 703 | [Yexpl] 704 | [13Cexpl] 705 | [Mn+3expl] 706 | [PH5expl] 707 | [Ir+expl] 708 | [Ti+3expl] 709 | [Hfexpl] 710 | [Ir-4expl] 711 | [/B] 712 | [Eu+3expl] 713 | [AlHexpl] 714 | [Ti+expl] 715 | >> 716 | [Csexpl] 717 | [=Rhexpl] 718 | [Thexpl] 719 | [/S+expl] 
720 | [=Feexpl] 721 | [Al+2expl] 722 | [/NHexpl] 723 | [Al+expl] 724 | [=Ceexpl] 725 | [Neexpl] 726 | [=Mgexpl] 727 | [Ni+expl] 728 | [Mg+expl] 729 | [Mo+4expl] 730 | [=Se-expl] 731 | [Rhexpl] 732 | [Sb+5expl] 733 | [B+expl] 734 | [Zr+2expl] 735 | [N+3expl] 736 | [Ce+2expl] 737 | [=I+3expl] 738 | [MgHexpl] 739 | [Yb+2expl] 740 | [OHexpl] 741 | [Mgexpl] 742 | [Pd+expl] 743 | [Pb+3expl] 744 | [Si-expl] 745 | [AlH2+expl] 746 | [=Ag-expl] 747 | [Laexpl] 748 | [Sn+4expl] 749 | [=U+2expl] 750 | [=P+3expl] 751 | [Si@@expl] 752 | [U+4expl] 753 | [Srexpl] 754 | [Bi+3expl] 755 | [Rh+3expl] 756 | [B+3expl] 757 | [#Wexpl] 758 | [=Osexpl] 759 | [=W-2expl] 760 | [=Snexpl] 761 | [Nbexpl] 762 | [Tbexpl] 763 | [Rh+2expl] 764 | [Bexpl] 765 | [=Ru-2expl] 766 | [Gdexpl] 767 | [#Tiexpl] 768 | [99Tcexpl] 769 | [=Ru-4expl] 770 | [Hf+2expl] 771 | [Fe+6expl] 772 | [Th+4expl] 773 | [C+4expl] 774 | [1HHexpl] 775 | [Tb+3expl] 776 | [/S@@expl] 777 | [In+expl] 778 | [Expl#Ring2] 779 | [Ti+4expl] 780 | [=Rh+expl] 781 | [Si+2expl] 782 | [Re+5expl] 783 | [Dy+3expl] 784 | [Expl#Ring1] 785 | [Irexpl] 786 | [/Snexpl] 787 | [Se-2expl] 788 | [=Cu-expl] 789 | [=Taexpl] 790 | [32Pexpl] 791 | [W+6expl] 792 | [Ta+5expl] 793 | [=Ag-2expl] 794 | [Si@expl] 795 | [=CH2+expl] 796 | [Uexpl] 797 | [#S] 798 | [#Zrexpl] 799 | [Tl+2expl] 800 | [I+7expl] 801 | [Pd-3expl] 802 | [Pb+4expl] 803 | [B+2expl] 804 | [SnH2expl] 805 | [Tl+3expl] 806 | [Ru-2expl] 807 | [Pd+3expl] 808 | [Cr+6expl] 809 | [#Niexpl] 810 | [=SeH-expl] 811 | [V+2expl] 812 | [V+3expl] 813 | [\SHexpl] 814 | [AlH4-expl] 815 | [OH2+expl] 816 | [H-expl] 817 | [O-2expl] 818 | [\Snexpl] 819 | [AsH4+expl] 820 | [=IH2expl] 821 | [NiH6-5expl] 822 | [Rb+expl] 823 | [As+3expl] 824 | [SiH-expl] 825 | [SHexpl] 826 | [La+3expl] 827 | [Yb+3expl] 828 | [2H-expl] 829 | [Pbexpl] 830 | [Prexpl] 831 | [Sb+2expl] 832 | [IH+expl] 833 | [Ni-expl] 834 | [PdH2expl] 835 | [Hg+expl] 836 | [PH4expl] 837 | [Nd+3expl] 838 | [15OH2expl] 839 | [V+4expl] 840 | [123I-expl] 841 | [Rh-expl] 842 | [14Cexpl] 843 | [PH3+expl] 844 | [Si+4expl] 845 | [Pt+4expl] 846 | [AsHexpl] 847 | [=Asexpl] 848 | [Ge+3expl] 849 | [Ce+4expl] 850 | [Pd-2expl] 851 | [Ru-expl] 852 | [Zr+expl] 853 | [Cl+2expl] 854 | [NH2-expl] 855 | [99Tc+4expl] 856 | [OH3+expl] 857 | [Sn+expl] 858 | [Ru+3expl] 859 | [=Tcexpl] 860 | [/Mgexpl] 861 | [Hg-2expl] -------------------------------------------------------------------------------- /vocabs/smallmolecules.txt: -------------------------------------------------------------------------------- 1 | [PAD] 2 | _-_ 3 | _+_ 4 | 5 | 6 | 7 | 8 | [Agexpl] 9 | 10 | 11 | [unused10] 12 | [UNK] 13 | [CLS] 14 | [SEP] 15 | [MASK] 16 | | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | _._ 27 | _0_-0_ 28 | _0_-1_ 29 | _0_-2_ 30 | _0_-3_ 31 | _0_-4_ 32 | _0_-5_ 33 | _0_0_ 34 | _0_1_ 35 | _0_2_ 36 | _0_3_ 37 | _0_4_ 38 | _0_5_ 39 | _1_-0_ 40 | _1_-1_ 41 | _1_-2_ 42 | _1_-3_ 43 | _1_-4_ 44 | _1_-5_ 45 | _1_-6_ 46 | _1_0_ 47 | _1_1_ 48 | _1_2_ 49 | _1_3_ 50 | _1_4_ 51 | _1_5_ 52 | _2_-0_ 53 | _2_-1_ 54 | _2_-2_ 55 | _2_-3_ 56 | _2_-4_ 57 | _2_-5_ 58 | _2_-6_ 59 | _2_0_ 60 | _2_1_ 61 | _2_2_ 62 | _2_3_ 63 | _2_4_ 64 | _2_5_ 65 | _3_-0_ 66 | _3_-1_ 67 | _3_-2_ 68 | _3_-3_ 69 | _3_-4_ 70 | _3_-5_ 71 | _3_-6_ 72 | _3_0_ 73 | _3_1_ 74 | _3_2_ 75 | _3_3_ 76 | _3_4_ 77 | _3_5_ 78 | _4_-0_ 79 | _4_-1_ 80 | _4_-2_ 81 | _4_-3_ 82 | _4_-4_ 83 | _4_-5_ 84 | _4_-6_ 85 | _4_0_ 86 | _4_1_ 87 | _4_2_ 88 | _4_3_ 89 | _4_4_ 90 | _4_5_ 91 | _5_-0_ 92 | _5_-1_ 93 | _5_-2_ 94 | _5_-3_ 95 | _5_-4_ 96 | _5_-5_ 97 | _5_-6_ 98 | _5_0_ 99 | 
_5_1_ 100 | _5_2_ 101 | _5_3_ 102 | _5_4_ 103 | _5_5_ 104 | _6_-0_ 105 | _6_-1_ 106 | _6_-2_ 107 | _6_-3_ 108 | _6_-4_ 109 | _6_-5_ 110 | _6_-6_ 111 | _6_0_ 112 | _6_1_ 113 | _6_2_ 114 | _6_3_ 115 | _6_4_ 116 | _6_5_ 117 | _7_-0_ 118 | _7_-1_ 119 | _7_-2_ 120 | _7_-3_ 121 | _7_-4_ 122 | _7_-5_ 123 | _7_-6_ 124 | _7_0_ 125 | _7_1_ 126 | _7_2_ 127 | _7_3_ 128 | _7_4_ 129 | _7_5_ 130 | _8_-0_ 131 | _8_-1_ 132 | _8_-2_ 133 | _8_-3_ 134 | _8_-4_ 135 | _8_-5_ 136 | _8_-6_ 137 | _8_0_ 138 | _8_1_ 139 | _8_2_ 140 | _8_3_ 141 | _8_4_ 142 | _8_5_ 143 | _9_-0_ 144 | _9_-1_ 145 | _9_-2_ 146 | _9_-3_ 147 | _9_-4_ 148 | _9_-5_ 149 | _9_-6_ 150 | _9_0_ 151 | _9_1_ 152 | _9_2_ 153 | _9_3_ 154 | _9_4_ 155 | _9_5_ 156 | [Branch2_1] 157 | [=O] 158 | [epsilon] 159 | [Ring1] 160 | [=C] 161 | [Ring2] 162 | [Branch1_3] 163 | [N] 164 | [Branch1_1] 165 | [C] 166 | [=N] 167 | [Branch2_3] 168 | [Branch1_2] 169 | [#N] 170 | [Br] 171 | [O] 172 | [Branch2_2] 173 | [F] 174 | [S] 175 | [=S] 176 | [#C] 177 | [Cl] 178 | [O-expl] 179 | [N+expl] 180 | [P] 181 | [.] 182 | [I] 183 | [c] 184 | [-c] 185 | [s] 186 | [nHexpl] 187 | [\c] 188 | [n] 189 | [\C] 190 | [o] 191 | [C@@Hexpl] 192 | [C@expl] 193 | [C@@expl] 194 | [C@Hexpl] 195 | [/C] 196 | [/c] 197 | [Ptexpl] 198 | [\N] 199 | [\C@@Hexpl] 200 | [/C@Hexpl] 201 | [\C@Hexpl] 202 | [-n] 203 | [=c] 204 | [B] 205 | [\S] 206 | [/n] 207 | [=N+expl] 208 | [Expl\Ring2] 209 | [Expl/Ring1] 210 | [n+expl] 211 | [Expl\Ring1] 212 | [Asexpl] 213 | [N@@expl] 214 | [S@@expl] 215 | [/O] 216 | [Expl-Ring1] 217 | [/N] 218 | [S+expl] 219 | [/S] 220 | [Pexpl] 221 | [=Nexpl] 222 | [#O+expl] 223 | [C-expl] 224 | [Iexpl] 225 | [O+expl] 226 | [Brexpl] 227 | [Clexpl] 228 | [=N-expl] 229 | [N-expl] 230 | [P+expl] 231 | [Oexpl] 232 | [#C-expl] 233 | [=Oexpl] 234 | [#N+expl] 235 | [=Iexpl] 236 | [CH-expl] 237 | [P-expl] 238 | [s+expl] 239 | [=P] 240 | [=I++expl] 241 | [o+expl] 242 | [=O+expl] 243 | [CH2expl] 244 | [=S+expl] 245 | [I+expl] 246 | [IHexpl] 247 | [CHexpl] 248 | [-n+expl] 249 | [=CHexpl] 250 | [=O-expl] 251 | [c-expl] 252 | [S-expl] 253 | [p] 254 | [Nexpl] 255 | [Cexpl] 256 | [=P+expl] 257 | [n-expl] 258 | [cH-expl] 259 | [B-expl] 260 | [Expl-Ring2] 261 | [C+expl] 262 | [c+expl] 263 | [=n+expl] 264 | [NH-expl] 265 | [NH2+expl] 266 | [Expl/Ring2] 267 | [Expl=Ring1] 268 | [Cl-expl] 269 | [Na+expl] 270 | [Hexpl] 271 | [NH4+expl] 272 | [Hgexpl] 273 | [\O] 274 | [Br-expl] 275 | [N@expl] 276 | [Ca++expl] 277 | [Snexpl] 278 | [I-expl] 279 | [Co+expl] 280 | [N@@+expl] 281 | [K+expl] 282 | [Fe--expl] 283 | [\Hexpl] 284 | [N@+expl] 285 | [Fe+3expl] 286 | [Gd+3expl] 287 | [/N+expl] 288 | [NH+expl] 289 | [=NH+expl] 290 | [Zn++expl] 291 | [/Br] 292 | [/Cl] 293 | [/C@@Hexpl] 294 | [\N+expl] 295 | [NH3+expl] 296 | [Alexpl] 297 | [Hg++expl] 298 | [Cu++expl] 299 | [Znexpl] 300 | [Au-expl] 301 | [Auexpl] 302 | [Crexpl] 303 | [Cd++expl] 304 | [Cdexpl] 305 | [Siexpl] 306 | [Sbexpl] 307 | [Seexpl] 308 | [=Seexpl] 309 | [Cuexpl] 310 | [Li+expl] 311 | [Tl+expl] 312 | [Biexpl] 313 | [Inexpl] 314 | [/Hexpl] 315 | [Caexpl] 316 | [Dyexpl] 317 | [Co++expl] 318 | [Cr+3expl] 319 | [Fe++expl] 320 | [Pt-2expl] 321 | [Sb+3expl] 322 | [Be++expl] 323 | [Mg++expl] 324 | [Tiexpl] 325 | [Fe-expl] 326 | [Ndexpl] 327 | [Pdexpl] 328 | [#Inexpl] 329 | [Ba++expl] 330 | [H+expl] 331 | [Mn+expl] 332 | [Mn++expl] 333 | [SiHexpl] 334 | [\Cl] 335 | [Ni++expl] 336 | [Zrexpl] 337 | [Niexpl] 338 | [PbH2++expl] 339 | [Ybexpl] 340 | [Naexpl] 341 | [=Moexpl] 342 | [=Cdexpl] 343 | [Cu+expl] 344 | [Geexpl] 345 | [Baexpl] 346 | [=Crexpl] 347 | 
[Cr++expl] 348 | [OH-expl] 349 | [SnH2++expl] 350 | [Mg+2expl] 351 | [=Siexpl] 352 | [\Br] 353 | [\C@expl] 354 | [Vexpl] 355 | [Ag+expl] 356 | [\C@@expl] 357 | [Pt+2expl] 358 | [2Hexpl] 359 | [Ti++expl] 360 | [Sr++expl] 361 | [=Auexpl] 362 | [Ruexpl] 363 | [\O-expl] 364 | [P@expl] 365 | [Liexpl] 366 | [/C@@expl] 367 | [As+expl] 368 | [\Siexpl] 369 | [/Alexpl] 370 | [\O+expl] 371 | [/Crexpl] 372 | [/Feexpl] 373 | [Euexpl] 374 | [Scexpl] 375 | [Zn+2expl] 376 | [Ca+2expl] 377 | [Hg+2expl] 378 | [=Zrexpl] 379 | [nH+expl] 380 | [Cl+3expl] 381 | [Ba+2expl] 382 | [TlH2+expl] 383 | [Fe+2expl] 384 | [AlH3expl] 385 | [=PHexpl] 386 | [Co+2expl] 387 | [Cu+2expl] 388 | [PbH2+2expl] 389 | [\s] 390 | [Ni+2expl] 391 | [Cd+2expl] 392 | [SnH2+2expl] 393 | [Ti+2expl] 394 | [PHexpl] 395 | [Mn+2expl] 396 | [Sr+2expl] 397 | [Be+2expl] 398 | [seexpl] 399 | [Cr+2expl] 400 | [=Biexpl] 401 | [=C-expl] 402 | [SbH6+3expl] 403 | [\n] 404 | [Fe-2expl] 405 | [=OH+expl] 406 | [-c-expl] 407 | [/s] 408 | [=NH2+expl] 409 | [#S+expl] 410 | [/F] 411 | [F-expl] 412 | [SH-expl] 413 | [CH+expl] 414 | [\NH+expl] 415 | [\CH-expl] 416 | [\c-expl] 417 | [/o] 418 | [CH2-expl] 419 | [\N-expl] 420 | [/n-expl] 421 | [\C-expl] 422 | [/NH+expl] 423 | [/N-expl] 424 | [\F] 425 | [Gd-4expl] 426 | [Gd-5expl] 427 | [N@@H+expl] 428 | [SiH3expl] 429 | [Branch3_3] 430 | [Sexpl] 431 | [\I] 432 | [BiH3expl] 433 | [SeHexpl] 434 | [SiH2expl] 435 | [Feexpl] 436 | [S@expl] 437 | [\P] 438 | [/nHexpl] 439 | [SH+expl] 440 | [-oexpl] 441 | [-sexpl] 442 | [Kexpl] 443 | [=S@@expl] 444 | [*expl] 445 | [CH2+expl] 446 | [S@+expl] 447 | [S@@+expl] 448 | [-nexpl] 449 | [P@@expl] 450 | [/I] 451 | [Reexpl] 452 | [=SH+expl] 453 | [/CH-expl] 454 | [\nHexpl] 455 | [=C@@expl] 456 | [N@H+expl] 457 | [Teexpl] 458 | [Osexpl] 459 | [=Ru-expl] 460 | [Re-expl] 461 | [Zn+expl] 462 | [nexpl] 463 | [Mnexpl] 464 | [BH-expl] 465 | [\NH-expl] 466 | [BH3-expl] 467 | [11CH3expl] 468 | [/O-expl] 469 | [PH+expl] 470 | [Wexpl] 471 | [OH+expl] 472 | [/P] 473 | [Nb--expl] 474 | [Pt--expl] 475 | [Fe-3expl] 476 | [Al-3expl] 477 | [Cu-expl] 478 | [Ag-expl] 479 | [As-expl] 480 | [Pd--expl] 481 | [Se-expl] 482 | [cexpl] 483 | [3Hexpl] 484 | [\B] 485 | [Ring3] 486 | [=B] 487 | [Coexpl] 488 | [BH2-expl] 489 | [125Iexpl] 490 | [18Fexpl] 491 | [=CH+expl] 492 | [/C@expl] 493 | [=Ruexpl] 494 | [\n+expl] 495 | [\CH+expl] 496 | [Moexpl] 497 | [cH+expl] 498 | 499 | 500 | 501 | 502 | [NHexpl] 503 | [Expl=Ring2] 504 | . 505 | [Branch3_1] 506 | [Branch3_2] 507 | [Expl=Ring3] --------------------------------------------------------------------------------
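The three vocabularies above share the same numerical sub-vocabulary: one token per digit and decimal place, written `_<digit>_<place>_`, plus `_._` (and, for the reaction and small-molecule vocabularies, the `_-_`/`_+_` sign tokens). As a sketch for illustration, the block can be regenerated in a few lines that match the listings above:

# Rebuild the numerical token block that appears in all three vocab files.
numeric_tokens = ["_._"]
for digit in range(10):
    negative_places = 6 if digit == 0 else 7   # digit 0 stops at place -5, the others at -6
    numeric_tokens += [f"_{digit}_-{p}_" for p in range(negative_places)]
    numeric_tokens += [f"_{digit}_{p}_" for p in range(6)]
print(len(numeric_tokens))  # 130 tokens: digits 0-9 across decimal places from 1e-6 up to 1e5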