├── .gitignore ├── LICENSE ├── README.md ├── assets ├── esol.png ├── gradio_demo.png └── overview.jpg ├── conda.yml ├── configs ├── qed_eval.json └── rt_small.json ├── dev_requirements.txt ├── examples ├── example.smi └── qed_property_example.txt ├── pyproject.toml ├── requirements.txt ├── scripts ├── create_vocabulary.py ├── eval_language_modeling.py ├── eval_lm_nlp.py ├── eval_regressionhead.py ├── generate_example_data.py ├── run_language_modeling.py ├── run_lm_nlp.py └── run_regressionhead.py ├── setup.cfg ├── setup.py ├── terminator ├── __init__.py ├── args.py ├── collator_utils.py ├── collators.py ├── datasets.py ├── evaluator.py ├── factories.py ├── functional_groups.py ├── nlp.py ├── numerical_encodings.py ├── property_predictors.py ├── py.typed ├── search.py ├── tokenization.py ├── trainer.py ├── trainer_utils.py └── utils.py ├── training_configs ├── qed_alternated_cc.json ├── qed_proponly.json └── reactions_alternating_cc.json └── vocabs ├── proteins.txt ├── reactions.txt └── smallmolecules.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Mac 10 | *.DS_Store 11 | 12 | # Model files 13 | data/ 14 | models/ 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | pip-wheel-metadata/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | docs/api/* 81 | 82 | # PyBuilder 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | *.ipynb 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 104 | __pypackages__/ 105 | 106 | # Celery stuff 107 | celerybeat-schedule 108 | celerybeat.pid 109 | 110 | # SageMath parsed files 111 | *.sage.py 112 | 113 | # Environments 114 | .env 115 | .venv 116 | env/ 117 | venv/ 118 | ENV/ 119 | env.bak/ 120 | venv.bak/ 121 | 122 | # Spyder project settings 123 | .spyderproject 124 | .spyproject 125 | 126 | # Rope project settings 127 | .ropeproject 128 | 129 | # mkdocs documentation 130 | /site 131 | 132 | # mypy 133 | .mypy_cache/ 134 | .dmypy.json 135 | dmypy.json 136 | 137 | # Pyre type checker 138 | .pyre/ 139 | 140 | # Visual Studio Code settings 141 | .vscode/ 142 | 143 | # PyCharm settings 144 | .idea/ 145 | 146 | runs 147 | examples/models 148 | bash 149 | sandbox -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 International Business Machines 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Regression Transformer 2 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 3 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 4 | [![Gradio demo](https://img.shields.io/website-up-down-green-red/https/hf.space/gradioiframe/GT4SD/regression_transformer/+.svg?label=demo%20status)](https://huggingface.co/spaces/GT4SD/regression_transformer) 5 | [![DOI](https://zenodo.org/badge/449377638.svg)](https://zenodo.org/badge/latestdoi/449377638) 6 | 7 | A multitask Transformer that reformulates regression as a conditional sequence modeling task. 8 | This yields a dichotomous language model that seamlessly integrates regression with property-driven conditional generation. 9 | 10 | ![Summary](assets/overview.jpg) 11 | 12 | This repo contains the development code. Read the paper in [*Nature Machine Intelligence*](https://www.nature.com/articles/s42256-023-00639-z). 
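For intuition: the RT serializes each training sample as a single line of text in which the property is prepended to the sequence, e.g. `<qed>0.3936|CBr` for the QED task (see the tokenizer example further below). Because the numeric value is spelled out with dedicated digit and decimal-place tokens, masking the number turns the task into regression, while masking sequence tokens given the number yields property-driven conditional generation.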
13 | 14 | ## Demo with UI 15 | 🤗 A gradio demo with a simple UI is available on [HuggingFace spaces](https://huggingface.co/spaces/GT4SD/regression_transformer) 16 | ![Summary](assets/gradio_demo.png) 17 | 18 | 19 | ## Building upon this research 20 | 21 | #### Do you want to use a pretrained RT model or finetune it on your own data? Then read on; otherwise, the development setup can be found [below](#development-setup). 22 | 23 | The Regression Transformer is implemented in the [GT4SD](https://github.com/GT4SD/gt4sd-core) library. 24 | Via GT4SD, using several pretrained RegressionTransformers is a matter of a few lines of code :rocket:. 25 | A complete tutorial on running inference, finetuning an RT model (or training one from scratch), and sharing and deploying it to the GT4SD model hub can be found [here](https://github.com/GT4SD/gt4sd-core/tree/main/examples/regression_transformer). 26 | 27 | For example, via GT4SD you can use the RT pretrained on small molecules with some of the properties shown in the paper, in particular [QED](https://www.nature.com/articles/nchem.1243) and [ESOL](https://pubs.acs.org/doi/10.1021/ci034243x) (water solubility). There are also several multi-property variants of the RT, e.g., a model trained jointly on logP and synthesizability (aka [SCScore](https://pubs.acs.org/doi/10.1021/acs.jcim.7b00622)). 28 | For protein language modeling, you will also find an RT trained on a [peptide stability](https://www.science.org/doi/full/10.1126/science.aan0693) dataset from the [TAPE](https://github.com/songlab-cal/tape) benchmark. 29 | In sum, GT4SD provides RT models pretrained on: 30 | - **small molecules**: single (`qed`, `esol`, `crippen_logp`) or multiple (`logp_and_synthesizability`, `cosmo_acdl`, `pfas`) properties. All these models use SELFIES, apart from `crippen_logp`, which uses SMILES. 31 | - **proteins**: `stability` 32 | - **chemical reactions**: `uspto` (using reaction SMILES) 33 | - **polymers**: `rop_catalyst` and `block_copolymer` are both described in [Park et al. (2023; *Nature Communications*)](https://www.nature.com/articles/s41467-023-39396-3). The `rop_catalyst` model uses conventional SELFIES, whereas the `block_copolymer` model uses a novel polymer language called CMDL, also described in [Park et al. (2023; *Nature Communications*)](https://www.nature.com/articles/s41467-023-39396-3). 34 | 35 | A Jupyter notebook with a toy use case on adapting a molecule toward solubility is also provided in [GT4SD](https://github.com/GT4SD/gt4sd-core/blob/main/notebooks/regression-transformer-demo.ipynb). 36 | If you use [GT4SD](https://github.com/GT4SD/gt4sd-core), you can generate molecules like this: 37 | ```python 38 | from gt4sd.algorithms.conditional_generation.regression_transformer import ( 39 | RegressionTransformer, RegressionTransformerMolecules 40 | ) 41 | 42 | buturon = "CC(C#C)N(C)C(=O)NC1=CC=C(Cl)C=C1" 43 | target_esol = -3.53 44 | config = RegressionTransformerMolecules( 45 | algorithm_version="solubility", 46 | search="sample", 47 | temperature=2, 48 | tolerance=5, 49 | sampling_wrapper={ 50 | 'property_goal': {'<esol>': target_esol}, 51 | 'fraction_to_mask': 0.2 52 | } 53 | ) 54 | esol_generator = RegressionTransformer(configuration=config, target=buturon) 55 | generations = list(esol_generator.sample(8)) 56 | ``` 57 | 58 | Explore the solubility of the local chemical space around Buturon.
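For example, building directly on the snippet above, you could sweep the ESOL primer over a range of target values and collect the generations for each primer. The following is a minimal sketch that reuses the imports and the `buturon` variable from the snippet; the primer values and the masking fraction are purely illustrative:

```python
# Sweep the ESOL primer around Buturon and collect generations per target value.
# Reuses RegressionTransformer, RegressionTransformerMolecules and `buturon`
# from the snippet above; the primer values below are illustrative only.
generations_per_primer = {}
for primer in [-6.0, -5.0, -4.0, -3.0, -2.0]:
    config = RegressionTransformerMolecules(
        algorithm_version="solubility",
        search="sample",
        temperature=2,
        tolerance=5,
        sampling_wrapper={
            "property_goal": {"<esol>": primer},
            "fraction_to_mask": 0.2,
        },
    )
    generator = RegressionTransformer(configuration=config, target=buturon)
    generations_per_primer[primer] = list(generator.sample(8))
```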
Upon varying the property primers, you might obtain something like this: 59 | ![Esol](assets/esol.png) 60 | 61 | ## Development setup 62 | This is mainly intended to reproduce or extend the results of the paper. 63 | ```console 64 | conda env create -f conda.yml 65 | conda activate terminator 66 | pip install -e . 67 | ``` 68 | 69 | ### Data 70 | The processed data used to train the models is available via [Box](https://ibm.box.com/s/kijawq3rf4191bbcyflsxx7kp9m74jnx). 71 | 72 | ### Training a model 73 | You can download the data and launch a training run by pointing to the train and test data: 74 | ```console 75 | python scripts/run_language_modeling.py --output_dir rt_example \ 76 | --config_name configs/rt_small.json --tokenizer_name ./vocabs/smallmolecules.txt \ 77 | --do_train --do_eval --learning_rate 1e-4 --num_train_epochs 5 --save_total_limit 2 \ 78 | --save_steps 500 --per_gpu_train_batch_size 16 --evaluate_during_training --eval_steps 5 \ 79 | --eval_data_file ./examples/qed_property_example.txt --train_data_file ./examples/qed_property_example.txt \ 80 | --line_by_line --block_size 510 --seed 42 --logging_steps 100 --eval_accumulation_steps 2 \ 81 | --training_config_path training_configs/qed_alternated_cc.json 82 | ``` 83 | :warning: This configuration uses dummy data; do not use it as is :no_good: 84 | The `training_config_path` argument points to a file that specifies the training regime. This argument is optional; if it is not given, we default to vanilla PLM training that masks everywhere with equal probability (recommended for initial pretraining only). For refined examples, please see the `training_configs` folder. 85 | 86 | Also note that the `vocabs` folder contains the vocabulary files for training on small molecules, proteins and chemical reactions. 87 | 88 | Exemplary model configurations (number of heads, layers, etc.) can be found in the [configs](./configs) folder. 89 | 90 | :warning: **XLNet trains relatively slowly. It is recommended to start a training/finetuning run from a pretrained model, ideally with the GT4SD trainer** (see above) :warning: 91 | 92 | 93 | ### Evaluating a model 94 | To evaluate a model trained, e.g., on the QED task, run the following: 95 | ```console 96 | python scripts/eval_language_modeling.py --output_dir path_to_model \ 97 | --eval_file ./examples/qed_property_example.txt --eval_accumulation_steps 2 --param_path configs/qed_eval.json 98 | ``` 99 | 100 | ### Pretrained models 101 | Pretrained models are available via the GT4SD model hub. There is a total of 9 models, which can also be used via [HuggingFace Spaces](https://huggingface.co/spaces/jannisborn/regression_transformer). Models that are part of the publication are also available via the [Box folder mentioned above](https://ibm.box.com/s/kijawq3rf4191bbcyflsxx7kp9m74jnx). 102 | 103 | #### Generate some data 104 | To generate custom data for the QED task in an RT-compatible format, run [scripts/generate_example_data.py](./scripts/generate_example_data.py) and point it to a `.smi` file with SMILES in the first column. 105 | ```console 106 | python scripts/generate_example_data.py examples/example.smi examples/qed_property_example.txt 107 | ``` 108 | For user-defined properties, please adapt the file or open an issue. 109 | 110 | If you need to create a new vocabulary for a dataset, you can use [scripts/create_vocabulary.py](./scripts/create_vocabulary.py); it will also automatically add some special tokens at the top of your vocabulary file.
111 | ```console 112 | python scripts/create_vocabulary.py examples/qed_property_example.txt examples/vocab.txt 113 | ``` 114 | 115 | At this point, the folder containing the vocabulary file can be used to load a tokenizer compatible with any `ExpressionBertTokenizer`: 116 | ```python 117 | >>> from terminator.tokenization import ExpressionBertTokenizer 118 | >>> tokenizer = ExpressionBertTokenizer.from_pretrained('examples') 119 | >>> text = '<qed>0.3936|CBr' 120 | >>> tokens = tokenizer.tokenize(text) 121 | >>> print(tokens) 122 | ['<qed>', '_0_0_', '_._', '_3_-1_', '_9_-2_', '_3_-3_', '_6_-4_', '|', 'C', 'Br'] 123 | >>> token_indexes = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) 124 | >>> print(token_indexes) 125 | [16, 17, 18, 28, 45, 34, 35, 19, 15, 63] 126 | >>> tokenizer.build_inputs_with_special_tokens(token_indexes) 127 | [12, 16, 17, 18, 28, 45, 34, 35, 19, 15, 63, 13] 128 | ``` 129 | 130 | ## Citation 131 | If you use the regression transformer, please cite: 132 | ```bib 133 | @article{born2023regression, 134 | title={Regression Transformer enables concurrent sequence regression and generation for molecular language modelling}, 135 | author={Born, Jannis and Manica, Matteo}, 136 | journal={Nature Machine Intelligence}, 137 | volume={5}, 138 | number={4}, 139 | pages={432--444}, 140 | year={2023}, 141 | publisher={Nature Publishing Group UK London} 142 | } 143 | ``` 144 | -------------------------------------------------------------------------------- /assets/esol.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/regression-transformer/6820b45e4548ee2648557fcf120b3361414a8cd9/assets/esol.png -------------------------------------------------------------------------------- /assets/gradio_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/regression-transformer/6820b45e4548ee2648557fcf120b3361414a8cd9/assets/gradio_demo.png -------------------------------------------------------------------------------- /assets/overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/regression-transformer/6820b45e4548ee2648557fcf120b3361414a8cd9/assets/overview.jpg -------------------------------------------------------------------------------- /conda.yml: -------------------------------------------------------------------------------- 1 | name: terminator 2 | channels: 3 | - https://conda.anaconda.org/rdkit 4 | dependencies: 5 | - rdkit=2019.03.1 6 | - python=3.7 7 | - pip>=19.1,<20.3 8 | - pip: 9 | - transformers==v3.1.0 10 | - -r file:requirements.txt 11 | - -r file:dev_requirements.txt 12 | - -e .
13 | -------------------------------------------------------------------------------- /configs/qed_eval.json: -------------------------------------------------------------------------------- 1 | { 2 | "plm_probability": 0.4, 3 | "checkpoint-str": "rmse", 4 | "max_span_length": 7, 5 | "conditioning_range": [ 6 | [ 7 | 0.051, 8 | 0.151, 9 | 0.251, 10 | 0.351, 11 | 0.451, 12 | 0.551, 13 | 0.651, 14 | 0.751, 15 | 0.851, 16 | 0.951 17 | ] 18 | ], 19 | "line_by_line": true, 20 | "property_tokens": [ 21 | "" 22 | ], 23 | "property_tokens_to_mask": [ 24 | [ 25 | 5 26 | ] 27 | ], 28 | "property_token_masking_order": [ 29 | [ 30 | 2, 31 | 3, 32 | 4, 33 | 0, 34 | 1 35 | ] 36 | ], 37 | "beam_width": 3 38 | } -------------------------------------------------------------------------------- /configs/rt_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "XLNetLMHeadModel" 4 | ], 5 | "attn_type": "bi", 6 | "bi_data": false, 7 | "bos_token_id": 14, 8 | "clamp_len": -1, 9 | "d_head": 16, 10 | "d_inner": 1024, 11 | "d_model": 256, 12 | "dropout": 0.2, 13 | "end_n_top": 5, 14 | "eos_token_id": 14, 15 | "ff_activation": "gelu", 16 | "initializer_range": 0.02, 17 | "language": "selfies", 18 | "layer_norm_eps": 1e-12, 19 | "mem_len": null, 20 | "model_type": "xlnet", 21 | "n_head": 16, 22 | "n_layer": 32, 23 | "numerical_encodings_dim": 16, 24 | "numerical_encodings_format": "sum", 25 | "numerical_encodings_type": "float", 26 | "pad_token_id": 0, 27 | "reuse_len": null, 28 | "same_length": false, 29 | "start_n_top": 5, 30 | "summary_activation": "tanh", 31 | "summary_last_dropout": 0.1, 32 | "summary_type": "last", 33 | "summary_use_proj": true, 34 | "task_specific_params": { 35 | "text-generation": { 36 | "do_sample": true, 37 | "max_length": 250 38 | } 39 | }, 40 | "untie_r": true, 41 | "use_numerical_encodings": true, 42 | "vmax": 1.0, 43 | "vocab_size": 507 44 | } -------------------------------------------------------------------------------- /dev_requirements.txt: -------------------------------------------------------------------------------- 1 | flake8==3.8.4 2 | mypy==0.800 3 | pytest==6.1.1 4 | pytest-cov==2.10.1 5 | black==20.8b1 6 | sphinx==3.4.3 7 | sphinx-autodoc-typehints==1.11.1 8 | better-apidoc==0.3.1 9 | sphinx_rtd_theme==0.5.1 10 | myst-parser==0.13.3 11 | flask==1.1.2 12 | flask_login==0.5.0 -------------------------------------------------------------------------------- /examples/example.smi: -------------------------------------------------------------------------------- 1 | CCO CHEMBL545 2 | C CHEMBL17564 3 | CO CHEMBL14688 4 | NCCS CHEMBL602 5 | NCCN CHEMBL816 6 | CN CHEMBL43280 7 | C=O CHEMBL1255 8 | CCN CHEMBL14449 9 | CSC CHEMBL15580 10 | CBr CHEMBL48339 11 | CI CHEMBL115849 12 | CF CHEMBL116838 13 | CC CHEMBL135626 14 | CNC=O CHEMBL9240 15 | CCCN CHEMBL14409 16 | CCCO CHEMBL14687 17 | O=CC#C CHEMBL722 18 | C=CC=O CHEMBL721 19 | CC#N CHEMBL45211 20 | CCCl CHEMBL46058 21 | NC#N CHEMBL56279 22 | CC=O CHEMBL76101 23 | SC#N CHEMBL84336 24 | FCF CHEMBL115186 25 | C#C CHEMBL116336 26 | CCl CHEMBL117545 27 | C=C CHEMBL117822 28 | COC CHEMBL119178 29 | CNC CHEMBL120433 30 | CCNCC CHEMBL1189 31 | CCC CHEMBL135416 32 | N#N CHEMBL142438 33 | CNO CHEMBL144761 34 | CNN CHEMBL160520 35 | C#N CHEMBL183419 36 | CC(C)O CHEMBL582 37 | CNC=O CHEMBL9081 38 | CCCCON CHEMBL6960 39 | CCNC=O CHEMBL9421 40 | CC(O)=O CHEMBL539 41 | CCCCO CHEMBL14245 42 | CCCCN CHEMBL13968 43 | COCOC CHEMBL15537 44 | CCC#N CHEMBL15871 45 | CCCCC 
CHEMBL16102 46 | CCOCC CHEMBL16264 47 | NC(N)=N CHEMBL821 48 | ClCCl CHEMBL45967 49 | NCC=C CHEMBL57286 50 | NC(N)=O CHEMBL985 51 | NCCO CHEMBL104943 52 | OCCF CHEMBL115586 53 | CC=C CHEMBL117213 54 | OC=O CHEMBL116736 55 | CC#C CHEMBL116902 56 | CCCC CHEMBL134702 57 | CCBr CHEMBL156378 58 | CNNC CHEMBL162921 59 | CC=O CHEMBL170365 60 | OCCS CHEMBL254951 61 | NC=O CHEMBL266160 62 | ON=C CHEMBL324784 63 | OCCO CHEMBL457299 64 | CON CHEMBL1213633 65 | CCCCl CHEMBL15697 66 | CS(C)=O CHEMBL504 67 | ON=C CHEMBL185198 68 | Cn1ccnc1 CHEMBL543 69 | CCCCCO CHEMBL14568 70 | CCCCCC CHEMBL15939 71 | ClCCCl CHEMBL16370 72 | CCCC#C CHEMBL16262 73 | OCC(O)CO CHEMBL692 74 | CN1CCCC1 CHEMBL665 75 | CC(=O)NO CHEMBL734 76 | NCC(O)=O CHEMBL773 77 | CCCCCF CHEMBL42434 78 | CCOC=O CHEMBL44215 79 | CCCCCl CHEMBL47259 80 | NCCCCN CHEMBL46257 81 | NNC(N)=O CHEMBL903 82 | CCNCCN CHEMBL70445 83 | CNCCO CHEMBL104083 84 | N=C=N CHEMBL116583 85 | NCCCO CHEMBL115530 86 | C=C=C CHEMBL116960 87 | CCC=C CHEMBL117210 88 | CCSCC CHEMBL117181 89 | CC#CC CHEMBL119108 90 | NCCCN CHEMBL174324 91 | OCCCl CHEMBL191244 92 | OCC=C CHEMBL234926 93 | NC(=O)NO CHEMBL467 94 | CCC=O CHEMBL275626 95 | CSCCO CHEMBL277871 96 | COC=O CHEMBL295026 97 | ClCBr CHEMBL346918 98 | C1CCSC1 CHEMBL1379 99 | COCCO CHEMBL444144 100 | OCCCO CHEMBL379652 101 | OCCBr CHEMBL468583 102 | C1CN1 CHEMBL540990 103 | CCON CHEMBL1213044 104 | OC#N CHEMBL1161700 105 | NCCF CHEMBL1162280 106 | NCC=O CHEMBL296723 107 | CCNC CHEMBL1232589 108 | CCCS CHEMBL1236818 109 | CSSC CHEMBL1347061 110 | CCNN CHEMBL1359929 111 | SC#N CHEMBL1161685 112 | NCCCC(O)=O CHEMBL96 113 | c1cnoc1 CHEMBL13257 114 | Nc1ccccc1 CHEMBL538 115 | CCCCCCO CHEMBL14085 116 | CC(C)=O CHEMBL14253 117 | c1cscn1 CHEMBL15605 118 | CC(N)=O CHEMBL16081 119 | CCCCC=O CHEMBL18602 120 | C1CCNC1 CHEMBL22830 121 | CC(N)=S CHEMBL38737 122 | CCC(C)O CHEMBL45462 123 | CN(C)CCO CHEMBL1135 124 | CC(C)N CHEMBL117080 125 | CCOC=C CHEMBL116745 126 | CCOCCO CHEMBL119596 127 | ClCCBr CHEMBL160255 128 | CCCCBr CHEMBL160949 129 | COCC#C CHEMBL162694 130 | OCCCCO CHEMBL171623 131 | CNCCCN CHEMBL174165 132 | C1CNCCN1 CHEMBL1412 133 | OCNC=O CHEMBL268447 134 | NCCCON CHEMBL281021 135 | CCCC=C CHEMBL295337 136 | ClCCNCCCl CHEMBL913 137 | CC(F)F CHEMBL325493 138 | CSCCCO CHEMBL332887 139 | CCNNCC CHEMBL350303 140 | CN=C=S CHEMBL396000 141 | CN(C)C CHEMBL439723 142 | C=CC#N CHEMBL445612 143 | BrCCBr CHEMBL452370 144 | OCC(S)CS CHEMBL1597 145 | ClCC=C CHEMBL451126 146 | OCCCBr CHEMBL466545 147 | C1CC1 CHEMBL1796999 148 | ClCC=O CHEMBL506976 149 | C1CO1 CHEMBL1743219 150 | C=CC=C CHEMBL537970 151 | NCCCF CHEMBL1162286 152 | NCC#N CHEMBL1193997 153 | NCCCl CHEMBL1190279 154 | BrCBr CHEMBL1229889 155 | CCCBr CHEMBL1230095 156 | O=C=O CHEMBL1231871 157 | S=C=S CHEMBL1365180 158 | OCC#C CHEMBL1563026 159 | NCCBr CHEMBL1697693 160 | ClC=C CHEMBL2311071 161 | SCC=C CHEMBL3222024 162 | NNCCO CHEMBL3183346 163 | C1CS1 CHEMBL3184935 164 | COCCl CHEMBL3185256 165 | CCCCS CHEMBL3188256 166 | OCCCS CHEMBL3234722 167 | NCC#C CHEMBL3263480 168 | BrC#N CHEMBL3561885 169 | CC=CCO CHEMBL116709 170 | CC=CCO CHEMBL118459 171 | CC(N)CS CHEMBL37279 172 | N#CN1CCC1 CHEMBL8123 173 | Cc1ccccc1 CHEMBL9113 174 | NCc1ccccc1 CHEMBL522 175 | c1c[nH]cn1 CHEMBL540 176 | COC(C)=O CHEMBL14079 177 | CCC(N)CC CHEMBL14178 178 | CCC(O)=O CHEMBL14021 179 | c1cncnc1 CHEMBL15562 180 | c1ncncn1 CHEMBL15698 181 | c1cnccn1 CHEMBL15797 182 | CCC(C)=O CHEMBL15849 183 | C1CCCCC1 CHEMBL15980 184 | C1CCNCC1 CHEMBL15487 185 | c1ccnnc1 CHEMBL15719 186 | NCCNCCNCCN CHEMBL609 187 
| OCc1ccccc1 CHEMBL720 188 | CCCC(C)C CHEMBL30909 189 | CC1CCCC1 CHEMBL30940 190 | C=CCCC=C CHEMBL31747 191 | CC(C)CCN CHEMBL42003 192 | CC(Cl)Cl CHEMBL45079 193 | CCCCNC=O CHEMBL45466 194 | CCCC(C)O CHEMBL45065 195 | CCC(O)CC CHEMBL47100 196 | CCNCCNCC CHEMBL54723 197 | NCCSSCCN CHEMBL61350 198 | CNCCCCNC CHEMBL61621 199 | CN1CCNCC1 CHEMBL1011 200 | NCC(N)=O CHEMBL86954 201 | FC(F)Cl CHEMBL116155 202 | FC(F)=C CHEMBL116020 203 | CC(N)CO CHEMBL116663 204 | CSCCCCO CHEMBL117865 205 | OCCNCCO CHEMBL119604 206 | CSCCC#N CHEMBL119837 207 | NC(O)=O CHEMBL125278 208 | CCCCCCC CHEMBL134658 209 | CC(C)NO CHEMBL140282 210 | CCCCCCS CHEMBL153339 211 | CCCCCBr CHEMBL155850 212 | ClCCCBr CHEMBL156560 213 | ClCCCCl CHEMBL157427 214 | NCCC[P] CHEMBL160548 215 | CNCCCNC CHEMBL174776 216 | OCC=C=C CHEMBL230163 217 | CC=NC#N CHEMBL259526 218 | NC(N)=S CHEMBL260876 219 | CC(C)CO CHEMBL269630 220 | C1CCOC1 CHEMBL276521 221 | c1ccsc1 CHEMBL278958 222 | c1ccoc1 CHEMBL278980 223 | CC(O)CO CHEMBL286398 224 | CCCCCNC CHEMBL298481 225 | CCCNC=O CHEMBL297216 226 | NCCNCCN CHEMBL303429 227 | CC(O)CN CHEMBL326602 228 | NCCCCCO CHEMBL333552 229 | CSCCC=O CHEMBL333298 230 | CCCCCCl CHEMBL348039 231 | OCCSCCO CHEMBL444480 232 | OCCCCCO CHEMBL448289 233 | OC=CC=O CHEMBL446036 234 | OC1CCC1 CHEMBL449234 235 | CC(C)Br CHEMBL451810 236 | CCN(C)C CHEMBL609099 237 | CNCC#C CHEMBL1192210 238 | NOCC=C CHEMBL1213179 239 | NCCCCCN CHEMBL119296 240 | CNC(C)=O CHEMBL11544 241 | COCCOC CHEMBL1232411 242 | OCCCCl CHEMBL1230102 243 | BrCC=C CHEMBL1429506 244 | IC(I)I CHEMBL1451116 245 | CCCC=O CHEMBL1478334 246 | C1COC1 CHEMBL1538076 247 | CCON=O CHEMBL1551365 248 | O=CC=O CHEMBL1606435 249 | CN=C=O CHEMBL1608558 250 | CCCCNN CHEMBL1619936 251 | NCCC#N CHEMBL1618272 252 | CC(C)S CHEMBL1897156 253 | CSCSSC CHEMBL1988732 254 | OCCNCO CHEMBL2000638 255 | C1CNC1 CHEMBL2171713 256 | CC(C)C CHEMBL2106398 257 | BrCC#N CHEMBL2139477 258 | CN1CC1 CHEMBL2448851 259 | CC1CO1 CHEMBL2251584 260 | CC1CN1 CHEMBL3183775 261 | COCCCN CHEMBL3186458 262 | CCNCCO CHEMBL3188262 263 | ClCC#N CHEMBL3187297 264 | CNCC=C CHEMBL3558338 265 | ClC=CCl CHEMBL157026 266 | N=[N]#N CHEMBL186537 267 | FCC1CO1 CHEMBL501668 268 | O=S1CCCC1 CHEMBL1207 269 | CC1CO1 CHEMBL1901974 270 | NC1CONC1=O CHEMBL771 271 | COC(=O)C=C CHEMBL9019 272 | S=C1NCCN1 CHEMBL11860 273 | O=C1NCCN1 CHEMBL12034 274 | CC(C)(C)N CHEMBL13782 275 | CCOC(C)=O CHEMBL14152 276 | CCn1ccnc1 CHEMBL13911 277 | Oc1ccccc1 CHEMBL14060 278 | CCCC(O)=O CHEMBL14227 279 | OC(=O)CCl CHEMBL14090 280 | Cc1ccncc1 CHEMBL15544 281 | Cc1cccnc1 CHEMBL15722 282 | Cc1ccccn1 CHEMBL15732 283 | C1CCC=CC1 CHEMBL16396 284 | CC(C)(C)O CHEMBL16502 285 | CCCCCCC=O CHEMBL18104 286 | CCC(Cl)CC CHEMBL18780 287 | CC(C)CC=O CHEMBL18360 288 | O=CC1CCC1 CHEMBL18475 289 | O=C1CCCC1 CHEMBL18620 290 | CC1CCNCC1 CHEMBL21533 291 | CC1CCCCN1 CHEMBL21454 292 | Nc1ccccn1 CHEMBL21619 293 | OC1CCCNC1 CHEMBL22463 294 | CCCCCCCCO CHEMBL26215 295 | Nc1cccnc1 CHEMBL25541 296 | NCC(F)CON CHEMBL27061 297 | NCC(F)CNN CHEMBL26945 298 | CCCCN=C=O CHEMBL27104 299 | NCCCSCCCN CHEMBL29299 300 | NCC(O)CON CHEMBL26857 301 | NCCCNCCCN CHEMBL28743 302 | NCCCOCCCN CHEMBL29910 303 | O=C1CNCN1 CHEMBL30446 304 | NCCCCCCCN CHEMBL28242 305 | OC1CCCCC1 CHEMBL32010 306 | CCCC(C)CC CHEMBL31377 307 | CC(Cl)CCl CHEMBL44641 308 | CCCC(C)=O CHEMBL45345 309 | ClC(Cl)Cl CHEMBL44618 310 | CCCC(O)CC CHEMBL46678 311 | CCC(=O)CC CHEMBL45315 312 | CCCCC(C)O CHEMBL45425 313 | CCCCOCCCC CHEMBL48132 314 | O=CNC1CC1 CHEMBL49963 315 | O=C1CCCS1 CHEMBL56395 316 | NC(=O)CBr CHEMBL60628 
317 | OC(=O)CBr CHEMBL60851 318 | Cn1ccnc1F CHEMBL62383 319 | Cn1ccnc1N CHEMBL64053 320 | NC1=NCCO1 CHEMBL69446 321 | OCc1ccsc1 CHEMBL76469 322 | NN=C(N)NO CHEMBL80352 323 | Nc1ncccn1 CHEMBL88580 324 | NN1CCNCC1 CHEMBL89042 325 | Nc1ncncn1 CHEMBL89436 326 | O=C1CCCO1 CHEMBL95681 327 | FC1NCC=N1 CHEMBL98364 328 | CN1CC=NC1 CHEMBL99100 329 | Fc1ccccc1 CHEMBL16070 330 | OC(=O)CS CHEMBL116455 331 | CCN(CC)CCO CHEMBL1183 332 | C1OCC=C1 CHEMBL117135 333 | CN(C)N=O CHEMBL117311 334 | NC(CO)CO CHEMBL116834 335 | FC(Cl)Cl CHEMBL116813 336 | OCCN1CC1 CHEMBL118671 337 | OC1CCNC1 CHEMBL118705 338 | CCCCCCCC CHEMBL134886 339 | CCC(C)Br CHEMBL156276 340 | CCCCCCCl CHEMBL156095 341 | CC(C)CCl CHEMBL160966 342 | NC(=O)CF CHEMBL160811 343 | NCC(O)CN CHEMBL177097 344 | N#CC1CN1 CHEMBL177264 345 | CCC(I)CC CHEMBL177307 346 | CCC(F)CC CHEMBL177481 347 | NN=C(N)N CHEMBL225304 348 | BrCC=C=C CHEMBL226728 349 | OCC(O)=O CHEMBL252557 350 | NNC(N)=S CHEMBL256250 351 | CN(C)C=O CHEMBL268291 352 | c1ccncc1 CHEMBL266158 353 | CCCCCCCO CHEMBL273459 354 | c1ccccc1 CHEMBL277500 355 | C1COCCN1 CHEMBL276518 356 | NC(=O)CI CHEMBL276727 357 | CCCCCC=O CHEMBL280331 358 | CCCCOCCO CHEMBL284588 359 | OC1CCCC1 CHEMBL288998 360 | Nc1ncns1 CHEMBL295053 361 | CCC(C)CN CHEMBL294955 362 | NCCCCCCN CHEMBL303004 363 | NC(=N)NO CHEMBL309499 364 | NCC1CCCCC1 CHEMBL1049 365 | C1NCN=C1 CHEMBL317004 366 | CCC(N)CO CHEMBL327032 367 | OP(O)(O)=O CHEMBL1187 368 | O=C1CCN1 CHEMBL344042 369 | Nc1nccs1 CHEMBL344760 370 | ClCCCCBr CHEMBL350215 371 | CCC(C)Cl CHEMBL346529 372 | CC(C)CBr CHEMBL346532 373 | OCCCC(O)=O CHEMBL1342 374 | CSC(N)=N CHEMBL356703 375 | CCC(C)CC CHEMBL357767 376 | NCCCNCCO CHEMBL361813 377 | CC(C)CCO CHEMBL372396 378 | OC(=O)CI CHEMBL376280 379 | CCCN=C=O CHEMBL441027 380 | CC(=O)OO CHEMBL444965 381 | CCC(C)CO CHEMBL451923 382 | C1COCCO1 CHEMBL453716 383 | OCCCCCCO CHEMBL458616 384 | CCC(C)C CHEMBL1797287 385 | OC(=O)CF CHEMBL509273 386 | SC(S)=S CHEMBL1207991 387 | C1NN=CO1 CHEMBL541688 388 | OS(O)=O CHEMBL1161699 389 | NCC(F)F CHEMBL1162281 390 | OC(O)=O CHEMBL1161632 391 | NCCc1ccccc1 CHEMBL610 392 | OCCOCCO CHEMBL1235226 393 | C1CCCC1 CHEMBL1370850 394 | CC(N)CN CHEMBL1319459 395 | CCCCCCN CHEMBL1320720 396 | CCC1CO1 CHEMBL1378095 397 | ClC=CCl CHEMBL1385560 398 | ClCCC#N CHEMBL1451739 399 | CCCCC#N CHEMBL1503158 400 | OCC1CO1 CHEMBL1530150 401 | CCCCC=C CHEMBL1548726 402 | CCCCNCC CHEMBL1598939 403 | CC(N)Cl CHEMBL1697721 404 | FCCCCCF CHEMBL1697728 405 | CCCC=NO CHEMBL1729186 406 | C1CSCN1 CHEMBL1916078 407 | c1cnon1 CHEMBL2171711 408 | c1cocn1 CHEMBL2171710 409 | c1cnsc1 CHEMBL2171712 410 | C=COC=C CHEMBL2105883 411 | N#CNC#N CHEMBL2365294 412 | C[N+]#N CHEMBL2419248 413 | OCCCCCl CHEMBL2260957 414 | NCCCC=O CHEMBL2261442 415 | CCN=C=S CHEMBL2251727 416 | CC(N)=N CHEMBL2227684 417 | CCCOC=O CHEMBL2270393 418 | SCCSCCS CHEMBL3182274 419 | NCCOCCO CHEMBL3183757 420 | NCCOCCN CHEMBL3183428 421 | CCC(C)S CHEMBL3183438 422 | CCSCCCl CHEMBL3183525 423 | N#CCC#N CHEMBL3187514 424 | OCC#CCO CHEMBL3187551 425 | C1COCO1 CHEMBL3187281 426 | ClCOCCl CHEMBL3185875 427 | CCCNCCC CHEMBL3185961 428 | NCCNCCO CHEMBL3186403 429 | CCC(C)N CHEMBL3186956 430 | CCCCCCI CHEMBL3188734 431 | CCCOCCC CHEMBL3187166 432 | CCCOCCO CHEMBL3189002 433 | COCCC#N CHEMBL3560782 434 | CC=CC=O CHEMBL3561468 435 | CCCCOCC CHEMBL3561108 436 | ClCC=CCl CHEMBL155926 437 | NC1CONC1=O CHEMBL8151 438 | CCC1OC1C CHEMBL177905 439 | Cc1ccccc1N CHEMBL1381 440 | CCC=CC=O CHEMBL256368 441 | CC=CC=O CHEMBL1086445 442 | CCCC=NO CHEMBL1869638 443 | ClC=CCl 
CHEMBL1441128 444 | CCC=CCO CHEMBL2269088 445 | OCC=CCO CHEMBL3188586 446 | NCC(O)CON CHEMBL26435 447 | CC(O)CBr CHEMBL446288 448 | ClCC1CO1 CHEMBL448626 449 | BrCC1CO1 CHEMBL504705 450 | CC(N)CO CHEMBL1229871 451 | NC(N)=O CHEMBL2096635 452 | NC(N)=O CHEMBL2096648 453 | CCN(=O)=O CHEMBL15625 454 | CN(=O)=O CHEMBL276924 455 | CCS(C)=O CHEMBL278882 456 | O=S1CCC1 CHEMBL368734 457 | CSS(C)=O CHEMBL403038 458 | OCC(O)CI CHEMBL467595 459 | O=C1NC=CC=C1 CHEMBL662 460 | CN1CCCNC1=S CHEMBL6954 461 | CN(CCCl)CCCl CHEMBL427 462 | OCN1CCC1=O CHEMBL10689 463 | NCCc1c[nH]cn1 CHEMBL90 464 | S=C1NCCCN1 CHEMBL11938 465 | O=C1CCCCN1 CHEMBL12193 466 | O=C1NCCCN1 CHEMBL12593 467 | CN1CCCC1=O CHEMBL12543 468 | Oc1ccc(O)cc1 CHEMBL537 469 | NCC(Cl)CON CHEMBL15650 470 | c1nc[nH]n1 CHEMBL15571 471 | c1cn[nH]c1 CHEMBL15967 472 | c1cc[nH]c1 CHEMBL16225 473 | Clc1ccccc1 CHEMBL16200 474 | CCCC(=O)OC CHEMBL15859 475 | Brc1ccccc1 CHEMBL16068 476 | CCCCCCCC=O CHEMBL18407 477 | CCCCC(O)CN CHEMBL18843 478 | O=C1CCCCC1 CHEMBL18850 479 | NCCCCNCCCN CHEMBL19612 480 | CCC1CCCCN1 CHEMBL22270 481 | CCc1ccncc1 CHEMBL22977 482 | CCc1cccnc1 CHEMBL23025 483 | CCN1CCCCC1 CHEMBL25053 484 | CCCCCCCCCO CHEMBL24563 485 | NCCc1nccs1 CHEMBL25414 486 | CNCC(O)CON CHEMBL26797 487 | NCCCCSCCCN CHEMBL28866 488 | NCCCCCCCCN CHEMBL29392 489 | NCCCS(O)=O CHEMBL32102 490 | NCc1ccccn1 CHEMBL32189 491 | ON1CCCC1=O CHEMBL31629 492 | CCC(C)(C)O CHEMBL44658 493 | CCOC(=O)CC CHEMBL44115 494 | CCCOC(C)=O CHEMBL44857 495 | OCCNCCNCCO CHEMBL47248 496 | CCCCCN(C)C CHEMBL47794 497 | O=CNC1CCC1 CHEMBL49774 498 | O=C1CCC=C1 CHEMBL52190 499 | CCN(CC)CCN CHEMBL52701 500 | CC(=O)NC#N CHEMBL56672 501 | CN(C)CCCCN CHEMBL59625 502 | NCCCCNCC=C CHEMBL61421 503 | O=Cc1ccsc1 CHEMBL72211 504 | CSc1cnccn1 CHEMBL94743 505 | ClC1NCC=N1 CHEMBL98381 506 | CCc1cnccn1 CHEMBL97525 507 | COc1cnccn1 CHEMBL97794 508 | CSC1NCC=N1 CHEMBL99226 509 | CCC1NCC=N1 CHEMBL98596 510 | NC1NCC=N1 CHEMBL102029 511 | OCC(Cl)Cl CHEMBL113957 512 | Ic1ccccc1 CHEMBL116296 513 | Sc1ccccc1 CHEMBL119405 514 | NNC(=O)NO CHEMBL134941 515 | Cc1cn[nH]c1 CHEMBL1308 516 | ClC(Cl)=C CHEMBL156455 517 | CCC(C)CBr CHEMBL156329 518 | CC(C)=CBr CHEMBL157418 519 | ClC(Br)Br CHEMBL157093 520 | CC(=C)CCl CHEMBL157368 521 | CC(C)CCBr CHEMBL158800 522 | CC(Br)CCl CHEMBL160835 523 | NC1=NCCC1 CHEMBL161318 524 | CC(C)=CCl CHEMBL160508 525 | CON(C)N=O CHEMBL163675 526 | CCN(C)N=O CHEMBL164852 527 | NC(=O)CCO CHEMBL170341 528 | OC(=C)C=O CHEMBL170721 529 | C=CCSCC=C CHEMBL170458 530 | COCC(N)=O CHEMBL170742 531 | OCC(O)C=O CHEMBL173813 532 | OCCCCSC#N CHEMBL176782 533 | NC1CCNCC1 CHEMBL174570 534 | N#CCCSC#N CHEMBL177036 535 | NCC(=O)CN CHEMBL175201 536 | NCC(=O)NO CHEMBL216796 537 | C=CCN=C=S CHEMBL233248 538 | NCc1cccs1 CHEMBL237711 539 | Oc1cccnc1 CHEMBL237847 540 | NCC(=O)NN CHEMBL241347 541 | NCc1ccsc1 CHEMBL252602 542 | ON1CCCCC1 CHEMBL277887 543 | O=C1CCCN1 CHEMBL276849 544 | CC1CCCNC1 CHEMBL279512 545 | OCP(CO)CO CHEMBL279546 546 | CN1CCCCC1 CHEMBL281417 547 | CCN(CC)CC CHEMBL284057 548 | OC1CCNCC1 CHEMBL284022 549 | Nc1ccncc1 CHEMBL284348 550 | CNCC(O)CO CHEMBL286961 551 | CC(C)NC=O CHEMBL296027 552 | NCCC(O)=O CHEMBL297569 553 | Nc1ccncn1 CHEMBL302453 554 | CNCC(O)=O CHEMBL304383 555 | OCc1ccco1 CHEMBL308187 556 | CCSC(N)=N CHEMBL321691 557 | O=C1CNCS1 CHEMBL338595 558 | CCCCCCCCC CHEMBL335900 559 | CONC(N)=O CHEMBL339711 560 | NC(=O)C=C CHEMBL348107 561 | NOCC(O)=O CHEMBL347862 562 | O=NN1CCNCC1 CHEMBL1333 563 | BrC(Br)Br CHEMBL345248 564 | ClC(Cl)Br CHEMBL346231 565 | NC(=O)CCl CHEMBL346368 566 | CNCc1ccccc1 
CHEMBL1338 567 | O=NN1CCC1 CHEMBL351479 568 | OC(=O)CCS CHEMBL358697 569 | NC1=NCCS1 CHEMBL362148 570 | S=C=NCC#C CHEMBL401514 571 | OCc1ccoc1 CHEMBL440914 572 | CNC(N)=NN CHEMBL447555 573 | C1CCCCCC1 CHEMBL453194 574 | ClCCSCCCl CHEMBL455341 575 | CCOC(N)=O CHEMBL462547 576 | FC(F)CCl CHEMBL1797000 577 | C1CC=CC1 CHEMBL1797299 578 | Cc1cnccn1 CHEMBL479791 579 | OOC1CCCO1 CHEMBL505384 580 | Nn1cnnc1 CHEMBL1868166 581 | COC(N)=O CHEMBL1085707 582 | NC1CCCC1 CHEMBL1171859 583 | CSCCNCCN CHEMBL1191800 584 | NCCCNCCS CHEMBL1201382 585 | O=C1CCO1 CHEMBL1200627 586 | CC(O)C#N CHEMBL3559764 587 | NCCCNCCN CHEMBL1213267 588 | CC1NCC=N1 CHEMBL330591 589 | ClCC(Cl)Cl CHEMBL43882 590 | CCCn1ccnc1 CHEMBL95929 591 | SCc1cccs1 CHEMBL152603 592 | OCCSSCCO CHEMBL1233278 593 | COCCOCCO CHEMBL1235250 594 | CCC(N)=O CHEMBL1235716 595 | NCC(S)=O CHEMBL1233056 596 | Cn1cccc1 CHEMBL1234459 597 | C=CC1CO1 CHEMBL1299388 598 | CC(O)CCl CHEMBL1361129 599 | CN(C)CCS CHEMBL1395579 600 | CC(C)C=O CHEMBL1404017 601 | OCC(O)CS CHEMBL1398948 602 | CNC(S)=S CHEMBL1413694 603 | ClCC1CO1 CHEMBL1421613 604 | Cc1ccco1 CHEMBL1445555 605 | CC(C)C#N CHEMBL1492874 606 | C1OCOCO1 CHEMBL1495792 607 | N#CCCC#N CHEMBL1562258 608 | Cc1cscn1 CHEMBL1566946 609 | CC1CCCO1 CHEMBL1580503 610 | C=CCCC#N CHEMBL1595985 611 | Nc1nncs1 CHEMBL1650237 612 | C=CCOC=O CHEMBL1697703 613 | CSC(N)=S CHEMBL1673038 614 | OCCCSCCO CHEMBL1741874 615 | CCOCCOCC CHEMBL1877517 616 | NC(=N)CF CHEMBL1962624 617 | CNC(N)NC CHEMBL2009606 618 | C1OOCOO1 CHEMBL2071269 619 | OCS(O)=O CHEMBL2111064 620 | C1CSCCN1 CHEMBL2333141 621 | COC(S)=S CHEMBL3039753 622 | NCCCCC=O CHEMBL2261443 623 | FC1CCNC1 CHEMBL2448948 624 | CCCN(C)C CHEMBL2448976 625 | CCN(C)CC CHEMBL2448813 626 | OC1CCOC1 CHEMBL2287517 627 | CCCN=C=S CHEMBL2251726 628 | CC(CO)CS CHEMBL3098154 629 | CC(=O)NN CHEMBL3091859 630 | CCCCOC=O CHEMBL2270394 631 | CC(CO)CO CHEMBL3183047 632 | BrCC1CO1 CHEMBL3183066 633 | C1CSCCS1 CHEMBL3183037 634 | COC(C)OC CHEMBL3183607 635 | CCN(O)CC CHEMBL3184786 636 | CCCCON=O CHEMBL3181968 637 | CC=CCC#N CHEMBL3181969 638 | CCC=CC#N CHEMBL3185046 639 | COCC(C)N CHEMBL3184692 640 | CC(O)CCO CHEMBL3186475 641 | COCC(C)O CHEMBL3186306 642 | O=CNNC=O CHEMBL3185965 643 | ON=CC=NO CHEMBL3185538 644 | BrCCCCBr CHEMBL3185714 645 | CCCCCCBr CHEMBL3187491 646 | SC1CCCC1 CHEMBL3186752 647 | NCCCCCCS CHEMBL3247584 648 | NCC(N)CS CHEMBL3302693 649 | NC(=N)CS CHEMBL3304035 650 | NC(=S)NO CHEMBL3274945 651 | OCC(O)CF CHEMBL3276496 652 | ClCC=CCl CHEMBL3561804 653 | CCCCCCCN CHEMBL3561940 654 | CCCCOC=C CHEMBL3561125 655 | OC(CI)CI CHEMBL3707258 656 | CC(=O)C=NO CHEMBL17940 657 | Nc1ccccc1O CHEMBL28319 658 | CC1CCOC1=O CHEMBL36365 659 | Cc1ccccc1C CHEMBL45005 660 | Cc1ccccc1O CHEMBL46931 661 | CC=C(C)C=O CHEMBL53493 662 | CC1CCNC1=O CHEMBL59378 663 | CC=CC(N)=O CHEMBL58562 664 | Nc1ccccc1N CHEMBL70582 665 | Cc1nccnc1C CHEMBL96425 666 | CN1CC=NC1N CHEMBL97965 667 | CCNC(N)=S CHEMBL116961 668 | ClCC=CCCl CHEMBL468582 669 | Cc1ccoc1C CHEMBL108232 670 | CC=CCC#N CHEMBL1322495 671 | CCC=CC#N CHEMBL2138413 672 | CCC=CCCO CHEMBL2251452 673 | CCCC=CCO CHEMBL2228463 674 | CCC=CCCO CHEMBL3184538 675 | CC=CCC=C CHEMBL3182034 676 | CCC=CC#N CHEMBL3185899 677 | CN1C=CNC1=S CHEMBL1515 678 | CC(O)CCO CHEMBL1231501 679 | CCC(N)CO CHEMBL3184640 680 | CSCCC(N)CS CHEMBL36661 681 | CCS(=O)CC CHEMBL174477 682 | NCC(O)CON CHEMBL284573 683 | NC(CS)C(O)=O CHEMBL863 684 | ONN(=O)=O CHEMBL369802 685 | CC(Cl)CCl CHEMBL373466 686 | CNN(=O)=O CHEMBL405641 687 | CSC(N)=N CHEMBL2112024 688 | ON(O)N=O CHEMBL1741048 689 | 
OC1COCC1O CHEMBL350524 690 | OC1CNCC1O CHEMBL396701 691 | OC1CCCC1O CHEMBL399324 692 | S=C1SSC=C1 CHEMBL368700 693 | FC1CCNCC1 CHEMBL1162291 694 | CN1CCCCC1=O CHEMBL12011 695 | S=C1NCCCCN1 CHEMBL11693 696 | CN(C)C(C)=O CHEMBL11873 697 | CC(N)C(O)=O CHEMBL12198 698 | CCN1CCCC1=O CHEMBL12221 699 | O=C1NCCCCN1 CHEMBL12376 700 | COc1ccc(O)cc1 CHEMBL544 701 | CCCCCC(O)=O CHEMBL14184 702 | Cn1cncc1CCN CHEMBL14722 703 | N#Cc1ccccc1 CHEMBL15819 704 | O=Cc1ccccc1 CHEMBL15972 705 | CCCCCC(O)CN CHEMBL18576 706 | O=C1CCCCCC1 CHEMBL18607 707 | CCCCCC(C)=O CHEMBL18893 708 | NCCC1CCCCC1 CHEMBL19428 709 | COC1CNC=NC1 CHEMBL21779 710 | CCCc1ccccn1 CHEMBL21824 711 | CCCC1CCCCN1 CHEMBL21867 712 | CC(C)(C)COO CHEMBL23860 713 | CS(C)(=O)=O CHEMBL25028 714 | CC(C)(C)CCO CHEMBL25029 715 | CNCCCNCCCNC CHEMBL29194 716 | C1CCCC=CCC1 CHEMBL30773 717 | ON1CNCCC1=O CHEMBL31155 718 | CCCCC(CC)CO CHEMBL31637 719 | NCCc1ccccn1 CHEMBL32813 720 | NCCCCNCCCCN CHEMBL36119 721 | CC(N)P(O)=O CHEMBL37702 722 | OC(=O)C1CO1 CHEMBL35308 723 | CCNCCCNCCCN CHEMBL37901 724 | CC(=O)OCCBr CHEMBL42088 725 | CSCCOC(C)=O CHEMBL42606 726 | NCP(O)(O)=O CHEMBL41873 727 | CCCCCCCNC=O CHEMBL43719 728 | CCCC(=O)OCC CHEMBL44800 729 | Cc1nsc(N)n1 CHEMBL47803 730 | CCOC(=O)C=C CHEMBL52084 731 | O=C1OCCC=C1 CHEMBL55078 732 | CCOC(=O)C#C CHEMBL53384 733 | CNCCCCNCC=C CHEMBL60417 734 | CCN(CC)C(S)=S CHEMBL961 735 | CNCCCCNCC#C CHEMBL59623 736 | NC1=CONC1=O CHEMBL67409 737 | NS(O)(=O)=O CHEMBL68253 738 | CCCCN=C(N)N CHEMBL73004 739 | CN(CCN)CCCN CHEMBL76497 740 | NCCCCCC(O)=O CHEMBL1046 741 | NC1=NCCSCC1 CHEMBL88308 742 | NCCCCCNCCCN CHEMBL89035 743 | NNCCc1ccccc1 CHEMBL1089 744 | CCOc1cnccn1 CHEMBL93554 745 | CCSc1cnccn1 CHEMBL93555 746 | CCN1CCC=CC1 CHEMBL98544 747 | CCCCn1ccnc1 CHEMBL97667 748 | O=Nc1ccccc1 CHEMBL98797 749 | NCCCNOCCCN CHEMBL105749 750 | C[S+](C)C CHEMBL1237171 751 | Nc1ccc(O)cc1 CHEMBL1142 752 | NCCCP(O)=O CHEMBL112203 753 | CSCCC(O)=O CHEMBL116212 754 | OCC(F)(F)F CHEMBL116675 755 | C1NC=NC=C1 CHEMBL122235 756 | NP(O)(O)=O CHEMBL121754 757 | CP(O)(O)=S CHEMBL122577 758 | CC(C)(N)CO CHEMBL122588 759 | CP(O)(O)=O CHEMBL122938 760 | OCCN1CCCC1 CHEMBL122581 761 | CCCCCC(C)N CHEMBL123693 762 | CCCCCCCCCC CHEMBL134537 763 | CSc1nccn1C CHEMBL136263 764 | CCCCCNCC#N CHEMBL139751 765 | CCC(C)(C)C CHEMBL142735 766 | NC(=N)SCCF CHEMBL148951 767 | CCOC(=O)NO CHEMBL153081 768 | NCCc1cscn1 CHEMBL155328 769 | ClCC(Cl)=C CHEMBL156075 770 | CCCCCCCCBr CHEMBL156047 771 | CCCCCCCCCl CHEMBL158445 772 | NC1=NCCCN1 CHEMBL158626 773 | NC1=NCCCO1 CHEMBL161118 774 | NNCCC(O)=O CHEMBL159205 775 | CN(C=C)N=O CHEMBL163957 776 | COCN(C)N=O CHEMBL163961 777 | CCN(CC)N=O CHEMBL164290 778 | CCCN(C)N=O CHEMBL165385 779 | O=C1OCC=C1 CHEMBL166223 780 | NC(=O)CCCl CHEMBL171266 781 | CNc1ccccc1 CHEMBL170781 782 | NC(=O)CCCO CHEMBL174258 783 | OCCCCCSC#N CHEMBL176455 784 | O=Cc1ccco1 CHEMBL189362 785 | CSc1ccccc1 CHEMBL192899 786 | CCCCC(C)=O CHEMBL195861 787 | NCCC(=O)NO CHEMBL218945 788 | O=Cc1cscn1 CHEMBL225650 789 | N#CN1CCCC1 CHEMBL262697 790 | Cn1cnc(CCN)c1 CHEMBL507 791 | CCCCC(O)=O CHEMBL268736 792 | N=C1CCCCN1 CHEMBL269058 793 | CCN(CC)C#N CHEMBL274120 794 | CCC(CC)C=O CHEMBL273782 795 | O=CC1CCCC1 CHEMBL274711 796 | COc1ccccc1 CHEMBL278024 797 | CCc1ccccn1 CHEMBL279305 798 | O=C1CCCNN1 CHEMBL283612 799 | ON1CNCC1=O CHEMBL286024 800 | NOCC(O)CON CHEMBL287667 801 | NCCCC(N)CF CHEMBL290500 802 | NCCCCNCC#C CHEMBL292770 803 | CC(N)P(O)O CHEMBL300051 804 | CC(C)CNC=O CHEMBL299094 805 | Cn1ccnc1Br CHEMBL305538 806 | NC1=NCCCS1 CHEMBL306541 807 | ONc1ccccc1 
CHEMBL320474 808 | Brc1ccncc1 CHEMBL325044 809 | O=Cc1cccs1 CHEMBL328441 810 | Brc1ccccn1 CHEMBL331374 811 | BrC1NCC=N1 CHEMBL330406 812 | NP(N)(O)=O CHEMBL333905 813 | CC(C)(C)OO CHEMBL348399 814 | CC(C)(C)Br CHEMBL347644 815 | COC(OC)C#C CHEMBL349188 816 | OCCN(CCO)N=O CHEMBL1334 817 | CC(C)(C)Cl CHEMBL346997 818 | O=NN1CCCC1 CHEMBL351175 819 | O=NN1CCOC1 CHEMBL351189 820 | CC(Cl)CCCl CHEMBL352037 821 | CC1CCN=C1N CHEMBL359703 822 | CCCCC(N)CS CHEMBL357138 823 | C=CCSSCC=C CHEMBL366603 824 | CCc1ccccc1 CHEMBL371561 825 | CNCC(C)CNC CHEMBL367458 826 | O=Cc1cocn1 CHEMBL444137 827 | CC(C)C(C)O CHEMBL443470 828 | CN(C)CCCCl CHEMBL449411 829 | BrC(=C)C=O CHEMBL447065 830 | NCCSC(N)=N CHEMBL454761 831 | CCCCCC(C)O CHEMBL449522 832 | CCCCC(O)CC CHEMBL452729 833 | C1CCCCCCC1 CHEMBL452651 834 | CC(C)=CC=O CHEMBL453815 835 | NNc1ccccc1 CHEMBL456807 836 | CC(C)(C)CO CHEMBL458630 837 | NC1CCCCC1 CHEMBL1794762 838 | CCCCCCCCl CHEMBL1797136 839 | OC(CCl)CCl CHEMBL468581 840 | CCCSC(N)=N CHEMBL483092 841 | CCCCC(C)C CHEMBL1797267 842 | OC1CCCCCC1 CHEMBL503332 843 | CCNC(N)=NN CHEMBL507240 844 | OCCC(O)=O CHEMBL1205969 845 | Clc1ccccn1 CHEMBL509579 846 | CSSc1cccs1 CHEMBL554538 847 | CC(C)(C)NO CHEMBL555486 848 | OCCn1ccnc1 CHEMBL555293 849 | ClC(=C)C=C CHEMBL555660 850 | O=C1NCCO1 CHEMBL1867161 851 | CCCCN=C=S CHEMBL1814588 852 | NCCc1cccs1 CHEMBL252803 853 | CN(C)CCCO CHEMBL1209424 854 | CC(=O)CBr CHEMBL1085947 855 | OC(=O)C=C CHEMBL1213529 856 | OC(=O)C#C CHEMBL1213530 857 | CSCC(N)CS CHEMBL1159811 858 | CCCCCCCCN CHEMBL1160509 859 | Fc1ccccn1 CHEMBL1162360 860 | OC(=O)C=O CHEMBL1162545 861 | FC1CCCNC1 CHEMBL1162289 862 | Fc1cccnc1 CHEMBL1162361 863 | Nc1cnccn1 CHEMBL1834089 864 | COC(=O)CN CHEMBL1193103 865 | CNC(=N)NC CHEMBL1193979 866 | CSCCNCCCN CHEMBL1191798 867 | CCSCCNCCN CHEMBL1195713 868 | NCCC(N)=O CHEMBL1229081 869 | O=C1CCCCO1 CHEMBL452383 870 | NC(=N)NN=O CHEMBL447467 871 | CCCCCCCCCCO CHEMBL25363 872 | OCCCCCCBr CHEMBL1231334 873 | CCCC(N)=O CHEMBL1231396 874 | CCOCCOCCO CHEMBL1230841 875 | OCC(=O)CO CHEMBL1229937 876 | [C-]#[O+] CHEMBL1231840 877 | CCCCCCC#N CHEMBL1231869 878 | O=CCCCC=O CHEMBL1235482 879 | CCOP(O)=O CHEMBL1231082 880 | CC(=O)CCl CHEMBL1231084 881 | CN(C)CCCN CHEMBL1232234 882 | CNC(=O)NC CHEMBL1234380 883 | COCCOCCOC CHEMBL1234162 884 | OCCC(O)CO CHEMBL1356759 885 | COC(=O)CS CHEMBL1341329 886 | CC(=O)NBr CHEMBL1256514 887 | NCCS(O)=O CHEMBL1256480 888 | C1CCCNCC1 CHEMBL1375444 889 | CCCCCCC=C CHEMBL1376677 890 | CC(=C)C#N CHEMBL1529759 891 | N#CSCSC#N CHEMBL1524617 892 | CCCC(C)CO CHEMBL1569610 893 | CC(=C)C=C CHEMBL1566132 894 | CC(=O)C=C CHEMBL1600824 895 | ClCCOCCCl CHEMBL1613350 896 | ONCC(O)=O CHEMBL1645222 897 | COCC(O)=O CHEMBL1697714 898 | OCCNCNC=O CHEMBL1984734 899 | CONC(C)=O CHEMBL1990145 900 | O=S1CSCS1 CHEMBL1971386 901 | OCC#CCNCl CHEMBL1998044 902 | N=C1SCCS1 CHEMBL2009648 903 | ICC1CCCO1 CHEMBL1999579 904 | NC(=N)CCl CHEMBL2365371 905 | S=C1NCCS1 CHEMBL2398099 906 | CCOC(S)=S CHEMBL3039661 907 | CC(C)CCCO CHEMBL2260955 908 | OCCCCCCCl CHEMBL2260959 909 | CN(C)CC#N CHEMBL2448941 910 | CCCCN(C)C CHEMBL2448977 911 | COCCN(C)C CHEMBL2448835 912 | CN1CCOCC1 CHEMBL2448839 913 | O=COC1CC1 CHEMBL2924223 914 | OCC1CCOC1 CHEMBL2287525 915 | CCC(Cl)Cl CHEMBL2287704 916 | OCC1CCCO1 CHEMBL2287521 917 | CCc1ccco1 CHEMBL2269084 918 | CCC(O)C=C CHEMBL2269086 919 | CCC(C)C=O CHEMBL2270060 920 | CCCSSSCCC CHEMBL3222023 921 | NNC(=S)NN CHEMBL3181818 922 | CC(C)(C)S CHEMBL3182458 923 | CNC(=S)NN CHEMBL3182946 924 | CCC(=O)OC CHEMBL3183973 925 | COC(=O)NN CHEMBL3183780 
926 | CCC(CC)CO CHEMBL3181836 927 | BrCCCCCBr CHEMBL3182198 928 | COC(Cl)=O CHEMBL3182300 929 | CCC(C)=NO CHEMBL3181847 930 | CC(C)=CCO CHEMBL3184952 931 | OCCCCOC=C CHEMBL3182123 932 | ClCC=CCCl CHEMBL3182125 933 | CCCCCCCCS CHEMBL3182056 934 | COP(=O)OC CHEMBL3183964 935 | COC(=O)OC CHEMBL3185216 936 | C1COC=CC1 CHEMBL3184439 937 | CC(C)OC=O CHEMBL3184082 938 | O=C1OCCO1 CHEMBL3181803 939 | CCCCNCCCC CHEMBL3184528 940 | COP(OC)OC CHEMBL3186364 941 | CC(C)CC#N CHEMBL3186839 942 | COC(C)CCO CHEMBL3186019 943 | CCOCC(C)O CHEMBL3188294 944 | OCC(O)CCl CHEMBL3185949 945 | CCCSSCC=C CHEMBL3187351 946 | COC(OC)OC CHEMBL3187679 947 | COCC(O)CO CHEMBL3187682 948 | CCCCCON=O CHEMBL3188202 949 | SC1CCCCC1 CHEMBL3187982 950 | C1C=CC=C1 CHEMBL3188826 951 | C=CCNCC=C CHEMBL3186706 952 | CCC(CO)CO CHEMBL3187400 953 | N#CCNCC#N CHEMBL3186090 954 | CC(C)OCCO CHEMBL3187409 955 | CNC(=S)NC CHEMBL3189044 956 | CSCCCCCCN CHEMBL3247585 957 | OCC(=O)CF CHEMBL3276492 958 | OCC(O)CBr CHEMBL3276497 959 | Ic1ccccn1 CHEMBL3274303 960 | OCC(CS)CS CHEMBL3425833 961 | CC(=C)CCO CHEMBL3561140 962 | SCc1ccco1 CHEMBL3560314 963 | O=S1OCCO1 CHEMBL3561007 964 | NCc1ccco1 CHEMBL3561633 965 | CCCCSCCCC CHEMBL3561568 966 | CN(C)CCCl CHEMBL3580424 967 | CCCCNCC=C CHEMBL3558346 968 | CN1CCCNC1=O CHEMBL12319 969 | NCc1ccccc1F CHEMBL12892 970 | COc1ccccc1O CHEMBL13766 971 | ON=Cc1cnsn1 CHEMBL19205 972 | ON=Cc1cnns1 CHEMBL19953 973 | NCC=CC(O)=O CHEMBL32307 974 | NCC=CC(O)=O CHEMBL33086 975 | CCC1CCOC1=O CHEMBL35976 976 | CCC1CCSC1=O CHEMBL36472 977 | CCC1CCNC1=O CHEMBL57505 978 | CSc1nccnc1C CHEMBL97593 979 | CN1CC=NC1Br CHEMBL99153 980 | CC(=C)C=NO CHEMBL104456 981 | Nc1ccccc1S CHEMBL116835 982 | Cc1ncccc1O CHEMBL134348 983 | CCCCCCC=NO CHEMBL137790 984 | CN1CCSC1=N CHEMBL167256 985 | Nc1ccccc1F CHEMBL195328 986 | Oc1ccccc1F CHEMBL224144 987 | Oc1ccccc1I CHEMBL225564 988 | Oc1ccccc1O CHEMBL280998 989 | CC1CCSC1=O CHEMBL287617 990 | Cc1cccnc1N CHEMBL291544 991 | CN1CC=NC1F CHEMBL327284 992 | CC(Cl)=CCl CHEMBL346519 993 | Cc1ccccc1F CHEMBL352215 994 | Nc1ccncc1N CHEMBL354077 995 | CN1CCSC1=S CHEMBL397404 996 | CCCCC=CC=O CHEMBL454759 997 | BrCC=CCBr CHEMBL1324439 998 | N#CC=CC#N CHEMBL1451833 999 | COCC=CCOC CHEMBL1994550 1000 | -------------------------------------------------------------------------------- /examples/qed_property_example.txt: -------------------------------------------------------------------------------- 1 | 0.4068|CCO 2 | 0.3598|C 3 | 0.3853|CO 4 | 0.419|NCCS 5 | 0.4035|NCCN 6 | 0.3847|CN 7 | 0.3606|C=O 8 | 0.4062|CCN 9 | 0.4028|CSC 10 | 0.3936|CBr 11 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 88 3 | skip-string-normalization = false 4 | target-version = ['py37'] 5 | 6 | [tool.isort] 7 | multi_line_output = 3 8 | include_trailing_comma = true 9 | force_grid_wrap = 0 10 | use_parentheses = true 11 | ensure_newline_before_comments = true 12 | line_length = 88 13 | force_to_top = ["rdkit"] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==v3.1.0 2 | torch==1.7.1 3 | tqdm 4 | numpy 5 | modlamp>=4.3.0 6 | selfies==1.0.4 7 | psutil -------------------------------------------------------------------------------- /scripts/create_vocabulary.py: 
-------------------------------------------------------------------------------- 1 | """Create a vocabulary.""" 2 | import argparse 3 | import os 4 | from collections import Counter 5 | 6 | from tqdm import tqdm 7 | 8 | from terminator.tokenization import ExpressionTokenizer 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument( 12 | "input_filepath", type=str, help="data used to create a vocabulary." 13 | ) 14 | parser.add_argument( 15 | "output_filepath", type=str, help="output where to store the vocabulary." 16 | ) 17 | parser.add_argument( 18 | "--max_exponent", type=int, default=5, help="maximum exponent for num-tokens." 19 | ) 20 | 21 | 22 | def main() -> None: 23 | """Create a vocabulary using an ExpressionTokenizer.""" 24 | args = parser.parse_args() 25 | input_filepath = args.input_filepath 26 | output_filepath = args.output_filepath 27 | max_exponent = args.max_exponent 28 | 29 | vocabulary_counter = Counter() 30 | tokenizer = ExpressionTokenizer() 31 | 32 | # tokens for properties 33 | vocabulary_counter.update( 34 | [ 35 | "", 36 | "", 37 | "", 38 | "", 39 | "", 40 | "", 41 | "", 42 | "", 43 | "", 44 | "", 45 | ] 46 | ) 47 | # tokens for property numerical values 48 | digits = list(range(10)) 49 | vocabulary_counter.update( 50 | [ 51 | f"_{digit}_{exponent}_" 52 | for exponent in range(max_exponent + 1) 53 | for digit in digits 54 | ] 55 | + [ 56 | f"_{digit}_-{exponent}_" 57 | for exponent in range(max_exponent + 1) 58 | for digit in digits 59 | ] 60 | ) 61 | with open(input_filepath, "rt") as fp: 62 | for line in tqdm(fp): 63 | vocabulary_counter.update(tokenizer.tokenize(line.strip())) 64 | 65 | # special tokens for the model training and keeping the possibility to extend the vocabulart 66 | special_tokens = [ 67 | "[PAD]", 68 | "[unused1]", 69 | "[unused2]", 70 | "[unused3]", 71 | "[unused4]", 72 | "[unused5]", 73 | "[unused6]", 74 | "[unused7]", 75 | "[unused8]", 76 | "[unused9]", 77 | "[unused10]", 78 | "[UNK]", 79 | "[CLS]", 80 | "[SEP]", 81 | "[MASK]", 82 | ] 83 | 84 | with open(output_filepath, "wt") as fp: 85 | tokens = special_tokens + [ 86 | token for token, _ in vocabulary_counter.most_common() 87 | ] 88 | fp.write(os.linesep.join(tokens)) 89 | 90 | 91 | if __name__ == "__main__": 92 | main() 93 | -------------------------------------------------------------------------------- /scripts/eval_language_modeling.py: -------------------------------------------------------------------------------- 1 | """ 2 | Language modeling evaluation script 3 | """ 4 | import json 5 | import logging 6 | import math 7 | import os 8 | import sys 9 | from time import time 10 | 11 | import pandas as pd 12 | from transformers import ( 13 | AutoConfig, 14 | AutoModelWithLMHead, 15 | DataCollatorForPermutationLanguageModeling, 16 | HfArgumentParser, 17 | set_seed, 18 | ) 19 | 20 | from terminator.args import CustomTrainingArguments, EvalArguments 21 | from terminator.collators import ( 22 | ConditionalGenerationEvaluationCollator, 23 | PropertyCollator, 24 | ) 25 | from terminator.datasets import get_dataset 26 | from terminator.evaluator import Evaluator 27 | from terminator.property_predictors import PREDICT_FACTORY 28 | from terminator.tokenization import ExpressionBertTokenizer 29 | from terminator.trainer import get_trainer_dict 30 | from terminator.utils import ( 31 | disable_rdkit_logging, 32 | find_safe_path, 33 | get_latest_checkpoint, 34 | get_equispaced_ranges, 35 | ) 36 | 37 | logger = logging.getLogger(__name__) 38 | 39 | 40 | # setup logging 41 | 
logging.basicConfig(stream=sys.stdout, level=logging.INFO) 42 | 43 | 44 | def main(): 45 | 46 | parser = HfArgumentParser((CustomTrainingArguments, EvalArguments)) 47 | training_args, eval_args = parser.parse_args_into_dataclasses() 48 | 49 | # Setup logging 50 | logging.basicConfig( 51 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 52 | datefmt="%m/%d/%Y %H:%M:%S", 53 | level=logging.INFO, 54 | ) 55 | 56 | with open(eval_args.param_path, "r") as f: 57 | eval_params = json.load(f) 58 | 59 | param_filename = eval_args.param_path.split("/")[-1].split(".json")[0] 60 | 61 | # Wrap into args to be safe 62 | eval_args.__dict__.update(eval_params) 63 | 64 | # NOTE: Results will be stored in model folder 65 | model_dir = training_args.output_dir 66 | if "checkpoint" not in model_dir: 67 | model_dir = get_latest_checkpoint( 68 | model_dir, must_contain=eval_params.get("checkpoint-str", "best") 69 | ) 70 | 71 | config_name = os.path.join(model_dir, "config.json") 72 | with open(config_name, "r") as f: 73 | model_params = json.load(f) 74 | 75 | config = AutoConfig.from_pretrained( 76 | config_name, mem_len=model_params.get("mem_len", 1024) 77 | ) 78 | 79 | tokenizer = ExpressionBertTokenizer.from_pretrained(model_dir) 80 | sep = tokenizer.expression_separator 81 | 82 | model = AutoModelWithLMHead.from_pretrained( 83 | model_dir, from_tf=bool(".ckpt" in model_dir), config=config 84 | ) 85 | logger.info(f"Model restored from {model_dir}") 86 | 87 | model.resize_token_embeddings(len(tokenizer)) 88 | 89 | if eval_params.get("block_size", -1) <= 0: 90 | eval_params["block_size"] = tokenizer.max_len 91 | # Our input block size will be the max possible for the model 92 | else: 93 | eval_params["block_size"] = min(training_args.block_size, tokenizer.max_len) 94 | 95 | # Get datasets 96 | eval_dataset = get_dataset( 97 | eval_args.eval_file, 98 | block_size=eval_params["block_size"], 99 | tokenizer=tokenizer, 100 | line_by_line=eval_params.get("line_by_line", True), 101 | ) 102 | 103 | logger.info(f"Dataset size {len(eval_dataset)}.") 104 | num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 105 | logger.info(f"Number of parameters {num_params} of type {type(model)}") 106 | 107 | plm_prob = eval_params["plm_probability"] 108 | perplexity_plm_prob = eval_params.get("perplexity_plm_prob", 0.2) 109 | # NOTE: This collator does not provide an attention mask (unlike the refined training 110 | # collators which prevent attention on padding), however, the model will largely 111 | # ignore the paddings. 
112 | vanilla_collator = DataCollatorForPermutationLanguageModeling( 113 | tokenizer=tokenizer, 114 | plm_probability=perplexity_plm_prob, 115 | max_span_length=eval_params["max_span_length"], 116 | ) 117 | 118 | custom_trainer_params = get_trainer_dict(model_params) 119 | 120 | # Initialize our Evaluator 121 | evaluator = Evaluator( 122 | model=model, 123 | args=training_args, 124 | eval_params=eval_params, 125 | data_collator=vanilla_collator, 126 | eval_dataset=eval_dataset, 127 | tokenizer=tokenizer, 128 | prediction_loss_only=False, 129 | **custom_trainer_params, 130 | ) 131 | 132 | # Evaluation 133 | result_dir = os.path.join(model_dir, "results") 134 | os.makedirs(result_dir, exist_ok=True) 135 | eval_filename = eval_args.eval_file.split("/")[-1].split("_")[-1].split(".")[0] 136 | logger.info("*** Evaluate perplexity ***") 137 | 138 | with open(eval_args.eval_file, "r") as f: 139 | prefix = sep.join(f.readline().split(sep)[:-1]) + sep 140 | 141 | # Set seed 142 | if eval_params.get("set_seed", True): 143 | set_seed(eval_params.get("seed", int(time()))) 144 | 145 | eval_output = evaluator.evaluate() 146 | perplexity = math.exp(eval_output["eval_loss"]) 147 | results = {"perplexity": perplexity} 148 | path = os.path.join( 149 | result_dir, f"{eval_filename}_perplexity_plm_{perplexity_plm_prob}.txt" 150 | ) 151 | 152 | with open(find_safe_path(path), "w") as writer: 153 | logger.info("***** Eval results *****") 154 | for key in sorted(results.keys()): 155 | logger.info(" %s = %s", key, str(results[key])) 156 | writer.write("%s = %s\n" % (key, str(results[key]))) 157 | 158 | disable_rdkit_logging() 159 | property_results = [] 160 | properties = eval_params["property_tokens"] 161 | orders = eval_params.get("property_token_masking_order", None) 162 | tokens_to_mask = eval_params.get("property_tokens_to_mask", None) 163 | conditioning_ranges = eval_params.get( 164 | "conditioning_range", 165 | get_equispaced_ranges( 166 | eval_args.eval_file, 167 | properties, 168 | precisions=eval_params.get("property_precisions", [2] * len(properties)), 169 | ), 170 | ) 171 | logger.info(f"Conditioning range is {conditioning_ranges}") 172 | 173 | # If the token masking orders is not specified we just evaluate all properties together 174 | if not orders: 175 | property_collator = PropertyCollator( 176 | tokenizer=tokenizer, 177 | property_tokens=properties, 178 | num_tokens_to_mask=tokens_to_mask, 179 | mask_token_order=orders, 180 | ) 181 | ps, rs = evaluator.multi_property_prediction( 182 | property_collator, 183 | save_path=os.path.join(result_dir, eval_filename), 184 | rmse_factor=eval_params.get("rmse_factor", 1), 185 | ) 186 | else: 187 | 188 | for prop, order, mask in zip(properties, orders, tokens_to_mask): 189 | logger.info(f"*** Evaluate property {prop} ***") 190 | 191 | for to_mask in mask: 192 | 193 | # We iteratively make the task harder by masking 1-4 tokens. 194 | # The order of this is determined by `property_token_masking_order`. 
195 | property_collator = PropertyCollator( 196 | tokenizer=tokenizer, 197 | property_tokens=[prop], 198 | num_tokens_to_mask=[to_mask], 199 | mask_token_order=[order], 200 | ) 201 | print(f"Masking {to_mask} in order {order}") 202 | ps, rs, ss = evaluator.property_prediction( 203 | property_collator, 204 | save_path=os.path.join( 205 | result_dir, f"{prop[1:-1]}_{eval_filename}_mask_{to_mask}.csv" 206 | ), 207 | rmse_factor=eval_params.get("rmse_factor", 1), 208 | ) 209 | for p, r, s, n in zip(ps, rs, ss, ["Greedy", "Sampling", "Beam"]): 210 | prop_res_dict = { 211 | "prop": prop[1:-1], 212 | "pearson": p, 213 | "spearman": s, 214 | "rmse": r, 215 | "search": n, 216 | "num_masked": to_mask, 217 | } 218 | property_results.append(prop_res_dict) 219 | 220 | pd.DataFrame(property_results).to_csv( 221 | os.path.join(result_dir, f"property_prediction_{eval_filename}.csv") 222 | ) 223 | for prop, cr in zip(properties, conditioning_ranges): 224 | logger.info(f"Evaluating conditional generation for {prop} with {cr}") 225 | conditional_generation_collator = ConditionalGenerationEvaluationCollator( 226 | tokenizer=tokenizer, 227 | property_token=prop, 228 | conditioning_range=cr, 229 | plm_probability=plm_prob, 230 | max_span_length=eval_params["max_span_length"], 231 | entity_to_mask=eval_params.get("entity_to_mask", None), 232 | entity_separator_token=eval_params.get("entity_separator_token", None), 233 | ) 234 | 235 | # Retrieve the property prediction function from dictionary 236 | if prop[1:-1] in PREDICT_FACTORY.keys(): 237 | evaluate_fn = PREDICT_FACTORY[prop[1:-1]] 238 | logger.info(f"Found property predictor for {prop}") 239 | property_collator = None 240 | else: 241 | # If unavailable property is predicted 242 | evaluate_fn = None 243 | 244 | if orders: 245 | # In single property prediction mode we just mask the property 246 | property_collator = PropertyCollator( 247 | tokenizer=tokenizer, 248 | property_tokens=[prop], 249 | num_tokens_to_mask=[-1], 250 | mask_token_order=None, 251 | ) 252 | else: 253 | # in this case, we use the property predictor from above where all tokens are masked 254 | pass 255 | 256 | logger.info( 257 | f"No property predictor for {prop}, using model itself for evaluation" 258 | ) 259 | 260 | evaluator.conditional_generation( 261 | conditional_generation_collator, 262 | save_path=os.path.join( 263 | result_dir, 264 | f"{prop[1:-1]}_conditional_generation_{param_filename}_{eval_filename}.csv", 265 | ), 266 | passed_eval_fn=evaluate_fn, 267 | property_collator=property_collator, 268 | denormalize_params=eval_params.get("denormalize", {}).get(prop, None), 269 | prefix=prefix, 270 | ) 271 | 272 | print("Done, shutting down.") 273 | 274 | 275 | if __name__ == "__main__": 276 | main() 277 | -------------------------------------------------------------------------------- /scripts/eval_lm_nlp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Language modeling adapted from Huggingface transformers. 
4 | 5 | The file is an adaptation of https://github.com/huggingface/transformers/blob/v3.1.0/examples/language-modeling/run_language_modeling.py 6 | 7 | """ 8 | 9 | import json 10 | import logging 11 | import math 12 | import os 13 | import warnings 14 | from dataclasses import dataclass, field 15 | from typing import Optional 16 | 17 | import pandas as pd 18 | import torch 19 | import transformers 20 | from transformers import ( 21 | CONFIG_MAPPING, 22 | MODEL_WITH_LM_HEAD_MAPPING, 23 | AutoConfig, 24 | AutoModelWithLMHead, 25 | DataCollatorForLanguageModeling, 26 | DataCollatorForPermutationLanguageModeling, 27 | HfArgumentParser, 28 | LineByLineTextDataset, 29 | PreTrainedTokenizer, 30 | TextDataset, 31 | XLNetConfig, 32 | XLNetLMHeadModel, 33 | set_seed, 34 | ) 35 | 36 | from terminator.args import CustomTrainingArguments, EvalArguments, ModelArguments 37 | from terminator.collators import TRAIN_COLLATORS, PropertyCollator 38 | from terminator.datasets import get_dataset 39 | from terminator.evaluator import Evaluator 40 | from terminator.tokenization import PropertyTokenizerSquare, XLNetRTTokenizer 41 | from terminator.trainer import CustomTrainer, get_trainer_dict 42 | from terminator.utils import get_latest_checkpoint 43 | 44 | logger = logging.getLogger(__name__) 45 | 46 | 47 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 48 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 49 | 50 | 51 | @dataclass 52 | class DataTrainingArguments: 53 | """ 54 | Arguments pertaining to what data we are going to input our model for training and eval. 55 | """ 56 | 57 | eval_data_file: Optional[str] = field( 58 | default=None, 59 | metadata={ 60 | "help": "An optional input evaluation data file to evaluate the perplexity on (a text file)." 61 | }, 62 | ) 63 | line_by_line: bool = field( 64 | default=False, 65 | metadata={ 66 | "help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences." 67 | }, 68 | ) 69 | 70 | mlm: bool = field( 71 | default=False, 72 | metadata={ 73 | "help": "Train with masked-language modeling loss instead of language modeling." 74 | }, 75 | ) 76 | mlm_probability: float = field( 77 | default=0.15, 78 | metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}, 79 | ) 80 | plm_probability: float = field( 81 | default=1 / 6, 82 | metadata={ 83 | "help": "Ratio of length of a span of masked tokens to surrounding context length for permutation language modeling." 84 | }, 85 | ) 86 | max_span_length: int = field( 87 | default=5, 88 | metadata={ 89 | "help": "Maximum length of a span of masked tokens for permutation language modeling." 90 | }, 91 | ) 92 | 93 | block_size: int = field( 94 | default=-1, 95 | metadata={ 96 | "help": "Optional input sequence length after tokenization." 97 | "The training dataset will be truncated in block of this size for training." 98 | "Default to the model max input length for single sentence inputs (take into account special tokens)." 99 | }, 100 | ) 101 | overwrite_cache: bool = field( 102 | default=False, 103 | metadata={"help": "Overwrite the cached training and evaluation sets"}, 104 | ) 105 | 106 | 107 | def main(): 108 | # See all possible arguments in src/transformers/training_args.py 109 | # or by passing the --help flag to this script. 110 | # We now keep distinct sets of args, for a cleaner separation of concerns. 
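Because the arguments are assembled from dataclasses by `HfArgumentParser`, each field name becomes a command-line flag (for example `--output_dir`, `--param_path`, `--eval_file`). The short sketch below illustrates that mapping with a toy dataclass and explicit argument strings; the dataclass and the `data/` path are placeholders for illustration, while `configs/qed_eval.json` is a file that ships with the repository.

```python
from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser


@dataclass
class ToyEvalArguments:
    """Illustrative stand-in for the EvalArguments dataclass used by these scripts."""

    param_path: Optional[str] = field(
        default=None, metadata={"help": "JSON file with evaluation parameters."}
    )
    eval_file: Optional[str] = field(
        default=None, metadata={"help": "Evaluation data file."}
    )


parser = HfArgumentParser(ToyEvalArguments)
# Field names become flags; passing explicit strings mimics a command-line call.
(toy_args,) = parser.parse_args_into_dataclasses(
    args=["--param_path", "configs/qed_eval.json", "--eval_file", "data/my_eval_set.txt"]
)
print(toy_args.param_path, toy_args.eval_file)
```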
111 | 112 | parser = HfArgumentParser((CustomTrainingArguments, EvalArguments)) 113 | training_args, eval_args = parser.parse_args_into_dataclasses() 114 | 115 | # Setup logging 116 | logging.basicConfig( 117 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 118 | datefmt="%m/%d/%Y %H:%M:%S", 119 | level=logging.INFO, 120 | ) 121 | 122 | with open(eval_args.param_path, "r") as f: 123 | eval_params = json.load(f) 124 | 125 | # Wrap into args to be safe 126 | eval_args.__dict__.update(eval_params) 127 | 128 | if not os.path.exists(training_args.output_dir): 129 | raise ValueError( 130 | f"Output directory ({training_args.output_dir}) does not exist" 131 | ) 132 | # Setup logging 133 | logging.basicConfig( 134 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 135 | datefmt="%m/%d/%Y %H:%M:%S", 136 | level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, 137 | ) 138 | 139 | # Set seed 140 | set_seed(training_args.seed) 141 | 142 | model_dir = training_args.output_dir 143 | if "checkpoint" not in model_dir: 144 | model_dir = get_latest_checkpoint( 145 | model_dir, must_contain=eval_params.get("checkpoint-str", "best") 146 | ) 147 | 148 | config_name = os.path.join(model_dir, "config.json") 149 | with open(config_name, "r") as f: 150 | model_params = json.load(f) 151 | 152 | config = AutoConfig.from_pretrained(config_name) 153 | 154 | model = XLNetLMHeadModel.from_pretrained(model_dir, config=config) 155 | logger.info(f"Model restored from {model_dir}") 156 | 157 | tokenizer = XLNetRTTokenizer.from_pretrained(model_dir) 158 | property_tokenizer = PropertyTokenizerSquare() 159 | tokenizer.set_property_tokenizer(property_tokenizer) 160 | tokenizer.set_vocab() 161 | # Otherwise the freshly added tokens are added as special tokens. 
162 | # tokenizer.unique_no_split_tokens = tokenizer.unique_no_split_tokens[:9] 163 | 164 | logger.info(f"PyTorch version: {torch.__version__}") 165 | # model.resize_token_embeddings(len(tokenizer)) 166 | 167 | if eval_params.get("block_size", -1) <= 0: 168 | eval_params["block_size"] = tokenizer.max_len 169 | # Our input block size will be the max possible for the model 170 | else: 171 | eval_params["block_size"] = min(training_args.block_size, tokenizer.max_len) 172 | 173 | eval_dataset = get_dataset( 174 | eval_args.eval_file, 175 | tokenizer=tokenizer, 176 | block_size=eval_params["block_size"], 177 | line_by_line=eval_params.get("line_by_line", True), 178 | ) 179 | 180 | logger.info(f"Dataset sizes, {len(eval_dataset)}.") 181 | num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 182 | logger.info(f"Number of parameters {num_params} of type {type(model)}") 183 | 184 | custom_trainer_params = get_trainer_dict(model_params) 185 | 186 | _unused_collator = DataCollatorForPermutationLanguageModeling( 187 | tokenizer=tokenizer, plm_probability=0.1, max_span_length=2 188 | ) 189 | 190 | # Initialize our Evaluator 191 | evaluator = Evaluator( 192 | model=model, 193 | args=training_args, 194 | eval_params=eval_params, 195 | data_collator=_unused_collator, 196 | eval_dataset=eval_dataset, 197 | tokenizer=tokenizer, 198 | prediction_loss_only=False, 199 | **custom_trainer_params, 200 | ) 201 | 202 | # Evaluation 203 | result_dir = os.path.join(model_dir, "results") 204 | os.makedirs(result_dir, exist_ok=True) 205 | eval_filename = eval_args.eval_file.split("/")[-1].split("_")[-1].split(".")[0] 206 | logger.info("*** Evaluate perplexity ***") 207 | 208 | property_results = [] 209 | properties = eval_params["property_token"] 210 | orders = eval_params.get("property_token_masking_order", None) 211 | tokens_to_mask = eval_params.get("property_tokens_to_mask", None) 212 | 213 | for prop, order, mask in zip(properties, orders, tokens_to_mask): 214 | logger.info(f"*** Evaluate property {prop} ***") 215 | 216 | for to_mask in mask: 217 | 218 | # We iteratively make the task harder by masking 1-4 tokens. 219 | # The order of this is determined by `property_token_masking_order`. 220 | property_collator = PropertyCollator( 221 | tokenizer=tokenizer, 222 | property_tokens=[prop], 223 | num_tokens_to_mask=[to_mask], 224 | mask_token_order=[order], 225 | ) 226 | print(f"Masking {to_mask} in order {order}") 227 | ps, rs, ss = evaluator.property_prediction( 228 | property_collator, 229 | save_path=os.path.join( 230 | result_dir, f"{prop[1:-1]}_{eval_filename}_mask_{to_mask}.csv" 231 | ), 232 | ) 233 | for p, r, s, n in zip(ps, rs, ss, ["Greedy", "Sampling", "Beam"]): 234 | prop_res_dict = { 235 | "prop": prop[1:-1], 236 | "pearson": p, 237 | "spearman": s, 238 | "rmse": r, 239 | "search": n, 240 | "num_masked": to_mask, 241 | } 242 | property_results.append(prop_res_dict) 243 | 244 | pd.DataFrame(property_results).to_csv( 245 | os.path.join(result_dir, f"property_prediction_{eval_filename}.csv") 246 | ) 247 | 248 | 249 | if __name__ == "__main__": 250 | main() 251 | -------------------------------------------------------------------------------- /scripts/eval_regressionhead.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Language modeling adapted from Huggingface transformers. 
4 | """ 5 | import json 6 | import logging 7 | import os 8 | from dataclasses import dataclass, field 9 | from typing import Dict, List, Optional, Tuple, Union 10 | 11 | import numpy as np 12 | import pandas as pd 13 | import torch 14 | import torch.nn as nn 15 | import transformers 16 | from scipy.stats import pearsonr, spearmanr 17 | from selfies import decoder, encoder 18 | from sklearn.metrics import mean_squared_error 19 | from torch.optim import AdamW 20 | from torch.utils.data import DataLoader, Dataset 21 | from tqdm import tqdm 22 | from transformers import ( 23 | CONFIG_MAPPING, 24 | MODEL_WITH_LM_HEAD_MAPPING, 25 | AutoConfig, 26 | AutoModelWithLMHead, 27 | DataCollatorForLanguageModeling, 28 | DataCollatorForPermutationLanguageModeling, 29 | HfArgumentParser, 30 | LineByLineTextDataset, 31 | PreTrainedTokenizer, 32 | TextDataset, 33 | XLNetConfig, 34 | XLNetForSequenceClassification, 35 | XLNetLMHeadModel, 36 | get_linear_schedule_with_warmup, 37 | set_seed, 38 | ) 39 | from transformers.tokenization_utils_base import BatchEncoding 40 | 41 | from terminator.args import CustomTrainingArguments, ModelArguments 42 | from terminator.collators import TRAIN_COLLATORS 43 | from terminator.datasets import get_dataset 44 | from terminator.tokenization import ExpressionBertTokenizer 45 | from terminator.trainer import CustomTrainer, get_trainer_dict 46 | from terminator.utils import get_latest_checkpoint 47 | 48 | transformers.logging.set_verbosity_info() 49 | logger = logging.getLogger(__name__) 50 | # logger.setLevel(level=logging.DEBUG) 51 | 52 | 53 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 54 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 55 | 56 | 57 | @dataclass 58 | class DataTrainingArguments: 59 | """ 60 | Arguments pertaining to what data we are going to input our model for training and eval. 61 | """ 62 | 63 | eval_data_file: Optional[str] = field( 64 | default=None, 65 | metadata={ 66 | "help": "An optional input evaluation data file to evaluate the perplexity on (a text file)." 67 | }, 68 | ) 69 | line_by_line: bool = field( 70 | default=False, 71 | metadata={ 72 | "help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences." 73 | }, 74 | ) 75 | 76 | block_size: int = field( 77 | default=-1, 78 | metadata={ 79 | "help": "Optional input sequence length after tokenization." 80 | "The training dataset will be truncated in block of this size for training." 81 | "Default to the model max input length for single sentence inputs (take into account special tokens)." 
82 | }, 83 | ) 84 | overwrite_cache: bool = field( 85 | default=False, 86 | metadata={"help": "Overwrite the cached training and evaluation sets"}, 87 | ) 88 | batch_size: Optional[int] = field(default=16, metadata={"help": "Batch size"}) 89 | 90 | 91 | class XLNetRegressionDataset(Dataset): 92 | def __init__(self, tokenizer, data_path): 93 | 94 | self.tokenizer = tokenizer 95 | 96 | # Lazy data loading 97 | with open(data_path, "r") as f: 98 | self.examples = [line.strip() for line in f.readlines()] 99 | 100 | def __len__(self): 101 | return len(self.examples) 102 | 103 | def __getitem__(self, i): 104 | prop, molecules = self.examples[i].split("|") 105 | label = float(prop.split(">")[-1]) 106 | model_input = self.tokenizer(molecules) 107 | return model_input, label 108 | 109 | 110 | @dataclass 111 | class Collator(DataCollatorForPermutationLanguageModeling): 112 | def finalize(self, batch: torch.Tensor, val: int = 0) -> torch.Tensor: 113 | """Sequence length has to be even for PLM collator, see: 114 | https://github.com/huggingface/transformers/issues/7341 115 | 116 | Args: 117 | batch (torch.Tensor): 2D Tensor (batch_size x seq_len) 118 | val (float): Value to fill with. 119 | 120 | Returns: 121 | torch.Tensor: 2D Tensor (batch_size x seq_len) 122 | """ 123 | if batch.size(1) % 2 != 0: 124 | return torch.cat([batch, torch.ones(batch.size(0), 1).long() * val], axis=1) 125 | return batch.long() 126 | 127 | def attention_mask(self, batch: torch.Tensor, dropout: float = 0.0) -> torch.Tensor: 128 | attention_mask = (~(batch == 0)).to(float) 129 | return attention_mask 130 | 131 | def __call__( 132 | self, examples: List[Tuple[Dict[str, List[int]], float]] 133 | ) -> Dict[str, torch.Tensor]: 134 | device = "cuda" if torch.cuda.is_available() else "cpu" 135 | model_inputs = [e[0]["input_ids"] for e in examples] 136 | inputs = self._tensorize_batch(model_inputs) 137 | inputs = self.finalize(inputs) 138 | 139 | attention_mask = self.attention_mask(inputs) 140 | 141 | labels = torch.Tensor([e[-1] for e in examples]) 142 | return labels.to(device), { 143 | "input_ids": inputs.to(device), 144 | "attention_mask": attention_mask.to(device), 145 | } 146 | 147 | 148 | def main(): 149 | 150 | # Switch off comet 151 | os.environ["COMET_MODE"] = "DISABLED" 152 | 153 | parser = HfArgumentParser( 154 | (ModelArguments, DataTrainingArguments, CustomTrainingArguments) 155 | ) 156 | model_args, data_args, train_args = parser.parse_args_into_dataclasses() 157 | print(model_args) 158 | print(data_args) 159 | 160 | if not os.path.exists(train_args.output_dir): 161 | raise ValueError(f"Output directory ({train_args.output_dir}) does not exists!") 162 | 163 | # Setup logging 164 | logging.basicConfig( 165 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 166 | datefmt="%m/%d/%Y %H:%M:%S", 167 | level=logging.INFO if train_args.local_rank in [-1, 0] else logging.WARN, 168 | ) 169 | logger.warning( 170 | "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 171 | train_args.local_rank, 172 | train_args.device, 173 | train_args.n_gpu, 174 | bool(train_args.local_rank != -1), 175 | train_args.fp16, 176 | ) 177 | logger.info("Training/evaluation parameters %s", train_args) 178 | 179 | # Set seed 180 | set_seed(train_args.seed) 181 | 182 | output_dir = train_args.output_dir 183 | model = XLNetForSequenceClassification.from_pretrained( 184 | output_dir, 185 | cache_dir=model_args.cache_dir, 186 | mem_len=1024, 187 | return_dict=True, 188 | ) 189 | 190 | 
logger.info(f"Model restored from {output_dir}") 191 | 192 | tokenizer = ExpressionBertTokenizer.from_pretrained(model_args.tokenizer_name) 193 | 194 | logger.info(f"PyTorch version: {torch.__version__}") 195 | # model.resize_token_embeddings(len(tokenizer)) 196 | 197 | if data_args.block_size <= 0: 198 | data_args.block_size = tokenizer.max_len 199 | # Our input block size will be the max possible for the model 200 | else: 201 | data_args.block_size = min(data_args.block_size, tokenizer.max_len) 202 | 203 | # Get datasets 204 | device = "cuda" if torch.cuda.is_available() else "cpu" 205 | fileprefix = data_args.eval_data_file.split("/")[-1].split(".")[0] 206 | logger.info(f"Results will be saved in {output_dir} with prefix {fileprefix}") 207 | # WHY ARE THE CORRELATIONS NEGATIVE? YEST WITH VALIDATIAON DATA 208 | dataset = XLNetRegressionDataset( 209 | tokenizer=tokenizer, data_path=data_args.eval_data_file 210 | ) 211 | model = model.to(device) 212 | collator = Collator(tokenizer=tokenizer) 213 | logger.info(f"Evaluation dataset size: {len(dataset)}.") 214 | num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 215 | total_params = sum(p.numel() for p in model.parameters()) 216 | logger.info( 217 | f"{total_params} parameters, {num_params} trainable. Model: {type(model)}" 218 | ) 219 | 220 | loader = DataLoader( 221 | dataset, 222 | batch_size=data_args.batch_size, 223 | drop_last=False, 224 | shuffle=False, 225 | collate_fn=collator, 226 | ) 227 | 228 | eval_seqs = [dataset.examples[i].split("|")[-1] for i in range(len(dataset))] 229 | 230 | model.eval() 231 | labels, predictions = [], [] 232 | with torch.no_grad(): 233 | for idx, (labs, inputs) in enumerate(loader): 234 | output = model(**inputs, labels=labs) 235 | prediction = output.logits.cpu().detach().squeeze().numpy() 236 | 237 | labels.extend(list(labs.cpu().detach().numpy())) 238 | predictions.extend(list(prediction)) 239 | 240 | rmse = np.sqrt(mean_squared_error(predictions, labels)) 241 | pearson = pearsonr(predictions, labels)[0] 242 | spearman = spearmanr(predictions, labels)[0] 243 | 244 | logger.info( 245 | f"Eval: RMSE:{rmse:.5f}, pearson:{pearson:.5f}, spearman:{spearman:.5f}" 246 | ) 247 | 248 | with open(os.path.join(output_dir, f"{fileprefix}_results.json"), "w") as f: 249 | json.dump( 250 | {"RMSE": str(rmse), "Pearson": str(pearson), "Spearman": str(spearman)}, 251 | f, 252 | indent=4, 253 | ) 254 | pd.DataFrame( 255 | { 256 | "sequence": eval_seqs, 257 | "predictions": list(predictions), 258 | "labels": list(labels), 259 | } 260 | ).to_csv(os.path.join(output_dir, f"{fileprefix}_predictions.csv")) 261 | 262 | 263 | if __name__ == "__main__": 264 | main() 265 | -------------------------------------------------------------------------------- /scripts/generate_example_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generate example data starting from a .smi file. 3 | 4 | We use QED of molecules as an example. 
5 | """ 6 | 7 | import argparse 8 | import os 9 | 10 | from rdkit import Chem 11 | from rdkit.Chem import QED 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("input_filepath", type=str, help="path to the .smi file.") 15 | parser.add_argument("output_filepath", type=str, help="output where to store the data.") 16 | 17 | 18 | def main() -> None: 19 | """Generate example data.""" 20 | args = parser.parse_args() 21 | input_filepath = args.input_filepath 22 | output_filepath = args.output_filepath 23 | 24 | with open(input_filepath, "rt") as fpr: 25 | with open(output_filepath, "wt") as fpw: 26 | smiles_generator = (line.strip().split("\t")[0] for line in fpr) 27 | for smiles in smiles_generator: 28 | try: 29 | fpw.write( 30 | f"{QED.qed(Chem.MolFromSmiles(smiles)):.4}|{smiles}{os.linesep}" 31 | ) 32 | except Exception: 33 | print(f"Problem processing SMILES={smiles}") 34 | 35 | 36 | if __name__ == "__main__": 37 | main() 38 | -------------------------------------------------------------------------------- /scripts/run_language_modeling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Language modeling adapted from Huggingface transformers. 4 | 5 | The file is an adaptation of https://github.com/huggingface/transformers/blob/v3.1.0/examples/language-modeling/run_language_modeling.py 6 | 7 | """ 8 | 9 | import json 10 | import logging 11 | import math 12 | import os 13 | import warnings 14 | from dataclasses import dataclass, field 15 | from typing import Optional 16 | 17 | import pandas as pd 18 | import torch 19 | import transformers 20 | from transformers import ( 21 | CONFIG_MAPPING, 22 | MODEL_WITH_LM_HEAD_MAPPING, 23 | AutoConfig, 24 | AutoModelWithLMHead, 25 | DataCollatorForLanguageModeling, 26 | DataCollatorForPermutationLanguageModeling, 27 | HfArgumentParser, 28 | LineByLineTextDataset, 29 | PreTrainedTokenizer, 30 | TextDataset, 31 | XLNetLMHeadModel, 32 | set_seed, 33 | ) 34 | 35 | from terminator.args import CustomTrainingArguments, ModelArguments 36 | from terminator.collators import TRAIN_COLLATORS 37 | from terminator.datasets import get_dataset 38 | from terminator.tokenization import ExpressionBertTokenizer 39 | from terminator.trainer import CustomTrainer, get_trainer_dict 40 | from terminator.utils import get_latest_checkpoint 41 | 42 | transformers.logging.set_verbosity_info() 43 | logger = logging.getLogger(__name__) 44 | # logger.setLevel(level=logging.DEBUG) 45 | 46 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 47 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 48 | 49 | 50 | @dataclass 51 | class DataTrainingArguments: 52 | """ 53 | Arguments pertaining to what data we are going to input our model for training and eval. 54 | """ 55 | 56 | train_data_file: Optional[str] = field( 57 | default=None, metadata={"help": "The input training data file (a text file)."} 58 | ) 59 | eval_data_file: Optional[str] = field( 60 | default=None, 61 | metadata={ 62 | "help": "Input evaluation data file to evaluate the perplexity on (a text file)." 63 | }, 64 | ) 65 | line_by_line: bool = field( 66 | default=False, 67 | metadata={ 68 | "help": "Whether lines of text in the dataset are to be handled as distinct samples." 69 | }, 70 | ) 71 | plm_probability: float = field( 72 | default=1 / 6, 73 | metadata={ 74 | "help": "Ratio of length of a span of masked tokens to surrounding context length for PLM." 
75 | }, 76 | ) 77 | max_span_length: int = field( 78 | default=5, metadata={"help": "Max length of a span of masked tokens for PLM."} 79 | ) 80 | 81 | block_size: int = field( 82 | default=-1, 83 | metadata={"help": "Optional input sequence length after tokenization."}, 84 | ) 85 | overwrite_cache: bool = field( 86 | default=False, 87 | metadata={"help": "Overwrite the cached training and evaluation sets"}, 88 | ) 89 | 90 | 91 | def main(): 92 | # See all possible arguments in src/transformers/training_args.py 93 | # or by passing the --help flag to this script. 94 | # We now keep distinct sets of args, for a cleaner separation of concerns. 95 | 96 | # Switch off comet 97 | os.environ["COMET_MODE"] = "DISABLED" 98 | 99 | parser = HfArgumentParser( 100 | (ModelArguments, DataTrainingArguments, CustomTrainingArguments) 101 | ) 102 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 103 | 104 | if data_args.eval_data_file is None and training_args.do_eval: 105 | raise ValueError( 106 | "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " 107 | "or remove the --do_eval argument." 108 | ) 109 | 110 | if ( 111 | os.path.exists(training_args.output_dir) 112 | and os.listdir(training_args.output_dir) 113 | and training_args.do_train 114 | and not training_args.overwrite_output_dir 115 | ): 116 | raise ValueError( 117 | f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." 118 | ) 119 | os.makedirs(training_args.output_dir, exist_ok=True) 120 | 121 | # Setup logging 122 | logging.basicConfig( 123 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 124 | datefmt="%m/%d/%Y %H:%M:%S", 125 | level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, 126 | ) 127 | logger.warning( 128 | "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s", 129 | training_args.local_rank, 130 | training_args.device, 131 | training_args.n_gpu, 132 | bool(training_args.local_rank != -1), 133 | ) 134 | logger.info("Training/evaluation parameters %s", training_args) 135 | 136 | # Set seed 137 | set_seed(training_args.seed) 138 | # Load the training configuration file 139 | if training_args.training_config_path is not None: 140 | with open(training_args.training_config_path, "r") as f: 141 | train_config = json.load(f) 142 | 143 | # Store training config file in model directory 144 | with open( 145 | os.path.join(training_args.output_dir, "training_configs.json"), "w" 146 | ) as f: 147 | json.dump(train_config, f, indent="\t") 148 | else: 149 | train_config = {} 150 | 151 | if model_args.config_name: 152 | with open(model_args.config_name, "r") as f: 153 | model_params = json.load(f) 154 | 155 | config = AutoConfig.from_pretrained( 156 | model_args.config_name, 157 | cache_dir=model_args.cache_dir, 158 | mem_len=model_params.get("mem_len", 1024), 159 | ) 160 | 161 | elif model_args.model_name_or_path: 162 | if "checkpoint" not in model_args.model_name_or_path: 163 | model_args.model_name_or_path = get_latest_checkpoint( 164 | model_args.model_name_or_path, 165 | must_contain=train_config.get("checkpoint-str", "best"), 166 | ) 167 | 168 | config = AutoConfig.from_pretrained( 169 | model_args.model_name_or_path, 170 | cache_dir=model_args.cache_dir, 171 | ) 172 | model_params = config.__dict__ 173 | 174 | else: 175 | config = CONFIG_MAPPING[model_args.model_type]() 176 | model_params = config.__dict__ 177 | logger.warning("You are instantiating a 
new config instance from scratch.") 178 | 179 | if model_args.tokenizer_name: 180 | tokenizer = ExpressionBertTokenizer.from_pretrained( 181 | model_args.tokenizer_name, cache_dir=model_args.cache_dir 182 | ) 183 | 184 | elif model_args.model_name_or_path: 185 | tokenizer = ExpressionBertTokenizer.from_pretrained( 186 | model_args.model_name_or_path, cache_dir=model_args.cache_dir 187 | ) 188 | else: 189 | raise ValueError( 190 | "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it," 191 | "and load it from here, using --tokenizer_name" 192 | ) 193 | 194 | if model_args.model_name_or_path: 195 | 196 | # Restore checkpoint if available 197 | if "checkpoint" not in model_args.model_name_or_path: 198 | model_args.model_name_or_path = get_latest_checkpoint( 199 | model_args.model_name_or_path, 200 | must_contain=train_config.get("checkpoint-str", "best"), 201 | ) 202 | 203 | model = AutoModelWithLMHead.from_pretrained( 204 | model_args.model_name_or_path, 205 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 206 | config=config, 207 | cache_dir=model_args.cache_dir, 208 | ) 209 | logger.info("Model restored") 210 | 211 | # Get min loss so far 212 | try: 213 | loss_df = pd.read_csv( 214 | os.path.join(model_args.model_name_or_path, "training_log.csv"), 215 | index_col=0, 216 | ) 217 | model_params.update({"training_logs": list(loss_df.T.to_dict().values())}) 218 | logger.info("Restored training loss history.") 219 | except Exception: 220 | logger.warning( 221 | "Could not find loss history, might overwrite good checkpoints." 222 | ) 223 | 224 | else: 225 | logger.info("Training new model from scratch") 226 | model = AutoModelWithLMHead.from_config(config) 227 | 228 | logger.info(f"PyTorch version: {torch.__version__}") 229 | model.resize_token_embeddings(len(tokenizer)) 230 | 231 | if data_args.block_size <= 0: 232 | data_args.block_size = tokenizer.max_len 233 | # Our input block size will be the max possible for the model 234 | else: 235 | data_args.block_size = min(data_args.block_size, tokenizer.max_len) 236 | 237 | # Get datasets 238 | train_dataset = ( 239 | get_dataset( 240 | data_args.train_data_file, 241 | tokenizer=tokenizer, 242 | block_size=data_args.block_size, 243 | ) 244 | if training_args.do_train 245 | else None 246 | ) 247 | eval_dataset = ( 248 | get_dataset( 249 | data_args.eval_data_file, 250 | tokenizer=tokenizer, 251 | block_size=data_args.block_size, 252 | line_by_line=data_args.line_by_line, 253 | ) 254 | if training_args.do_eval 255 | else None 256 | ) 257 | if training_args.do_eval: 258 | logger.info(f"Dataset sizes {len(train_dataset)}, {len(eval_dataset)}.") 259 | num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 260 | logger.info(f"Number of parameters {num_params} of type {type(model)}") 261 | 262 | if config.model_type != "xlnet": 263 | warnings.warn(f"Full functionality only with XLNet; not {config.model_type}") 264 | 265 | # Set up the training strategy (PLM vs. 
alternating tasks) + loss function 266 | if train_config.get("alternate_tasks", False): 267 | logger.info("Training with alternate tasks") 268 | # The main collator is the one for property prediction 269 | data_collator = TRAIN_COLLATORS["property"]( 270 | tokenizer=tokenizer, 271 | property_tokens=train_config["property_tokens"], 272 | num_tokens_to_mask=train_config.get("num_tokens_to_mask", None), 273 | mask_token_order=train_config.get("mask_token_order", None), 274 | ) 275 | alternating_collator = TRAIN_COLLATORS[train_config["cg_collator"]]( 276 | tokenizer=tokenizer, **train_config["cg_collator_params"] 277 | ) 278 | 279 | else: 280 | if train_config["task"] == "proponly": 281 | data_collator = TRAIN_COLLATORS["property"]( 282 | tokenizer=tokenizer, 283 | property_tokens=train_config["property_tokens"], 284 | num_tokens_to_mask=train_config.get("num_tokens_to_mask", None), 285 | mask_token_order=train_config.get("mask_token_order", None), 286 | ) 287 | logger.warning("Training only on property predict") 288 | elif train_config["task"] == "gen_only": 289 | 290 | data_collator = TRAIN_COLLATORS[train_config["cg_collator"]]( 291 | tokenizer=tokenizer, **train_config["cg_collator_params"] 292 | ) 293 | logger.warning("Training ONLY on conditional generation") 294 | 295 | elif train_config["task"] == "plm": 296 | 297 | logger.info("Training with PLM") 298 | # Only vanilla PLM training 299 | data_collator = DataCollatorForPermutationLanguageModeling( 300 | tokenizer=tokenizer, 301 | plm_probability=data_args.plm_probability, 302 | max_span_length=data_args.max_span_length, 303 | ) 304 | alternating_collator = None 305 | 306 | custom_trainer_params = get_trainer_dict(model_params) 307 | 308 | # Initialize our Trainer 309 | trainer = CustomTrainer( 310 | model=model, 311 | args=training_args, 312 | data_collator=data_collator, 313 | train_dataset=train_dataset, 314 | eval_dataset=eval_dataset, 315 | tokenizer=tokenizer, 316 | prediction_loss_only=False, 317 | alternating_collator=alternating_collator, 318 | train_config=train_config, 319 | **custom_trainer_params, 320 | ) 321 | 322 | # Training 323 | if training_args.do_train: 324 | model_path = ( 325 | model_args.model_name_or_path 326 | if model_args.model_name_or_path is not None 327 | and os.path.isdir(model_args.model_name_or_path) 328 | else None 329 | ) 330 | trainer.train(model_path=model_path) 331 | trainer.save_model() 332 | # For convenience, we also re-save the tokenizer to the same directory, 333 | # so that you can share your model easily on huggingface.co/models =) 334 | if trainer.is_world_master(): 335 | tokenizer.save_pretrained(training_args.output_dir) 336 | 337 | # Evaluation 338 | results = {} 339 | if training_args.do_eval: 340 | logger.info("*** Evaluate ***") 341 | 342 | eval_output = trainer.evaluate() 343 | 344 | perplexity = math.exp(eval_output["eval_loss"]) 345 | result = {"perplexity": perplexity} 346 | 347 | output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt") 348 | if trainer.is_world_master(): 349 | with open(output_eval_file, "w") as writer: 350 | logger.info("***** Eval results *****") 351 | for key in sorted(result.keys()): 352 | logger.info(" %s = %s", key, str(result[key])) 353 | writer.write("%s = %s\n" % (key, str(result[key]))) 354 | 355 | results.update(result) 356 | 357 | return results 358 | 359 | 360 | if __name__ == "__main__": 361 | main() 362 | -------------------------------------------------------------------------------- /scripts/run_lm_nlp.py: 
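The training-strategy branches in `run_language_modeling.py` above (and the analogous block in `run_lm_nlp.py` that follows) are driven entirely by the JSON file passed via `--training_config_path`, with `task` taking values such as `proponly`, `gen_only` or `plm` when tasks are not alternated. As a rough orientation, the dictionary below lists the keys read by that logic with placeholder values; it is a hedged sketch, not a copy of any file in `training_configs/`.

```python
# Hypothetical training configuration for the alternating-task objective.
# Keys are those consumed by the scripts; values are placeholders for illustration.
train_config_sketch = {
    "alternate_tasks": True,        # alternate property prediction and conditional generation
    "property_tokens": ["<qed>"],   # property tokens handled by the property collator
    "num_tokens_to_mask": None,     # optional; collator defaults apply when None
    "mask_token_order": None,       # optional; collator defaults apply when None
    "cg_collator": "vanilla_cg",    # or "bimodal_cg", per the collator registry
    "cg_collator_params": {},       # keyword arguments forwarded to the chosen collator
    "checkpoint-str": "best",       # substring used when resolving the latest checkpoint
}
```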
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Language modeling adapted from Huggingface transformers. 4 | 5 | The file is an adaptation of https://github.com/huggingface/transformers/blob/v3.1.0/examples/language-modeling/run_language_modeling.py 6 | """ 7 | 8 | 9 | import json 10 | import logging 11 | import math 12 | import os 13 | import warnings 14 | from dataclasses import dataclass, field 15 | from typing import Optional 16 | 17 | import pandas as pd 18 | import torch 19 | import transformers 20 | from transformers import ( 21 | CONFIG_MAPPING, 22 | MODEL_WITH_LM_HEAD_MAPPING, 23 | AutoConfig, 24 | AutoModelWithLMHead, 25 | DataCollatorForLanguageModeling, 26 | DataCollatorForPermutationLanguageModeling, 27 | HfArgumentParser, 28 | LineByLineTextDataset, 29 | PreTrainedTokenizer, 30 | TextDataset, 31 | XLNetConfig, 32 | XLNetLMHeadModel, 33 | set_seed, 34 | ) 35 | 36 | from terminator.args import CustomTrainingArguments, ModelArguments 37 | from terminator.collators import TRAIN_COLLATORS 38 | from terminator.datasets import get_dataset 39 | from terminator.tokenization import PropertyTokenizerSquare, XLNetRTTokenizer 40 | from terminator.trainer import CustomTrainer, get_trainer_dict 41 | from terminator.utils import get_latest_checkpoint 42 | 43 | logger = logging.getLogger(__name__) 44 | 45 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 46 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 47 | 48 | 49 | @dataclass 50 | class DataTrainingArguments: 51 | """ 52 | Arguments pertaining to what data we are going to input our model for training and eval. 53 | """ 54 | 55 | train_data_file: Optional[str] = field( 56 | default=None, metadata={"help": "The input training data file (a text file)."} 57 | ) 58 | eval_data_file: Optional[str] = field( 59 | default=None, 60 | metadata={ 61 | "help": "An optional input evaluation data file to evaluate the perplexity on (a text file)." 62 | }, 63 | ) 64 | line_by_line: bool = field( 65 | default=False, 66 | metadata={ 67 | "help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences." 68 | }, 69 | ) 70 | 71 | mlm: bool = field( 72 | default=False, 73 | metadata={ 74 | "help": "Train with masked-language modeling loss instead of language modeling." 75 | }, 76 | ) 77 | mlm_probability: float = field( 78 | default=0.15, 79 | metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}, 80 | ) 81 | plm_probability: float = field( 82 | default=1 / 6, 83 | metadata={ 84 | "help": "Ratio of length of a span of masked tokens to surrounding context length for permutation language modeling." 85 | }, 86 | ) 87 | max_span_length: int = field( 88 | default=5, 89 | metadata={ 90 | "help": "Maximum length of a span of masked tokens for permutation language modeling." 91 | }, 92 | ) 93 | 94 | block_size: int = field( 95 | default=-1, 96 | metadata={ 97 | "help": "Optional input sequence length after tokenization." 98 | "The training dataset will be truncated in block of this size for training." 99 | "Default to the model max input length for single sentence inputs (take into account special tokens)." 
100 | }, 101 | ) 102 | overwrite_cache: bool = field( 103 | default=False, 104 | metadata={"help": "Overwrite the cached training and evaluation sets"}, 105 | ) 106 | 107 | 108 | def main(): 109 | # See all possible arguments in src/transformers/training_args.py 110 | # or by passing the --help flag to this script. 111 | # We now keep distinct sets of args, for a cleaner separation of concerns. 112 | 113 | # Switch off comet 114 | os.environ["COMET_MODE"] = "DISABLED" 115 | 116 | parser = HfArgumentParser( 117 | (ModelArguments, DataTrainingArguments, CustomTrainingArguments) 118 | ) 119 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 120 | 121 | if data_args.eval_data_file is None and training_args.do_eval: 122 | raise ValueError( 123 | "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " 124 | "or remove the --do_eval argument." 125 | ) 126 | 127 | if ( 128 | os.path.exists(training_args.output_dir) 129 | and os.listdir(training_args.output_dir) 130 | and training_args.do_train 131 | and not training_args.overwrite_output_dir 132 | ): 133 | raise ValueError( 134 | f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." 135 | ) 136 | 137 | # Setup logging 138 | logging.basicConfig( 139 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 140 | datefmt="%m/%d/%Y %H:%M:%S", 141 | level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, 142 | ) 143 | logger.warning( 144 | "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 145 | training_args.local_rank, 146 | training_args.device, 147 | training_args.n_gpu, 148 | bool(training_args.local_rank != -1), 149 | training_args.fp16, 150 | ) 151 | logger.info("Training/evaluation parameters %s", training_args) 152 | 153 | # Set seed 154 | set_seed(training_args.seed) 155 | 156 | configuration = XLNetConfig("xlnet-base-cased") 157 | config_dict = configuration.to_dict() 158 | 159 | model = XLNetLMHeadModel.from_pretrained( 160 | "xlnet-base-cased", 161 | cache_dir=model_args.cache_dir, 162 | mem_len=1024, 163 | return_dict=True, 164 | ) 165 | 166 | if not os.path.exists(training_args.output_dir): 167 | os.makedirs(training_args.output_dir) 168 | 169 | tokenizer = XLNetRTTokenizer.from_pretrained( 170 | model_args.tokenizer_name, cache_dir=model_args.cache_dir 171 | ) 172 | property_tokenizer = PropertyTokenizerSquare() 173 | tokenizer.set_property_tokenizer(property_tokenizer) 174 | tokenizer.set_vocab() 175 | # Otherwise the freshly added tokens are added as special tokens. 
176 | tokenizer.unique_no_split_tokens = tokenizer.unique_no_split_tokens[:9] 177 | 178 | if model_args.model_name_or_path: 179 | 180 | # Restore checkpoint if available 181 | if "checkpoint" not in model_args.model_name_or_path: 182 | ckpt_path = get_latest_checkpoint( 183 | model_args.model_name_or_path, must_contain="rmse" 184 | ) 185 | else: 186 | ckpt_path = model_args.model_name_or_path 187 | 188 | model = XLNetLMHeadModel.from_pretrained( 189 | ckpt_path, 190 | cache_dir=model_args.cache_dir, 191 | mem_len=1024, 192 | return_dict=True, 193 | ) 194 | logger.info(f"Model restored from {ckpt_path}") 195 | 196 | # Get min loss so far 197 | try: 198 | loss_df = pd.read_csv( 199 | os.path.join(ckpt_path, "training_log.csv"), 200 | index_col=0, 201 | ) 202 | configuration.update({"training_logs": list(loss_df.T.to_dict().values())}) 203 | logger.info("Restored training loss history.") 204 | except Exception: 205 | logger.warning( 206 | "Could not find loss history, might overwrite good checkpoints." 207 | ) 208 | 209 | logger.info(f"PyTorch version: {torch.__version__}") 210 | model.resize_token_embeddings(len(tokenizer)) 211 | 212 | if data_args.block_size <= 0: 213 | data_args.block_size = tokenizer.max_len 214 | # Our input block size will be the max possible for the model 215 | else: 216 | data_args.block_size = min(data_args.block_size, tokenizer.max_len) 217 | 218 | # Get datasets 219 | train_dataset = get_dataset( 220 | data_args.train_data_file, 221 | tokenizer=tokenizer, 222 | block_size=data_args.block_size, 223 | ) 224 | 225 | eval_dataset = get_dataset( 226 | data_args.eval_data_file, 227 | tokenizer=tokenizer, 228 | block_size=data_args.block_size, 229 | line_by_line=data_args.line_by_line, 230 | ) 231 | 232 | logger.info(f"Dataset sizes {len(train_dataset)}, {len(eval_dataset)}.") 233 | num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 234 | logger.info(f"Number of parameters {num_params} of type {type(model)}") 235 | 236 | # Set up the training strategy (PLM vs. 
alternating tasks) + loss function 237 | if training_args.training_config_path is not None: 238 | with open(training_args.training_config_path, "r") as f: 239 | train_config = json.load(f) 240 | 241 | # Store training config file in model directory 242 | with open( 243 | os.path.join(training_args.output_dir, "training_configs.json"), "w" 244 | ) as f: 245 | json.dump(train_config, f, indent="\t") 246 | else: 247 | train_config = {} 248 | 249 | if train_config.get("alternate_tasks", False): 250 | logger.info("Training with alternate tasks") 251 | # The main collator is the one for property prediction 252 | data_collator = TRAIN_COLLATORS["property"]( 253 | tokenizer=tokenizer, 254 | property_tokens=train_config["property_tokens"], 255 | num_tokens_to_mask=train_config.get("num_tokens_to_mask", None), 256 | mask_token_order=train_config.get("mask_token_order", None), 257 | ) 258 | alternating_collator = TRAIN_COLLATORS[train_config["cg_collator"]]( 259 | tokenizer=tokenizer, **train_config["cg_collator_params"] 260 | ) 261 | 262 | else: 263 | logger.info("Training with PLM") 264 | # Only vanilla PLM training 265 | data_collator = DataCollatorForPermutationLanguageModeling( 266 | tokenizer=tokenizer, 267 | plm_probability=data_args.plm_probability, 268 | max_span_length=data_args.max_span_length, 269 | ) 270 | alternating_collator = None 271 | 272 | custom_trainer_params = get_trainer_dict(config_dict) 273 | 274 | # Initialize our Trainer 275 | print("***DATA COLLATOR", data_collator) 276 | print("***ALTERNATING COLLATOR", alternating_collator) 277 | trainer = CustomTrainer( 278 | model=model, 279 | args=training_args, 280 | data_collator=data_collator, 281 | train_dataset=train_dataset, 282 | eval_dataset=eval_dataset, 283 | tokenizer=tokenizer, 284 | prediction_loss_only=False, 285 | alternating_collator=alternating_collator, 286 | train_config=train_config, 287 | **custom_trainer_params, 288 | ) 289 | 290 | # Training 291 | if training_args.do_train: 292 | model_path = ( 293 | model_args.model_name_or_path 294 | if model_args.model_name_or_path is not None 295 | and os.path.isdir(model_args.model_name_or_path) 296 | else None 297 | ) 298 | trainer.train(model_path=model_path) 299 | trainer.save_model() 300 | # For convenience, we also re-save the tokenizer to the same directory, 301 | # so that you can share your model easily on huggingface.co/models =) 302 | if trainer.is_world_master(): 303 | tokenizer.save_pretrained(training_args.output_dir) 304 | 305 | # Evaluation 306 | results = {} 307 | if training_args.do_eval: 308 | logger.info("*** Evaluate ***") 309 | 310 | eval_output = trainer.evaluate() 311 | 312 | perplexity = math.exp(eval_output["eval_loss"]) 313 | result = {"perplexity": perplexity} 314 | 315 | output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt") 316 | if trainer.is_world_master(): 317 | with open(output_eval_file, "w") as writer: 318 | logger.info("***** Eval results *****") 319 | for key in sorted(result.keys()): 320 | logger.info(" %s = %s", key, str(result[key])) 321 | writer.write("%s = %s\n" % (key, str(result[key]))) 322 | 323 | results.update(result) 324 | 325 | return results 326 | 327 | 328 | if __name__ == "__main__": 329 | main() 330 | -------------------------------------------------------------------------------- /scripts/run_regressionhead.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Language modeling adapted from Huggingface transformers. 
4 | """ 5 | import json 6 | import logging 7 | import os 8 | from dataclasses import dataclass, field 9 | from typing import Dict, List, Optional, Tuple, Union 10 | 11 | import numpy as np 12 | import pandas as pd 13 | import torch 14 | import torch.nn as nn 15 | import transformers 16 | from scipy.stats import pearsonr, spearmanr 17 | from selfies import decoder, encoder 18 | from sklearn.metrics import mean_squared_error 19 | from torch.optim import AdamW 20 | from torch.utils.data import DataLoader, Dataset 21 | from tqdm import tqdm 22 | from transformers import ( 23 | CONFIG_MAPPING, 24 | MODEL_WITH_LM_HEAD_MAPPING, 25 | AutoConfig, 26 | AutoModelWithLMHead, 27 | DataCollatorForLanguageModeling, 28 | DataCollatorForPermutationLanguageModeling, 29 | HfArgumentParser, 30 | LineByLineTextDataset, 31 | PreTrainedTokenizer, 32 | TextDataset, 33 | XLNetConfig, 34 | XLNetForSequenceClassification, 35 | XLNetLMHeadModel, 36 | get_linear_schedule_with_warmup, 37 | set_seed, 38 | ) 39 | from transformers.tokenization_utils_base import BatchEncoding 40 | 41 | from terminator.args import CustomTrainingArguments, ModelArguments 42 | from terminator.collators import TRAIN_COLLATORS 43 | from terminator.datasets import get_dataset 44 | from terminator.tokenization import ExpressionBertTokenizer 45 | from terminator.trainer import CustomTrainer, get_trainer_dict 46 | from terminator.utils import get_latest_checkpoint 47 | 48 | transformers.logging.set_verbosity_info() 49 | logger = logging.getLogger(__name__) 50 | # logger.setLevel(level=logging.DEBUG) 51 | 52 | 53 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 54 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 55 | 56 | 57 | @dataclass 58 | class DataTrainingArguments: 59 | """ 60 | Arguments pertaining to what data we are going to input our model for training and eval. 61 | """ 62 | 63 | train_data_file: Optional[str] = field( 64 | default=None, metadata={"help": "The input training data file (a text file)."} 65 | ) 66 | eval_data_file: Optional[str] = field( 67 | default=None, 68 | metadata={ 69 | "help": "An optional input evaluation data file to evaluate the perplexity on (a text file)." 70 | }, 71 | ) 72 | line_by_line: bool = field( 73 | default=False, 74 | metadata={ 75 | "help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences." 76 | }, 77 | ) 78 | 79 | block_size: int = field( 80 | default=-1, 81 | metadata={ 82 | "help": "Optional input sequence length after tokenization." 83 | "The training dataset will be truncated in block of this size for training." 84 | "Default to the model max input length for single sentence inputs (take into account special tokens)." 
85 | }, 86 | ) 87 | overwrite_cache: bool = field( 88 | default=False, 89 | metadata={"help": "Overwrite the cached training and evaluation sets"}, 90 | ) 91 | batch_size: Optional[int] = field(default=16, metadata={"help": "Batch size"}) 92 | 93 | 94 | class XLNetRegressionDataset(Dataset): 95 | def __init__(self, tokenizer, data_path): 96 | 97 | self.tokenizer = tokenizer 98 | 99 | # Lazy data loading 100 | with open(data_path, "r") as f: 101 | self.examples = [line.strip() for line in f.readlines()] 102 | 103 | def __len__(self): 104 | return len(self.examples) 105 | 106 | def __getitem__(self, i): 107 | prop, molecules = self.examples[i].split("|") 108 | label = float(prop.split(">")[-1]) 109 | model_input = self.tokenizer(molecules) 110 | return model_input, label 111 | 112 | 113 | @dataclass 114 | class Collator(DataCollatorForPermutationLanguageModeling): 115 | def finalize(self, batch: torch.Tensor, val: int = 0) -> torch.Tensor: 116 | """Sequence length has to be even for PLM collator, see: 117 | https://github.com/huggingface/transformers/issues/7341 118 | 119 | Args: 120 | batch (torch.Tensor): 2D Tensor (batch_size x seq_len) 121 | val (float): Value to fill with. 122 | 123 | Returns: 124 | torch.Tensor: 2D Tensor (batch_size x seq_len) 125 | """ 126 | if batch.size(1) % 2 != 0: 127 | return torch.cat([batch, torch.ones(batch.size(0), 1).long() * val], axis=1) 128 | return batch.long() 129 | 130 | def attention_mask(self, batch: torch.Tensor, dropout: float = 0.0) -> torch.Tensor: 131 | attention_mask = (~(batch == 0)).to(float) 132 | return attention_mask 133 | 134 | def __call__( 135 | self, examples: List[Tuple[Dict[str, List[int]], float]] 136 | ) -> Dict[str, torch.Tensor]: 137 | device = "cuda" if torch.cuda.is_available() else "cpu" 138 | model_inputs = [e[0]["input_ids"] for e in examples] 139 | inputs = self._tensorize_batch(model_inputs) 140 | inputs = self.finalize(inputs) 141 | 142 | attention_mask = self.attention_mask(inputs) 143 | 144 | labels = torch.Tensor([e[-1] for e in examples]) 145 | return labels.to(device), { 146 | "input_ids": inputs.to(device), 147 | "attention_mask": attention_mask.to(device), 148 | } 149 | 150 | 151 | def main(): 152 | 153 | # Switch off comet 154 | os.environ["COMET_MODE"] = "DISABLED" 155 | 156 | parser = HfArgumentParser( 157 | (ModelArguments, DataTrainingArguments, CustomTrainingArguments) 158 | ) 159 | model_args, data_args, train_args = parser.parse_args_into_dataclasses() 160 | 161 | if ( 162 | os.path.exists(train_args.output_dir) 163 | and os.listdir(train_args.output_dir) 164 | and train_args.do_train 165 | and not train_args.overwrite_output_dir 166 | ): 167 | raise ValueError( 168 | f"Output directory ({train_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." 
169 | ) 170 | 171 | # Setup logging 172 | logging.basicConfig( 173 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 174 | datefmt="%m/%d/%Y %H:%M:%S", 175 | level=logging.INFO if train_args.local_rank in [-1, 0] else logging.WARN, 176 | ) 177 | logger.warning( 178 | "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 179 | train_args.local_rank, 180 | train_args.device, 181 | train_args.n_gpu, 182 | bool(train_args.local_rank != -1), 183 | train_args.fp16, 184 | ) 185 | logger.info("Training/evaluation parameters %s", train_args) 186 | 187 | # Set seed 188 | set_seed(train_args.seed) 189 | 190 | # Load pretrained model and tokenizer 191 | # 192 | # Distributed training: 193 | # The .from_pretrained methods guarantee that only one local process can concurrently 194 | # download model & vocab. 195 | 196 | model = XLNetForSequenceClassification.from_pretrained( 197 | "xlnet-base-cased", 198 | cache_dir=model_args.cache_dir, 199 | mem_len=1024, 200 | return_dict=True, 201 | ) 202 | # Do Regression 203 | model.num_labels = 1 204 | model.logits_proj = nn.Linear(768, 1) 205 | 206 | if not os.path.exists(train_args.output_dir): 207 | os.makedirs(train_args.output_dir) 208 | print(model_args.tokenizer_name) 209 | tokenizer = ExpressionBertTokenizer.from_pretrained(model_args.tokenizer_name) 210 | 211 | logger.info(f"PyTorch version: {torch.__version__}") 212 | model.resize_token_embeddings(len(tokenizer)) 213 | 214 | if data_args.block_size <= 0: 215 | data_args.block_size = tokenizer.max_len 216 | # Our input block size will be the max possible for the model 217 | else: 218 | data_args.block_size = min(data_args.block_size, tokenizer.max_len) 219 | 220 | # Get datasets 221 | device = "cuda" if torch.cuda.is_available() else "cpu" 222 | train_dataset = XLNetRegressionDataset( 223 | tokenizer=tokenizer, data_path=data_args.train_data_file 224 | ) 225 | eval_dataset = XLNetRegressionDataset( 226 | tokenizer=tokenizer, data_path=data_args.eval_data_file 227 | ) 228 | model = model.to(device) 229 | collator = Collator(tokenizer=tokenizer) 230 | logger.info(f"Dataset sizes {len(train_dataset)}, {len(eval_dataset)}.") 231 | num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 232 | total_params = sum(p.numel() for p in model.parameters()) 233 | logger.info( 234 | f"{total_params} parameters, {num_params} trainable. 
Model: {type(model)}" 235 | ) 236 | 237 | train_loader = DataLoader( 238 | train_dataset, 239 | batch_size=data_args.batch_size, 240 | drop_last=True, 241 | shuffle=True, 242 | collate_fn=collator, 243 | ) 244 | eval_loader = DataLoader( 245 | eval_dataset, 246 | batch_size=data_args.batch_size, 247 | drop_last=False, 248 | shuffle=False, 249 | collate_fn=collator, 250 | ) 251 | lr = train_args.learning_rate 252 | 253 | # Set up the optimizer 254 | no_decay = ["bias", "LayerNorm.weight"] 255 | optimizer_grouped_parameters = [ 256 | { 257 | "params": [ 258 | p 259 | for n, p in model.named_parameters() 260 | if not any(nd in n for nd in no_decay) 261 | ], 262 | "weight_decay": 0, 263 | }, 264 | { 265 | "params": [ 266 | p 267 | for n, p in model.named_parameters() 268 | if any(nd in n for nd in no_decay) 269 | ], 270 | "weight_decay": 0.0, 271 | }, 272 | ] 273 | optimizer = AdamW(optimizer_grouped_parameters, lr=lr, betas=(0.9, 0.999), eps=1e-7) 274 | total_steps = len(train_loader) * train_args.num_train_epochs 275 | scheduler = get_linear_schedule_with_warmup( 276 | optimizer, num_warmup_steps=0, num_training_steps=total_steps 277 | ) 278 | best_perf = [100, 0, 0] 279 | epochs = int(train_args.num_train_epochs) 280 | logger.info(f"Batch size: {data_args.batch_size}, LR={lr}") 281 | 282 | output_dir = train_args.output_dir 283 | eval_seqs = [ 284 | eval_dataset.examples[i].split("|")[-1] for i in range(len(eval_dataset)) 285 | ] 286 | 287 | for epoch in range(epochs): 288 | logger.info(f"Starting epoch {epoch}/{epochs}.") 289 | 290 | model.train() 291 | for idx, (labels, inputs) in tqdm( 292 | enumerate(train_loader), total=len(train_loader) 293 | ): 294 | output = model(**inputs, labels=labels) 295 | loss = output.loss 296 | optimizer.zero_grad() 297 | loss.backward() 298 | optimizer.step() 299 | scheduler.step() 300 | 301 | model.eval() 302 | labels, predictions = [], [] 303 | with torch.no_grad(): 304 | for idx, (labs, inputs) in enumerate(eval_loader): 305 | output = model(**inputs, labels=labs) 306 | prediction = output.logits.cpu().detach().squeeze().numpy() 307 | 308 | labels.extend(list(labs.cpu().detach().numpy())) 309 | predictions.extend(list(prediction)) 310 | 311 | rmse = np.sqrt(mean_squared_error(predictions, labels)) 312 | pearson = pearsonr(predictions, labels)[0] 313 | spearman = spearmanr(predictions, labels)[0] 314 | 315 | logger.info( 316 | f"Epoch {epoch}: RMSE:{rmse:.8f}, pearson:{pearson:.3f}, spearman:{spearman:.3f}" 317 | ) 318 | if pearson > best_perf[1]: 319 | best_perf[1] = pearson 320 | logger.info(f"New best Pearson: {pearson}") 321 | with open( 322 | os.path.join(output_dir, "best_eval_perf_pearson.json"), "w" 323 | ) as f: 324 | json.dump( 325 | { 326 | "RMSE": str(best_perf[0]), 327 | "Pearson": str(best_perf[1]), 328 | "Spearman": str(best_perf[2]), 329 | "Epoch": str(epoch), 330 | }, 331 | f, 332 | indent=4, 333 | ) 334 | pd.DataFrame( 335 | { 336 | "sequence": eval_seqs, 337 | "predictions": list(predictions), 338 | "labels": list(labels), 339 | } 340 | ).to_csv(os.path.join(output_dir, "best_pearson_preds.csv")) 341 | if rmse < best_perf[0]: 342 | best_perf[0] = rmse 343 | logger.info(f"New best RMSE: {rmse}") 344 | torch.save(model.state_dict(), os.path.join(output_dir, "rmse.bin")) 345 | with open(os.path.join(output_dir, "best_eval_perf_rmse.json"), "w") as f: 346 | json.dump( 347 | { 348 | "RMSE": str(best_perf[0]), 349 | "Pearson": str(best_perf[1]), 350 | "Spearman": str(best_perf[2]), 351 | "Epoch": epoch, 352 | }, 353 | f, 354 | indent=4, 355 | 
) 356 | pd.DataFrame( 357 | { 358 | "sequence": eval_seqs, 359 | "predictions": list(predictions), 360 | "labels": list(labels), 361 | } 362 | ).to_csv(os.path.join(output_dir, "best_rmse_preds.csv")) 363 | if spearman > best_perf[2]: 364 | best_perf[2] = spearman 365 | logger.info(f"New best Spearman: {spearman}") 366 | torch.save(model.state_dict(), os.path.join(output_dir, "spearman.bin")) 367 | with open( 368 | os.path.join(output_dir, "best_eval_perf_spearman.json"), "w" 369 | ) as f: 370 | json.dump( 371 | { 372 | "RMSE": str(best_perf[0]), 373 | "Pearson": str(best_perf[1]), 374 | "Spearman": str(best_perf[2]), 375 | "Epoch": epoch, 376 | }, 377 | f, 378 | indent=4, 379 | ) 380 | pd.DataFrame( 381 | { 382 | "sequence": eval_seqs, 383 | "predictions": list(predictions), 384 | "labels": list(labels), 385 | } 386 | ).to_csv(os.path.join(output_dir, "best_spearman_preds.csv")) 387 | 388 | 389 | if __name__ == "__main__": 390 | main() 391 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 80 3 | select = C,E,F,W,B,B950 4 | ignore = E203, E501, W503 5 | 6 | [mypy] 7 | check_untyped_defs = True 8 | 9 | [mypy-pytest.*] 10 | ignore_missing_imports = True 11 | 12 | [mypy-rdkit.*] 13 | ignore_missing_imports = True 14 | 15 | [mypy-setuptools.*] 16 | ignore_missing_imports = True 17 | 18 | [mypy-transformers.*] 19 | ignore_missing_imports = True 20 | 21 | [mypy-numpy.*] 22 | ignore_missing_imports = True 23 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """Package setup.""" 2 | import io 3 | import re 4 | 5 | from setuptools import find_packages, setup 6 | 7 | match = re.search( 8 | r'__version__\s*=\s*[\'"]([^\'"]*)[\'"]', 9 | io.open("terminator/__init__.py", encoding="utf_8_sig").read(), 10 | ) 11 | if match is None: 12 | raise SystemExit("Version number not found.") 13 | __version__ = match.group(1) 14 | 15 | setup( 16 | name="terminator", 17 | version=__version__, 18 | author="IBM Resarch team", 19 | author_email=["jannis.born@gmx.de, drugilsberg@gmail.com"], 20 | packages=find_packages(), 21 | long_description=open("README.md").read(), 22 | long_description_content_type="text/markdown", 23 | package_data={"terminator": ["py.typed"]}, 24 | install_requires=["transformers", "numpy", "tqdm", "selfies==1.0.4", "modlamp"], 25 | ) 26 | -------------------------------------------------------------------------------- /terminator/__init__.py: -------------------------------------------------------------------------------- 1 | """Utiltities for transformer-based conditional molecule generation.""" 2 | __version__ = "0.0.1" 3 | __name__ = "terminator" 4 | -------------------------------------------------------------------------------- /terminator/args.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | from transformers import MODEL_WITH_LM_HEAD_MAPPING 5 | from transformers.training_args import TrainingArguments 6 | 7 | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) 8 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 9 | 10 | 11 | @dataclass 12 | class CustomTrainingArguments(TrainingArguments): 13 | """ 14 | NOTE: Expanding TrainingArguments class from 
transformers with custom arguments. 15 | 16 | eval_accumulation_steps (:obj:`int`, `optional`): 17 | Number of predictions steps to accumulate the output tensors for, before moving the results to the CPU. If 18 | left unset, the whole predictions are accumulated on GPU/TPU before being moved to the CPU (faster but 19 | requires more memory). 20 | """ 21 | 22 | # Was introduced only in transformers 3.4.0 23 | eval_accumulation_steps: Optional[int] = field( 24 | default=None, 25 | metadata={ 26 | "help": "Number of predictions steps to accumulate before moving the tensors to the CPU." 27 | }, 28 | ) 29 | training_config_path: Optional[str] = field( 30 | default=None, 31 | metadata={ 32 | "help": """ 33 | Path to a file specifying the training objective hyperparameter. 34 | 35 | Defaults to None, meaning the vanilla PLM objective is used. 36 | 37 | 38 | Optional keys include: 39 | - 'alternate_tasks' (bool): Whether the model is trained specifically on 40 | property prediction and conditional generation task or not. 41 | NOTE: If False, then all other keys are ignored and we fall back to the 42 | PLM objective (identical to not providing a path). Default: False. 43 | - 'cc_loss' (bool): Whether the model is trained with the cycle-consistency 44 | loss in the CG task or with a regular BCE between logits of generated 45 | tokens and the real molecule. Default: False. 46 | - 'cg_collator' (str): Name of collator to use for conditional generation. 47 | Should be either `vanilla_cg` or `bimodal_cg`. 48 | - 'generation_token' (str): Token which should be masked for CC loss. Only 49 | required if cc_loss is True. 50 | 51 | - 'cg_collator_params' (dict): Parameters to pass to the collator. Keys e.g. 52 | 'do_sample' (bool): Whether property is sampled. 53 | 'property_value_ranges' (Iterable[float]): 54 | 'property_value_thresholds' (Iterable[float]): 55 | 'prob_near_sampling' (float): Probability of sampling nearby values. 56 | """ 57 | }, 58 | ) 59 | 60 | 61 | @dataclass 62 | class ModelArguments: 63 | """ 64 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 65 | """ 66 | 67 | model_name_or_path: Optional[str] = field( 68 | default=None, 69 | metadata={ 70 | "help": "The model checkpoint for weights initialization. Leave None if you want to train a model from scratch." 71 | }, 72 | ) 73 | model_type: Optional[str] = field( 74 | default=None, 75 | metadata={ 76 | "help": "If training from scratch, pass a model type from the list: " 77 | + ", ".join(MODEL_TYPES) 78 | }, 79 | ) 80 | config_name: Optional[str] = field( 81 | default=None, 82 | metadata={ 83 | "help": "Pretrained config name or path if not the same as model_name" 84 | }, 85 | ) 86 | tokenizer_name: Optional[str] = field( 87 | default=None, 88 | metadata={ 89 | "help": "Pretrained tokenizer name or path if not the same as model_name" 90 | }, 91 | ) 92 | cache_dir: Optional[str] = field( 93 | default=None, 94 | metadata={ 95 | "help": "Where do you want to store the pretrained models downloaded from s3" 96 | }, 97 | ) 98 | 99 | 100 | @dataclass 101 | class EvalArguments: 102 | """ 103 | Argumnts for model evaluation. 104 | 105 | eval_accumulation_steps (:obj:`int`, `optional`): 106 | Number of predictions steps to accumulate the output tensors for, before moving the results to the CPU. If 107 | left unset, the whole predictions are accumulated on GPU/TPU before being moved to the CPU (faster but 108 | requires more memory). 
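# Hedged illustrative aside: the `training_config_path` help text above lists the
# recognised keys. A sketch of what such a JSON config might contain; the key
# names come from that docstring, the values are made-up placeholders (the files
# shipped under training_configs/ may differ):
EXAMPLE_TRAINING_CONFIG = {
    "alternate_tasks": True,       # alternate property prediction and generation
    "cc_loss": True,               # use the cycle-consistency loss for generation
    "cg_collator": "vanilla_cg",   # or "bimodal_cg"
    "generation_token": "[MASK]",  # placeholder; only needed when cc_loss is True
    "cg_collator_params": {
        "do_sample": False,
        "property_value_ranges": [0.0, 1.0],
        "prob_near_sampling": 0.5,
    },
}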
109 | """ 110 | 111 | eval_file: str = field(metadata={"help": "Path to the data used for evaluation"}) 112 | param_path: str = field( 113 | metadata={"help": "Path to the .json file with evaluation parameter"} 114 | ) 115 | -------------------------------------------------------------------------------- /terminator/collator_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | import transformers 5 | 6 | 7 | def get_mask( 8 | labels: torch.Tensor, 9 | max_span_length: int, 10 | plm_probability: float, 11 | mask_start_idxs: Optional[torch.Tensor] = None, 12 | mask_end_idxs: Optional[torch.Tensor] = None, 13 | ) -> (torch.Tensor, torch.Tensor): 14 | """Receives a tensor of labels and computes the masked_indices and the target 15 | mapping. 16 | 17 | Args: 18 | labels (torch.Tensor): Input tensor (2D) 19 | max_span_length (int): Maximal length for the span of masked tokens 20 | plm_probability (float): Probability for each token to be masked. 21 | mask_start_idxs (torch.Tensor, Optional): Tensor of length labels with indices 22 | for first possible token to mask. 23 | mask_end_idxs (torch.Tensor, Optional): Tensor of length labels with indices 24 | for last possible token to mask. 25 | 26 | Returns: 27 | masked_indices: 2D Tensor of masked indices. 28 | target_mapping: 3D tensor of diagonal matrices for each sample. 29 | """ 30 | 31 | # Creating the mask and target_mapping tensors 32 | masked_indices = torch.full(labels.shape, 0, dtype=torch.bool) 33 | target_mapping = torch.zeros( 34 | (labels.size(0), labels.size(1), labels.size(1)), dtype=torch.float32 35 | ) 36 | # If on-/offset for masking are not provided we can mask from start to end 37 | if mask_start_idxs is None: 38 | mask_start_idxs = [0] * labels.size(0) 39 | if mask_end_idxs is None: 40 | mask_end_idxs = [1 * labels.size(1)] * labels.size(0) 41 | 42 | for i in range(labels.size(0)): 43 | # Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far). 44 | cur_len = mask_start_idxs[i] 45 | max_len = mask_end_idxs[i] 46 | 47 | # If the masking range is just a single token, we always mask it 48 | if cur_len == max_len: 49 | masked_indices[i, cur_len] = 1 50 | 51 | while cur_len < max_len: 52 | # Sample (length of span of tokens to be masked), take the minimum to avoid 53 | # that the span length is longer than the molecule length 54 | span_length = min( 55 | torch.randint(1, max_span_length + 1, (1,)).item(), max_len - cur_len 56 | ) 57 | # Reserve a context of length `context_length = span_length / plm_probability` to surround the span to be masked 58 | context_length = int(span_length / plm_probability) 59 | # Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length` 60 | # the min is needed to avoid that the span extends over max_len 61 | # the max is needed to avoid that the span starts before cur_len 62 | start_index = max( 63 | min( 64 | cur_len 65 | + torch.randint(context_length - span_length + 1, (1,)).item(), 66 | max_len - span_length, 67 | ), 68 | cur_len, 69 | ) 70 | masked_indices[i, start_index : start_index + span_length] = 1 71 | # Set `cur_len = cur_len + context_length` 72 | cur_len += context_length 73 | 74 | # Since we're replacing non-masked tokens with -100 in the labels tensor instead of skipping them altogether, 75 | # the i-th predict corresponds to the i-th token. 
76 | target_mapping[i] = torch.eye(labels.size(1)) 77 | 78 | return masked_indices, target_mapping 79 | 80 | 81 | def get_permutation_order( 82 | labels: torch.Tensor, 83 | masked_indices: torch.Tensor, 84 | non_func_mask: torch.Tensor, 85 | device: str = "cpu", 86 | ) -> torch.Tensor: 87 | 88 | perm_mask = torch.zeros( 89 | (labels.size(0), labels.size(1), labels.size(1)), 90 | dtype=torch.float32, 91 | device=device, 92 | ) 93 | 94 | for i in range(labels.size(0)): 95 | # Generate permutation indices i.e. sample a random factorisation order for the sequence. This will 96 | # determine which tokens a given token can attend to (encoded in `perm_mask`). 97 | # Note: Length of token sequence being permuted has to be less than or equal to reused sequence length 98 | # (see documentation for `mems`), otherwise information may leak through due to reuse. In this implementation, 99 | # we assume that reused length is half of sequence length and permutation length is equal to reused length. 100 | # This requires that the sequence length be even. 101 | 102 | # Create a linear factorisation order 103 | perm_index = torch.arange(labels.size(1), device=device) 104 | # Split this into two halves, assuming that half the sequence is reused each time 105 | perm_index = perm_index.reshape((-1, labels.size(1) // 2)).transpose(0, 1) 106 | # Permute the two halves such that they do not cross over 107 | perm_index = perm_index[torch.randperm(labels.size(1) // 2)] 108 | # Flatten this out into the desired permuted factorisation order 109 | perm_index = torch.flatten(perm_index.transpose(0, 1)) 110 | # Set the permutation indices of non-masked (non-functional) tokens to the 111 | # smallest index (-1) so that: 112 | # (1) They can be seen by all other positions 113 | # (2) They cannot see masked positions, so there won't be information leak 114 | perm_index.masked_fill_(~masked_indices[i] & non_func_mask[i], -1) 115 | # The logic for whether the i-th token can attend on the j-th token based on the factorisation order: 116 | # 0 (can attend): If perm_index[i] > perm_index[j] or j is neither masked nor a functional token 117 | # 1 (cannot attend): If perm_index[i] <= perm_index[j] and j is either masked or a functional token 118 | perm_mask[i] = ( 119 | perm_index.reshape((labels.size(1), 1)) 120 | <= perm_index.reshape((1, labels.size(1))) 121 | ) & masked_indices[i] 122 | 123 | return perm_mask 124 | -------------------------------------------------------------------------------- /terminator/datasets.py: -------------------------------------------------------------------------------- 1 | from transformers import LineByLineTextDataset, PreTrainedTokenizer, TextDataset 2 | 3 | 4 | def get_dataset( 5 | filepath: str, 6 | tokenizer: PreTrainedTokenizer, 7 | block_size: int, 8 | line_by_line: bool = True, 9 | ): 10 | if line_by_line: 11 | return LineByLineTextDataset( 12 | tokenizer=tokenizer, file_path=filepath, block_size=block_size 13 | ) 14 | else: 15 | return TextDataset( 16 | tokenizer=tokenizer, 17 | file_path=filepath, 18 | block_size=block_size, 19 | ) 20 | -------------------------------------------------------------------------------- /terminator/factories.py: -------------------------------------------------------------------------------- 1 | from .numerical_encodings import FloatEncoding, IntEncoding 2 | 3 | NUM_ENCODING_FACTORY = {"float": FloatEncoding, "int": IntEncoding} 4 | 5 | MODEL_TO_EMBEDDING_FN = { 6 | "albert": "model.albert.embeddings", 7 | "xlnet": "self.model.transformer.word_embedding", 8 
| } 9 | -------------------------------------------------------------------------------- /terminator/functional_groups.py: -------------------------------------------------------------------------------- 1 | # 2 | # Original authors: Richard Hall and Guillaume Godin 3 | # This file is part of the RDKit. 4 | # The contents are covered by the terms of the BSD license 5 | # which is included in the file license.txt, found at the root 6 | # of the RDKit source tree. 7 | 8 | from collections import namedtuple 9 | 10 | # 11 | # 12 | # Richard hall 2017 13 | # IFG main code 14 | # Guillaume Godin 2017 15 | # refine output function 16 | # astex_ifg: identify functional groups a la Ertl, J. Cheminform (2017) 9:36 17 | from rdkit import Chem 18 | 19 | 20 | def merge(mol, marked, aset): 21 | bset = set() 22 | for idx in aset: 23 | atom = mol.GetAtomWithIdx(idx) 24 | for nbr in atom.GetNeighbors(): 25 | jdx = nbr.GetIdx() 26 | if jdx in marked: 27 | marked.remove(jdx) 28 | bset.add(jdx) 29 | if not bset: 30 | return 31 | merge(mol, marked, bset) 32 | aset.update(bset) 33 | 34 | 35 | # atoms connected by non-aromatic double or triple bond to any heteroatom 36 | # c=O should not match (see fig1, box 15). I think using A instead of * should sort that out? 37 | PATT_DOUBLE_TRIPLE = Chem.MolFromSmarts("A=,#[!#6]") 38 | # atoms in non aromatic carbon-carbon double or triple bonds 39 | PATT_CC_DOUBLE_TRIPLE = Chem.MolFromSmarts("C=,#C") 40 | # acetal carbons, i.e. sp3 carbons connected to tow or more oxygens, nitrogens or sulfurs; these O, N or S atoms must have only single bonds 41 | PATT_ACETAL = Chem.MolFromSmarts("[CX4](-[O,N,S])-[O,N,S]") 42 | # all atoms in oxirane, aziridine and thiirane rings 43 | PATT_OXIRANE_ETC = Chem.MolFromSmarts("[O,N,S]1CC1") 44 | 45 | PATT_TUPLE = (PATT_DOUBLE_TRIPLE, PATT_CC_DOUBLE_TRIPLE, PATT_ACETAL, PATT_OXIRANE_ETC) 46 | 47 | 48 | def identify_functional_groups(mol): 49 | marked = set() 50 | # mark all heteroatoms in a molecule, including halogens 51 | for atom in mol.GetAtoms(): 52 | if atom.GetAtomicNum() not in (6, 1): # would we ever have hydrogen? 53 | marked.add(atom.GetIdx()) 54 | 55 | # mark the four specific types of carbon atom 56 | for patt in PATT_TUPLE: 57 | for path in mol.GetSubstructMatches(patt): 58 | for atomindex in path: 59 | marked.add(atomindex) 60 | 61 | # merge all connected marked atoms to a single FG 62 | groups = [] 63 | while marked: 64 | grp = set([marked.pop()]) 65 | merge(mol, marked, grp) 66 | groups.append(grp) 67 | 68 | # extract also connected unmarked carbon atoms 69 | ifg = namedtuple("IFG", ["atomIds", "atoms", "type"]) 70 | ifgs = [] 71 | for g in groups: 72 | uca = set() 73 | for atomidx in g: 74 | for n in mol.GetAtomWithIdx(atomidx).GetNeighbors(): 75 | if n.GetAtomicNum() == 6: 76 | uca.add(n.GetIdx()) 77 | ifgs.append( 78 | ifg( 79 | atomIds=tuple(list(g)), 80 | atoms=Chem.MolFragmentToSmiles(mol, g, canonical=True), 81 | type=Chem.MolFragmentToSmiles(mol, g.union(uca), canonical=True), 82 | ) 83 | ) 84 | return ifgs 85 | -------------------------------------------------------------------------------- /terminator/nlp.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy as np 4 | from transformers import XLNetTokenizer 5 | 6 | 7 | def parse_humicroedit( 8 | dataset, expression_separator: str = "{", expression_end: str = "}" 9 | ) -> List[str]: 10 | """ 11 | Parse the humicrocredit dataset in an appropriate format. 
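# Hedged illustrative aside (usage sketch only): `identify_functional_groups`
# from terminator/functional_groups.py marks heteroatoms and special carbons,
# merges connected marked atoms and returns IFG namedtuples (Ertl-style
# functional groups). Acetic acid is an arbitrary example input.
from rdkit import Chem
from terminator.functional_groups import identify_functional_groups

mol = Chem.MolFromSmiles("CC(=O)O")  # acetic acid
for group in identify_functional_groups(mol):
    # each entry exposes .atomIds, .atoms (bare FG) and .type (FG with carbon context)
    print(group.atomIds, group.atoms, group.type)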
12 | - token separating numbers from text: { 13 | - oken separating text items: } 14 | 15 | Args: 16 | dataset: The respective chunkk of the humicroedit dataset loaded via Huggingface. 17 | 18 | Raises: 19 | ValueError: If the joke cant be extracted uniquely 20 | 21 | Returns: 22 | _description_ 23 | """ 24 | 25 | lines = [] 26 | for sample in dataset: 27 | prop = "[funny]" + str(round(float(sample["meanGrade"]), 1)) 28 | text = sample["original"] 29 | if text.count("<") > 1 or text.count("/>") > 1: 30 | raise ValueError(text) 31 | if "{" in text or "}" in text: 32 | print(text) 33 | text = text.replace("<", "START ").replace("/>", " END") 34 | 35 | line = prop + expression_separator + sample["edit"] + expression_end + text 36 | lines.append(line) 37 | return lines 38 | 39 | 40 | def compute_topk(predictions: np.array) -> List[float]: 41 | """ 42 | Computes the topk accuracy of a boolean np array 43 | 44 | Args: 45 | predictions: boolean np.array of shape batch_size x k with correctness of each 46 | prediction 47 | 48 | Returns: 49 | List of floats denoting the top-k accuracies 50 | """ 51 | 52 | topk = [np.mean(predictions[:, 0])] 53 | for k in range(1, predictions.shape[1]): 54 | topk.append(topk[-1] + np.mean(predictions[:, k])) 55 | return topk 56 | -------------------------------------------------------------------------------- /terminator/numerical_encodings.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | import warnings 3 | from math import cos, inf, sin 4 | from typing import Dict, Optional 5 | 6 | import torch 7 | import torch.nn as nn 8 | import transformers 9 | from torch import Tensor 10 | 11 | from .utils import get_device 12 | 13 | 14 | def get_float_encoding( 15 | token: str, embedding_size: int, vmax: float = 1.0 16 | ) -> torch.Tensor: 17 | """Convert a token representing a float into a _fixed_ embedding vector. 18 | NOTE: This can be used for *any* range of numbers > 0. 19 | 20 | Args: 21 | token (str): A token representing a float. NOTE: Needs to follow notation 22 | _8_-1_ to represent 0.8 or _5_-2_ to represent 0.05. 23 | embedding_size (int): Size of the embedding. 24 | vmax (int, optional): Maximal value of float, defaults to 1. Normalizes 25 | values to be in the range ~ [-10, 10]. 26 | NOTE: If remaining nn.embeddings in model use `max_norm`, this might result 27 | in large range discrepancies. 28 | 29 | Returns: 30 | torch.Tensor: Tensor of length embedding_size containing the embedding. 31 | """ 32 | if embedding_size % 2 != 0: 33 | raise ValueError("Embedding size cant be odd.") 34 | 35 | vals = torch.zeros((embedding_size,)) 36 | if len(token) == 1 or not ( 37 | token.startswith("_") and token.endswith("_") and token.count("_") == 3 38 | ): 39 | return vals 40 | else: 41 | digit = int(token[1]) 42 | order = int(token.split("_")[-2]) 43 | val = digit * 10**order 44 | 45 | for i in range(0, embedding_size, 2): 46 | vals[i] = val / (i + 1) 47 | vals[i + 1] = -val / (i + 1) 48 | 49 | return vals / (vmax / 10) 50 | 51 | 52 | def get_full_float_encoding( 53 | value: float, embedding_size: int, vmax: float = 1.0 54 | ) -> Tensor: 55 | """ 56 | Convert a float value into a _fixed_ embedding vector. 57 | 58 | Args: 59 | value: The float value to be encoded. 60 | embedding_size: The size of the embedding. 61 | vmax: Maximal value the `value` variable can take. This normalizes values 62 | to be in the range ~ [-10, 10]. 
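# Hedged illustrative aside: the float-token convention used by
# `get_float_encoding` above encodes the digit 8 at decimal position -1 as
# `_8_-1_`, i.e. the value 0.8. With embedding_size=4 and vmax=1.0 the function
# fills alternating +/- entries val/(i+1) and rescales by vmax/10, giving
# roughly [8.0, -8.0, 2.67, -2.67].
from terminator.numerical_encodings import get_float_encoding

embedding = get_float_encoding("_8_-1_", embedding_size=4, vmax=1.0)
print(embedding)  # tensor([ 8.0000, -8.0000,  2.6667, -2.6667])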
NOTE: If remaining nn.embeddings in 63 | model use `max_norm`, this might result in large range discrepancies. 64 | 65 | Returns: 66 | torch.Tensor of shape (embedding_size, ) containing the embedding. 67 | """ 68 | if embedding_size % 2 != 0: 69 | raise ValueError(f"Embedding size {embedding_size} cant be odd.") 70 | integer = int(value) 71 | decimal = value - integer 72 | scalar = integer * 10**decimal 73 | embedding = torch.zeros((embedding_size,)) 74 | for i in range(0, embedding_size, 2): 75 | embedding[i] = scalar / (i + 1) 76 | embedding[i + 1] = -scalar / (i + 1) 77 | return embedding 78 | 79 | 80 | def get_int_encoding(token: str, embedding_size: int) -> torch.Tensor: 81 | """Convert a token representing an integer into a _fixed_ embedding vector. 82 | NOTE: This can be used only for positive integers - the generation of the 83 | encodings is *identical* to positional encodings. 84 | 85 | Args: 86 | token (str): A token representing an integer. NOTE: Needs to follow notation 87 | _8_2_ to represent 80 or _5_1_ to represent 5. 88 | embedding_size (int): Size of the embedding. 89 | 90 | Returns: 91 | torch.Tensor: Tensor of length embedding_size containing the embedding. 92 | """ 93 | ed = embedding_size 94 | vals = torch.zeros((ed,)) 95 | 96 | if len(token) == 1 or not ( 97 | token.startswith("_") and token.endswith("_") and token.count("_") == 3 98 | ): 99 | return vals 100 | else: 101 | digit = int(token[1]) 102 | order = int(token.split("_")[-2]) 103 | val = digit * 10**order 104 | 105 | if order < 0: 106 | raise ValueError( 107 | f"Found float encoding in {token}. Pass positive ints only." 108 | ) 109 | 110 | sine = lambda p, i: sin(p / (10000.0 ** (2 * i / ed))) 111 | cose = lambda p, i: cos(p / (10000.0 ** (2 * i / ed))) 112 | for i in range(0, ed, 2): 113 | vals[i] = sine(val, i) 114 | vals[i + 1] = cose(val, i) 115 | return vals 116 | 117 | 118 | class FloatEncoding(nn.Embedding): 119 | """ 120 | A nn.Embedding inspired class to generate fixed embedding vectors that represent 121 | numbers passed as tokens. 122 | NOTE: Tokens representing numbers need to follow notation _8_-1_ to represent 0.8. 123 | """ 124 | 125 | def __init__( 126 | self, 127 | num_embeddings: int, 128 | embedding_dim: int, 129 | vocab: Dict, 130 | vmax: Optional[float] = None, 131 | *args, 132 | **kwargs, 133 | ) -> None: 134 | """ 135 | Constructor for FloatEmbedding; sets up the fixed embedding matrix. 136 | 137 | Args: 138 | num_embeddings (int): size of the dictionary of embeddings. 139 | embedding_dim (int): the size of each embedding vector 140 | vocab (Dict): the language dictionary with tokens as keys and indexes as 141 | values. Length needs to match num_embeddings 142 | vmax (Optional[float]): Maximal value of float, defaults to None. 143 | 144 | Raises: 145 | ValueError: if num_embeddings does not match len(vocab). 146 | TypeError: if neither None nor a number is passed as vmax 147 | ValueError: if vmax is negative. 
148 | """ 149 | 150 | super(FloatEncoding, self).__init__( 151 | num_embeddings, embedding_dim, *args, **kwargs 152 | ) 153 | 154 | if not len(vocab) == num_embeddings: 155 | raise ValueError( 156 | f"num_embeddings needs to match size of vocabulary ({num_embeddings}!={len(vocab)})" 157 | ) 158 | if not (vmax is None or isinstance(3, numbers.Number)): 159 | raise TypeError(f"vmax needs to be a number or None, not {vmax}.") 160 | 161 | if vmax is None: 162 | # Infer the highest number in the dictionary (for normalization) 163 | test = lambda t: len(t) == 1 or not ( 164 | t.startswith("_") and t.endswith("_") and t.count("_") == 3 165 | ) 166 | vmax = max( 167 | [ 168 | -inf 169 | if test(token) 170 | else int(token[1]) * 10 ** int(token.split("_")[-2]) 171 | for token in vocab.keys() 172 | ] 173 | ) 174 | warnings.warn( 175 | f"The inferred maximum float ({vmax}) is used for normalizing all float embeddings" 176 | " which might result in diminishing embeddings." 177 | ) 178 | 179 | if vmax < 0: 180 | raise ValueError(f"Can not work only with negative numbers (vmax = {vmax})") 181 | 182 | weights = torch.zeros(num_embeddings, embedding_dim) 183 | for idx, (token, index) in enumerate(vocab.items()): 184 | assert ( 185 | idx == index 186 | ), "Please sort vocab indexes in ascending order starting from 0" 187 | weights[idx, :] = get_float_encoding(token, embedding_dim, vmax) 188 | weights = weights.to(device=get_device()) 189 | self.embedding = nn.Embedding.from_pretrained(weights, freeze=True) 190 | self.vocab = vocab 191 | 192 | def forward(self, x: Tensor) -> Tensor: 193 | return self.embedding(x) 194 | 195 | 196 | class IntEncoding(nn.Embedding): 197 | """ 198 | A nn.Embedding inspired class to generate fixed embedding vectors that represent 199 | positive integers passed as tokens. 200 | NOTE: Tokens representing numbers need to follow notation _8_2_ to represent 80. 201 | """ 202 | 203 | def __init__( 204 | self, num_embeddings: int, embedding_dim: int, vocab: Dict, *args, **kwargs 205 | ) -> None: 206 | """ 207 | Constructor for FloatEmbedding; sets up the fixed embedding matrix. 208 | 209 | Args: 210 | num_embeddings (int): size of the dictionary of embeddings. 211 | embedding_dim (int): the size of each embedding vector 212 | vocab (Dict): the language dictionary with tokens as keys and indexes as 213 | values. Length needs to match num_embeddings 214 | 215 | Raises: 216 | ValueError: if num_embeddings does not match len(vocab). 217 | TypeError: if neither None nor a number is passed as vmax 218 | ValueError: if vmax is negative. 
219 | """ 220 | 221 | if "vmax" in kwargs.keys(): 222 | kwargs.pop("vmax") 223 | 224 | super(IntEncoding, self).__init__( 225 | num_embeddings, embedding_dim, *args, **kwargs 226 | ) 227 | 228 | if not len(vocab) == num_embeddings: 229 | raise ValueError( 230 | f"num_embeddings needs to match size of vocabulary ({num_embeddings}!={len(vocab)})" 231 | ) 232 | 233 | weights = torch.zeros(num_embeddings, embedding_dim) 234 | for idx, (token, index) in enumerate(vocab.items()): 235 | assert ( 236 | idx == index 237 | ), "Please sort vocab indexes in ascending order starting from 0" 238 | weights[idx, :] = get_int_encoding(token, embedding_dim) 239 | 240 | weights = weights.to(device=get_device()) 241 | self.embedding = nn.Embedding.from_pretrained(weights, freeze=True) 242 | self.vocab = vocab 243 | 244 | def forward(self, x: Tensor) -> Tensor: 245 | return self.embedding(x) 246 | -------------------------------------------------------------------------------- /terminator/property_predictors.py: -------------------------------------------------------------------------------- 1 | """Factory of property predictors based on strings""" 2 | from rdkit import Chem 3 | from modlamp.descriptors import GlobalDescriptor 4 | from rdkit.Chem.QED import qed 5 | 6 | 7 | def predict_qed(smiles: str) -> float: 8 | try: 9 | q = qed(Chem.MolFromSmiles(smiles, sanitize=False)) 10 | return q, {"qed": q} 11 | except Exception: 12 | return -1, {"qed": -1} 13 | 14 | 15 | def boman_index(sequence: str) -> float: 16 | """Calculate the Boman index of a protein. 17 | The Boman index is a measure of protein interactions (potential to bind to 18 | membranes or others proteins). It's the average solubility for all residues 19 | in the sequence. Above 2.48 is considered high binding potential. 20 | 21 | For details see: 22 | Boman, H. G. "Antibacterial peptides: basic facts and emerging concepts." 23 | Journal of internal medicine 254.3 (2003): 197-215. 24 | 25 | Args: 26 | sequence (str): An AA sequence 27 | 28 | Returns: 29 | float: The boman index. 30 | """ 31 | try: 32 | sequence = sequence.strip().upper() 33 | desc = GlobalDescriptor(sequence) 34 | desc.boman_index() 35 | b = float(desc.descriptor) 36 | return b, {"boman": b} 37 | except Exception: 38 | return -100, {"boman": -100} 39 | 40 | 41 | PREDICT_FACTORY = {"qed": predict_qed, "boman": boman_index} 42 | -------------------------------------------------------------------------------- /terminator/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/regression-transformer/6820b45e4548ee2648557fcf120b3361414a8cd9/terminator/py.typed -------------------------------------------------------------------------------- /terminator/search.py: -------------------------------------------------------------------------------- 1 | """Decoding utilities.""" 2 | from math import log 3 | from sys import float_info 4 | 5 | import numpy as np 6 | import torch 7 | import transformers 8 | from torch import nn 9 | 10 | from .utils import get_device 11 | 12 | 13 | class Search(nn.Module): 14 | """Base search class.""" 15 | 16 | def __init__(self, *args, **kwargs): 17 | super().__init__() 18 | self.device = get_device() 19 | 20 | def forward(self, logits: torch.Tensor) -> object: 21 | """ 22 | Error handling. 23 | 24 | Args: 25 | logits: torch.Tensor (Tensor): the model's 26 | logits. (batch_size, length, vocabulary_size) 27 | Returns: 28 | object: the search output. 
29 | """ 30 | if not len(logits.shape) == 3: 31 | raise ValueError(f"Logits need to be 3D Tensor, was: {logits.shape}") 32 | if not type(logits) == torch.Tensor: 33 | raise TypeError(f"Logits need to be torch.Tensor, was: {type(logits)}") 34 | 35 | def step(self, logits: torch.Tensor) -> object: 36 | """ 37 | Error handling. 38 | 39 | Args: 40 | logits: torch.Tensor (Tensor): the model's 41 | logits. (batch_size, vocabulary_size) 42 | Returns: 43 | object: the search output. 44 | """ 45 | if len(logits.shape) > 3: 46 | raise ValueError(f"Logits need to be 2D or 3D Tensor, was: {logits.shape}") 47 | if not type(logits) == torch.Tensor: 48 | raise TypeError(f"Logits need to be torch.Tensor, was: {type(logits)}") 49 | 50 | 51 | class GreedySearch(Search): 52 | """ "Greedy search.""" 53 | 54 | def __init__(self, *args, **kwargs): 55 | super().__init__(*args, **kwargs) 56 | 57 | def forward(self, logits: torch.Tensor) -> torch.Tensor: 58 | """ 59 | Perform the greedy search. 60 | 61 | Args: 62 | logits: torch.Tensor (Tensor): the model's 63 | logits. (batch_size, length, vocabulary_size) 64 | Returns: 65 | torch.Tensor: the token indexes selected. (batch_size, length) 66 | """ 67 | super().forward(logits) 68 | 69 | return torch.argmax(logits, 2) 70 | 71 | def step(self, logits: torch.Tensor) -> torch.Tensor: 72 | """ 73 | Perform a greedy search step. 74 | 75 | Args: 76 | logits (torch.Tensor): the model's 77 | logits. (batch_size, vocabulary_size) 78 | Returns: 79 | torch.Tensor: the token indexes for all the batch. (batch_size, 1). 80 | """ 81 | super().step(logits) 82 | return torch.argmax(logits, 1, keepdim=True) 83 | 84 | 85 | class SamplingSearch(Search): 86 | """ "Sampling search.""" 87 | 88 | def __init__(self, temperature: float = 1.0, *args, **kwargs): 89 | """ 90 | Initialize the sampling search. 91 | 92 | Args: 93 | temperature (float, optional): temperature parameter. Defaults to 94 | 1.0, a.k.a., no temperature. Temperature < 1 results in a more 95 | descriminative softmax, > 1 in a flatter distribution. 96 | """ 97 | super().__init__(*args, **kwargs) 98 | self.temperature = temperature 99 | 100 | def forward(self, logits: torch.Tensor) -> torch.Tensor: 101 | """ 102 | Perform the sampling search. 103 | 104 | Args: 105 | logits: torch.Tensor (Tensor): the model's 106 | logits. (batch_size, length, vocabulary_size) 107 | Returns: 108 | torch.Tensor: the token indexes selected. (batch_size, length) 109 | """ 110 | super().forward(logits) 111 | probabilities = torch.softmax(logits.div(self.temperature), 2) 112 | return torch.stack( 113 | [torch.multinomial(probability, 1) for probability in probabilities] 114 | ).squeeze(dim=-1) 115 | 116 | def step(self, logits: torch.Tensor) -> torch.Tensor: 117 | """ 118 | Perform a sampling search step. 119 | 120 | Args: 121 | logits (torch.Tensor): the model's 122 | logits. (batch_size, vocabulary_size) 123 | Returns: 124 | torch.Tensor: the token indexes for all the batch. (batch_size, 1). 125 | """ 126 | super().step(logits) 127 | probabilities = torch.softmax(logits.div(self.temperature), 1) 128 | return torch.stack( 129 | [torch.multinomial(probability, 1) for probability in probabilities] 130 | ) 131 | 132 | 133 | class BeamSearch(Search): 134 | """Beam search.""" 135 | 136 | def __init__( 137 | self, beam_width: int = 3, temperature: float = 1.0, top_tokens: int = 5 138 | ): 139 | """ 140 | Initialize the beam search. 141 | Args: 142 | beam_width (int, optional): top sequences returned. Defaults to 3. 
143 | temperature (float, optional): temperature parameter. Defaults to 144 | 1.0, a.k.a., no temperature. Temperature < 1 results in a more 145 | descriminative softmax, > 1 in a flatter distribution. 146 | top_tokens (int, optional): number of top dictionary tokens kept 147 | for the search, defaults to 5. 148 | """ 149 | super().__init__() 150 | self.beam_width = beam_width 151 | self.temperature = temperature 152 | self.top_tokens = top_tokens 153 | 154 | def _beam_step_per_sequence(self, probabilities: torch.Tensor, beams: list) -> list: 155 | """ 156 | Perform a beam search step. 157 | Args: 158 | probabilities (torch.Tensor): probabilities for the current step. 159 | (beam_width, vocabulary_size). 160 | beams (list): beams containg sequence and score. Length is equal 161 | to beam_width. 162 | Returns: 163 | list: updated beams. 164 | """ 165 | all_candidates = list() 166 | # expand each current candidate 167 | for probability, beam in zip(probabilities, beams): 168 | a_sequence, score = beam 169 | # Sort the probabilities over dict and select indices of top n 170 | top_token_indexes = np.argsort(-probability)[: self.top_tokens] 171 | for top_token in top_token_indexes: 172 | candidate = [ 173 | a_sequence + [top_token], 174 | score + log(probability[top_token] + float_info.epsilon), 175 | ] 176 | all_candidates.append(candidate) 177 | # order all candidates by score 178 | ordered = sorted(all_candidates, key=lambda pair: pair[1], reverse=True) 179 | # select best 180 | return ordered[: self.beam_width] 181 | 182 | def _beam_per_sequence(self, logits: torch.Tensor) -> tuple: 183 | """ 184 | Beam per sequence in the batch. 185 | Args: 186 | logits (torch.Tensor): logits. 187 | (length, vocabulary_size) 188 | Returns: 189 | tuple: a tuple containing: 190 | - a tensor with tokens. (length, beam_width) 191 | - score. (beam_width) 192 | """ 193 | beams = [[list(), 0.0]] 194 | probabilities = torch.softmax(logits.div(self.temperature), 1) 195 | # walk over each step in sequence 196 | for probability in probabilities: 197 | probability_beams = torch.stack( 198 | [probability] + [probability.clone() for _ in range(self.beam_width)] 199 | ) 200 | beams = self._beam_step_per_sequence(probability_beams, beams) 201 | sequences, scores = zip(*beams) 202 | return (torch.tensor(list(sequences)).T, torch.tensor(list(scores))) 203 | 204 | def forward(self, logits: torch.Tensor) -> tuple: 205 | """ 206 | Perform the beam search for a non-autoregressive generator. 207 | Args: 208 | logits (torch.Tensor): the model's 209 | logits. (batch_size, length, vocabulary_size) 210 | Returns: 211 | tuple: a tuple containing: 212 | - the token indexes for each top sequence. 213 | (batch_size, length, beam_width) 214 | - scores. (batch_size, beam_width) 215 | """ 216 | super().forward(logits) 217 | tokens, scores = zip( 218 | *[self._beam_per_sequence(sequence) for sequence in logits] 219 | ) 220 | return (torch.stack(tokens), torch.stack(scores)) 221 | 222 | def step(self, logits: torch.Tensor, beams: list) -> tuple: 223 | """ 224 | Perform a single beam search step for an autoregressive model. 225 | Args: 226 | logits (torch.Tensor): the model's 227 | logits. (beam_width, batch_size, vocabulary_size) 228 | beams (list): beams for all the batch. 229 | Returns: 230 | tuple: a tuple containing: 231 | - the token indexes for all the batch. 232 | (beam_width, batch_size) 233 | - updated beams for all the batch. 
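# Hedged illustrative aside (usage sketch on random logits): the shapes follow
# the docstrings above, (batch_size, length, vocabulary_size) in and token
# indices out; BeamSearch.forward additionally returns per-beam scores.
import torch
from terminator.search import BeamSearch, GreedySearch, SamplingSearch

logits = torch.randn(2, 5, 12)                       # toy batch: 2 x 5 x vocab of 12
greedy_tokens = GreedySearch()(logits)               # (2, 5)
sampled_tokens = SamplingSearch(temperature=0.7)(logits)     # (2, 5)
beam_tokens, beam_scores = BeamSearch(beam_width=3)(logits)  # (2, 5, 3), (2, 3)
print(greedy_tokens.shape, sampled_tokens.shape, beam_tokens.shape, beam_scores.shape)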
234 | """ 235 | super().step(logits) 236 | probabilities = torch.softmax(logits.div(self.temperature), 2) 237 | updated_beams = [ 238 | self._beam_step_per_sequence(sample_probability, sample_beams) 239 | for sample_probability, sample_beams in zip( 240 | probabilities.permute(1, 0, 2), beams 241 | ) 242 | ] 243 | token_beams = ( 244 | torch.stack( 245 | [ 246 | # get last token for each beam 247 | torch.tensor([beam[0][-1] for beam in sample_beams]) 248 | for sample_beams in updated_beams 249 | ] 250 | ) 251 | .permute(1, 0) 252 | .to(self.device) 253 | ) 254 | return (token_beams, updated_beams) 255 | 256 | 257 | SEARCH_FACTORY = {"greedy": GreedySearch, "beam": BeamSearch, "sample": SamplingSearch} 258 | -------------------------------------------------------------------------------- /terminator/tokenization.py: -------------------------------------------------------------------------------- 1 | """Tokenization utilties for exrepssions.""" 2 | import re 3 | from typing import Dict, List, Tuple 4 | 5 | import torch 6 | from selfies import decoder, split_selfies 7 | from transformers import BertTokenizer, XLNetTokenizer 8 | 9 | SMILES_TOKENIZER_PATTERN = r"(\%\([0-9]{3}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\||\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])" 10 | 11 | 12 | class RegexTokenizer: 13 | """Run regex tokenization""" 14 | 15 | def __init__(self, regex_pattern: str) -> None: 16 | """Constructs a RegexTokenizer. 17 | 18 | Args: 19 | regex_pattern: regex pattern used for tokenization 20 | """ 21 | self.regex_pattern = regex_pattern 22 | self.regex = re.compile(self.regex_pattern) 23 | 24 | def tokenize(self, text: str) -> List[str]: 25 | """Regex tokenization. 26 | 27 | Args: 28 | text: text to tokenize. 29 | 30 | Returns: 31 | extracted tokens. 32 | """ 33 | tokens = [token for token in self.regex.findall(text)] 34 | return tokens 35 | 36 | 37 | class PropertyTokenizer: 38 | """Run a property tokenization.""" 39 | 40 | def __init__(self) -> None: 41 | """Constructs a PropertyTokenizer.""" 42 | self.regex = re.compile(r"\s*(<\w+>)\s*?(\+|-)?(\d+)(\.)?(\d+)?\s*") 43 | 44 | def tokenize(self, text: str) -> List[str]: 45 | """Tokenization of a property. 46 | 47 | Args: 48 | text: text to tokenize. 49 | 50 | Returns: 51 | extracted tokens. 52 | """ 53 | tokens = [] 54 | matched = self.regex.match(text) 55 | if matched: 56 | property_name, sign, units, dot, decimals = matched.groups() 57 | tokens = [property_name] 58 | if sign: 59 | tokens += [f"_{sign}_"] 60 | tokens += [ 61 | f"_{number}_{position}_" for position, number in enumerate(units[::-1]) 62 | ][::-1] 63 | if dot: 64 | tokens += [f"_{dot}_"] 65 | if decimals: 66 | tokens += [ 67 | f"_{number}_-{position}_" 68 | for position, number in enumerate(decimals, 1) 69 | ] 70 | return tokens 71 | 72 | 73 | class PropertyTokenizerSquare(PropertyTokenizer): 74 | """Run a property tokenization.""" 75 | 76 | def __init__(self) -> None: 77 | """Constructs a PropertyTokenizer.""" 78 | self.regex = re.compile(r"\s*(\[\w+\])\s*?(\+|-)?(\d+)(\.)?(\d+)?\s*") 79 | 80 | 81 | class CharacterTokenizer: 82 | def __init__(self) -> None: 83 | """Constructs a tokenizer that simply splits each character""" 84 | self.tokenizer = lambda x: list(x) 85 | 86 | def tokenize(self, text: str) -> List[str]: 87 | """Tokenize an expression. 88 | 89 | Args: 90 | text: text to tokenize. 91 | 92 | Returns: 93 | extracted tokens. 
94 | """ 95 | return self.tokenizer(text) 96 | 97 | 98 | class SelfiesTokenizer(CharacterTokenizer): 99 | def __init__(self) -> None: 100 | """Constructs an expression tokenizer for SELFIES 101 | 102 | Args: 103 | expression_tokenizer: Separator token for properties and molecule. 104 | Defaults to '|'. 105 | """ 106 | self.tokenizer = lambda x: list(split_selfies(x)) 107 | 108 | 109 | class ExpressionTokenizer: 110 | def __init__( 111 | self, expression_tokenizer: str = "|", language: str = "SMILES" 112 | ) -> None: 113 | """Constructs an expression tokenizer. 114 | 115 | Args: 116 | expression_tokenizer (str): Token separating the property. Defaults to '|'. 117 | Must not occur in the language itself. 118 | language (str): Identifier for the (chemical) language. Should be either 119 | 'SMILES', 'SELFIES' or 'AAS'. 120 | """ 121 | self.language = language 122 | if language == "SMILES": 123 | self.text_tokenizer = RegexTokenizer(regex_pattern=SMILES_TOKENIZER_PATTERN) 124 | elif language == "SELFIES": 125 | self.text_tokenizer = SelfiesTokenizer() 126 | elif language == "AAS": 127 | self.text_tokenizer = CharacterTokenizer() 128 | else: 129 | raise ValueError( 130 | f"Unsupported language {language}, choose 'SMILES', 'SELFIES' or 'AAS'." 131 | ) 132 | self.property_tokenizer = PropertyTokenizer() 133 | self.expression_separator = expression_tokenizer 134 | 135 | def tokenize(self, text: str) -> List[str]: 136 | """Tokenize an expression. 137 | 138 | Args: 139 | text: text to tokenize. 140 | 141 | Returns: 142 | extracted tokens. 143 | """ 144 | splitted_expression = text.split(self.expression_separator) 145 | tokens = [] 146 | for property_expression in splitted_expression[:-1]: 147 | tokens.extend(self.property_tokenizer.tokenize(property_expression)) 148 | tokens.append(self.expression_separator) 149 | tokens.extend(self.text_tokenizer.tokenize(splitted_expression[-1])) 150 | return tokens 151 | 152 | 153 | class ExpressionBertTokenizer(BertTokenizer): 154 | """ 155 | Constructs a bert-based tokenizer used for the Regression Transformer. 156 | 157 | Args: 158 | vocab_file: path to a token per line vocabulary file. 159 | """ 160 | 161 | def __init__( 162 | self, 163 | vocab_file, 164 | unk_token="[UNK]", 165 | sep_token="[SEP]", 166 | pad_token="[PAD]", 167 | cls_token="[CLS]", 168 | mask_token="[MASK]", 169 | pad_even: bool = True, 170 | language: str = "SMILES", 171 | **kwargs, 172 | ) -> None: 173 | """Constructs an ExpressionTokenizer. 174 | 175 | Args: 176 | vocab_file: vocabulary file containing tokens. 177 | unk_token: unknown token. Defaults to "[UNK]". 178 | sep_token: separator token. Defaults to "[SEP]". 179 | pad_token: pad token. Defaults to "[PAD]". 180 | cls_token: cls token. Defaults to "[CLS]". 181 | mask_token: mask token. Defaults to "[MASK]". 182 | pad_even (bool): Boolean indicating whether sequences of odd length should 183 | be padded to have an even length. Neede for PLM in XLNet. Defaults to 184 | True. 185 | language (str): Identifier for the (chemical) language. Should be either 186 | 'SMILES', 'SELFIES' or 'AAS'. 
187 | """ 188 | super().__init__( 189 | vocab_file=vocab_file, 190 | do_lower_case=False, 191 | do_basic_tokenize=True, 192 | unk_token=unk_token, 193 | sep_token=sep_token, 194 | pad_token=pad_token, 195 | cls_token=cls_token, 196 | mask_token=mask_token, 197 | **kwargs, 198 | ) 199 | # define tokenization utilities 200 | self.language = language 201 | if language == "SMILES": 202 | self.text_tokenizer = RegexTokenizer(regex_pattern=SMILES_TOKENIZER_PATTERN) 203 | elif self.language == "SELFIES": 204 | self.text_tokenizer = SelfiesTokenizer() 205 | elif language == "AAS": 206 | self.text_tokenizer = CharacterTokenizer() 207 | else: 208 | raise ValueError( 209 | f"Unsupported language {language}, choose 'SMILES', 'SELFIES' or 'AAS'." 210 | ) 211 | 212 | self.property_tokenizer = PropertyTokenizer() 213 | self.expression_separator = "|" 214 | self.separator_idx = self.vocab[self.expression_separator] 215 | self.pad_even = pad_even 216 | 217 | # DEPRECATED 218 | if pad_even: 219 | self.pad_even_fn = lambda x: x if len(x) % 2 == 0 else x + [self.pad_token] 220 | else: 221 | self.pad_even_fn = lambda x: x 222 | 223 | @property 224 | def vocab_list(self) -> List[str]: 225 | """List vocabulary tokens. 226 | 227 | Returns: 228 | a list of vocabulary tokens. 229 | """ 230 | return list(self.vocab.keys()) 231 | 232 | def _tokenize(self, text: str) -> List[str]: 233 | """Tokenize a text representing an expression. 234 | 235 | Args: 236 | text: text to tokenize. 237 | 238 | Returns: 239 | extracted tokens. 240 | """ 241 | splitted_expression = text.split(self.expression_separator) 242 | tokens = [] 243 | for property_expression in splitted_expression[:-1]: 244 | tokens.extend(self.property_tokenizer.tokenize(property_expression)) 245 | tokens.append(self.expression_separator) 246 | tokens.extend(self.text_tokenizer.tokenize(splitted_expression[-1])) 247 | # TODO: remove this hack 248 | # This is a hack to get around DataCollatorForLanguageModeling requiring even 249 | # length sequences 250 | return self.pad_even_fn(tokens) 251 | 252 | def add_padding_tokens( 253 | self, token_ids: List[int], max_length: int, padding_right: bool = True 254 | ) -> List[int]: 255 | """Adds padding tokens to return a sequence of length max_length. 256 | 257 | By default padding tokens are added to the right of the sequence. 258 | 259 | Args: 260 | token_ids: token indexes. 261 | max_length: maximum length of the sequence. 262 | padding_right: whether the sequence is padded on the right. Defaults to True. 263 | 264 | Returns: 265 | padded sequence of token indexes. 266 | """ 267 | padding_ids = [self.pad_token_id] * (max_length - len(token_ids)) 268 | if padding_right: 269 | return token_ids + padding_ids 270 | else: 271 | return padding_ids + token_ids 272 | 273 | @staticmethod 274 | def get_sample_label(mlm_label: List[str], mlm_input: List[str]) -> List[str]: 275 | """MLM case: Retrieve true sample sequence from mlm label and mlm input. 276 | NOTE: Also works for PLM. 277 | 278 | Args: 279 | mlm_label (List[str]): Target sample used in MLM. 280 | mlm_input (List[str]): MLM input sample. 281 | 282 | Returns: 283 | List[str]: Sample sequence as part of the dataset 284 | """ 285 | 286 | return [i if el == "[UNK]" else el for el, i in zip(mlm_label, mlm_input)] 287 | 288 | @staticmethod 289 | def get_sample_prediction( 290 | mlm_prediction: List[str], mlm_input: List[str] 291 | ) -> List[str]: 292 | """MLM case: Retrieve predicted sequence from mlm prediction and mlm input 293 | NOTE: Also works for PLM. 
294 | 295 | Args: 296 | mlm_label (List[str]): Target sample used in MLM. 297 | mlm_input (List[str]): MLM input sample. 298 | 299 | Returns: 300 | List[str]: Sample sequence as part of the dataset 301 | """ 302 | return [ 303 | i if i not in ["[MASK]"] else o for o, i in zip(mlm_prediction, mlm_input) 304 | ] 305 | 306 | @staticmethod 307 | def floating_tokens_to_float(token_ids: List[str]) -> float: 308 | """Converts tokens representing a float value into a float. 309 | NOTE: Expects that non-floating tokens are strippped off 310 | 311 | Args: 312 | token_ids: List of tokens, each representing a float. 313 | E.g.: ['_0_0_', '_._', '_9_-1_', '_3_-2_', '_1_-3_'] 314 | 315 | Returns: 316 | float: Float representation for the list of tokens. 317 | """ 318 | try: 319 | float_string = "".join([token.split("_")[1] for token in token_ids]) 320 | float_value = float(float_string) 321 | except ValueError: 322 | float_value = -1 323 | return float_value 324 | 325 | def aggregate_tokens( 326 | self, token_ids: List[str], label_mode: bool, cls_first: bool = True 327 | ) -> Tuple[str, Dict]: 328 | """Receives tokens of one sample and returns sequence (e.g. SMILES) and 329 | a dict of properties. 330 | 331 | Args: 332 | token_ids (List[str]): List of tokens. 333 | label_mode (bool): Whether the token_ids are labels or predictions. 334 | cls_first (bool, optional): Whether CLS token occurres first, default: True 335 | 336 | Returns: 337 | Tuple[str, Dict]: 338 | str: SMILES/SELFIES sequence of sample. 339 | Dict: A dictionary with property names (e.g. 'qed') as key and 340 | properties as values. 341 | """ 342 | edx = min( 343 | token_ids.index("[SEP]") if "[SEP]" in token_ids else 1000, 344 | token_ids.index("[PAD]") if "[PAD]" in token_ids else 1000, 345 | ) 346 | 347 | edx = -1 if edx == 1000 else edx 348 | 349 | seq = ( 350 | "".join(token_ids[token_ids.index("|") + 1 : edx]) 351 | if "|" in token_ids 352 | else "".join(token_ids) 353 | ) 354 | property_dict = {} 355 | for idx, t in enumerate(token_ids): 356 | if t.startswith("<") and t.endswith(">"): 357 | key = t[1:-1] 358 | 359 | # Convert float 360 | end_floating_idx = idx + 1 361 | while token_ids[end_floating_idx].startswith("_"): 362 | end_floating_idx += 1 363 | 364 | prop = self.floating_tokens_to_float( 365 | token_ids[idx + 1 : end_floating_idx] 366 | ) 367 | 368 | property_dict[key] = prop 369 | 370 | return seq, property_dict 371 | 372 | def to_readable(self, sequence: str) -> str: 373 | """Safely returns a readable string irrespective of whether the language is 374 | SMILES, SELFIES or AAS. 375 | 376 | Args: 377 | sequence (str): A string representing a molecule (either SMILES or SELFIES) 378 | or amino acid sequence. 379 | 380 | Returns: 381 | str: A SMILES representing the same molecule. 382 | """ 383 | if self.language == "SMILES": 384 | return sequence 385 | elif self.language == "SELFIES": 386 | return decoder(sequence) 387 | elif self.language == "AAS": 388 | return sequence 389 | else: 390 | raise AttributeError(f"Unknown language {self.language}") 391 | 392 | 393 | class XLNetRTTokenizer(XLNetTokenizer): 394 | """ 395 | A XLNet-based tokenizer for the Regression Transformer, build for the 396 | humicroedit dataset 397 | """ 398 | 399 | def set_property_tokenizer( 400 | self, 401 | tokenizer: PropertyTokenizer, 402 | expression_separator: str = "{", 403 | expression_end: str = "}", 404 | property_token: str = "[funny]", 405 | ): 406 | """ 407 | Set the property tokenizer to be used by the main tokenizer. 
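# Hedged illustrative aside (usage sketch only): `floating_tokens_to_float` above
# inverts the digit-token notation back into a float. `aggregate_tokens` builds on
# it to split a decoded token list into the sequence after '|' and a
# {property_name: value} dict, but it needs a tokenizer instance (and hence a
# vocabulary file), so only the static helper is shown here.
from terminator.tokenization import ExpressionBertTokenizer

float_tokens = ["_0_0_", "_._", "_8_-1_", "_5_-2_"]
print(ExpressionBertTokenizer.floating_tokens_to_float(float_tokens))  # 0.85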
408 | 409 | Args: 410 | tokenizer: a property tokenizer. 411 | expression_separator: a token that separates the property from the rest. 412 | expression_end: a token that ends the joke-token sequence. 413 | property_token: the property token. 414 | """ 415 | self.property_tokenizer = tokenizer 416 | # The start token indicating the joke tokens 417 | self.expression_separator = expression_separator 418 | self.expressiond_end = expression_end 419 | self.property_token = property_token 420 | 421 | def set_vocab(self): 422 | self.vocab = self.get_vocab() 423 | self.idx_to_token = dict(zip(self.vocab.values(), self.vocab.keys())) 424 | 425 | def _tokenize(self, text: str) -> List[str]: 426 | """ 427 | Core tokenization function. 428 | 429 | Args: 430 | text: A string to be tokenized. 431 | 432 | Returns: 433 | A list of tokens. 434 | """ 435 | prop, rest = text.split(self.expression_separator) 436 | tokens = self.property_tokenizer.tokenize(prop) + [self.expression_separator] 437 | 438 | entities = rest.split(self.expressiond_end) 439 | for idx, entity in enumerate(entities): 440 | tokens.extend(super()._tokenize(entity)) 441 | if idx < len(entities) - 1: 442 | tokens.extend([self.expressiond_end]) 443 | 444 | if len(tokens) % 2 != 0: 445 | tokens + [self.pad_token] 446 | return tokens 447 | 448 | @property 449 | def vocab_list(self): 450 | return list(self.vocab.keys()) 451 | 452 | @staticmethod 453 | def floating_tokens_to_float(token_ids: List[str]) -> float: 454 | """Converts tokens representing a float value into a float. 455 | NOTE: Expects that non-floating tokens are strippped off 456 | 457 | Args: 458 | token_ids: List of tokens, each representing a float. 459 | E.g.: ['_0_0_', '_._', '_9_-1_', '_3_-2_', '_1_-3_'] 460 | 461 | Returns: 462 | float: Float representation for the list of tokens. 463 | """ 464 | try: 465 | float_string = "".join([token.split("_")[1] for token in token_ids]) 466 | float_value = float(float_string) 467 | except ValueError: 468 | float_value = -1 469 | return float_value 470 | 471 | def decode_internal(self, token_ids: torch.Tensor, *args, **kwargs) -> str: 472 | tokens = "" 473 | for _id in token_ids.tolist(): 474 | token = self.idx_to_token[_id] if _id != -100 else "[UNK]" 475 | tokens += token + " " 476 | return tokens 477 | 478 | @staticmethod 479 | def get_sample_label(mlm_label: List[str], mlm_input: List[str]) -> List[str]: 480 | """MLM case: Retrieve true sample sequence from mlm label and mlm input. 481 | NOTE: Also works for PLM. 482 | 483 | Args: 484 | mlm_label (List[str]): Target sample used in MLM. 485 | mlm_input (List[str]): MLM input sample. 486 | 487 | Returns: 488 | List[str]: Sample sequence as part of the dataset 489 | """ 490 | 491 | return [i if el == "[UNK]" else el for el, i in zip(mlm_label, mlm_input)] 492 | 493 | @staticmethod 494 | def get_sample_prediction( 495 | mlm_prediction: List[str], mlm_input: List[str] 496 | ) -> List[str]: 497 | """MLM case: Retrieve predicted sequence from mlm prediction and mlm input 498 | NOTE: Also works for PLM. 499 | 500 | Args: 501 | mlm_label (List[str]): Target sample used in MLM. 502 | mlm_input (List[str]): MLM input sample. 
503 | 504 | Returns: 505 | List[str]: Sample sequence as part of the dataset 506 | """ 507 | return [ 508 | i if i not in ["[MASK]", ""] else o 509 | for o, i in zip(mlm_prediction, mlm_input) 510 | ] 511 | 512 | def aggregate_tokens( 513 | self, token_ids: List[str], label_mode: bool, cls_first: bool = True 514 | ) -> Tuple[str, Dict]: 515 | """Receives tokens of one sample and returns sequence (e.g. SMILES) and 516 | a dict of properties. 517 | 518 | Args: 519 | token_ids (List[str]): List of tokens. 520 | label_mode (bool): Whether the token_ids are labels or predictions. 521 | cls_first (bool, optional): Whether CLS token occurres first, default: True 522 | 523 | Returns: 524 | Tuple[str, Dict]: 525 | str: SMILES/SELFIES sequence of sample. 526 | Dict: A dictionary with property names (e.g. 'qed') as key and 527 | properties as values. 528 | """ 529 | edx = min( 530 | token_ids.index("[SEP]") if "[SEP]" in token_ids else 1000, 531 | token_ids.index("[PAD]") if "[PAD]" in token_ids else 1000, 532 | ) 533 | 534 | edx = -1 if edx == 1000 else edx 535 | 536 | seq = ( 537 | "".join(token_ids[token_ids.index("|") + 1 : edx]) 538 | if "|" in token_ids 539 | else "".join(token_ids) 540 | ) 541 | property_dict = {} 542 | for idx, t in enumerate(token_ids): 543 | if t == self.property_token: 544 | key = t[1:-1] 545 | 546 | # Convert float 547 | end_floating_idx = idx + 1 548 | while token_ids[end_floating_idx].startswith("_"): 549 | end_floating_idx += 1 550 | 551 | prop = self.floating_tokens_to_float( 552 | token_ids[idx + 1 : end_floating_idx] 553 | ) 554 | 555 | property_dict[key] = prop 556 | 557 | return seq, property_dict 558 | -------------------------------------------------------------------------------- /terminator/trainer_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Any, Dict, Optional 3 | 4 | import numpy as np 5 | import torch 6 | import transformers 7 | from torch import Tensor 8 | from transformers.utils import logging 9 | 10 | logger = logging.get_logger(__name__) 11 | 12 | 13 | def get_trainer_dict(dictionary: Dict[str, Any]) -> Dict[str, Any]: 14 | """Helper function to take out a subset of a dictionary with keys that are 15 | important for `CustomTrainer` but cant be passed down to `Trainer`. 16 | 17 | Args: 18 | dictionary (dict): Dict with keyword arguments for `CustomTrainer` constructor. 19 | 20 | Returns: 21 | dict: Dict with keyword arguments for `CustomTrainer` that cant be passed to 22 | childclass constructor (`Trainer`). 23 | """ 24 | keys_to_keep = [ 25 | "verbose_evaluation", 26 | "numerical", 27 | "d_model", 28 | "vocab_size", 29 | "vmax", 30 | "model_type", 31 | "mem_len", 32 | "training_logs", 33 | "train_config", 34 | "alternating_collator", 35 | ] 36 | keep_dict = {} 37 | for keep_key in keys_to_keep: 38 | for key, val in dictionary.items(): 39 | if re.search(keep_key, key) is not None: 40 | keep_dict[key] = val 41 | return keep_dict 42 | 43 | 44 | """ 45 | All below code is taken from transformers==3.5.0 to remedy issues with tensor stacking. 46 | NOTE: 3.4.0 introduces accumulation steps in evaluation, but only 3.5.0 allows the 47 | Trainer to handle dynamic sequence lengths. 
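# Hedged illustrative aside (usage sketch, toy dict is made up): `get_trainer_dict`
# above keeps only the keyword arguments whose names match one of the
# CustomTrainer-specific keys, so the remainder can be forwarded to the parent
# `Trainer`.
from terminator.trainer_utils import get_trainer_dict

kwargs = {"model_type": "xlnet", "vmax": 1.0, "learning_rate": 5e-5}
print(get_trainer_dict(kwargs))
# {'vmax': 1.0, 'model_type': 'xlnet'}   (learning_rate is left for Trainer itself)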
48 | """ 49 | 50 | 51 | def nested_new_like(arrays, num_samples, padding_index=-100): 52 | """Create the same nested structure as `arrays` with a first dimension always at `num_samples`.""" 53 | if isinstance(arrays, (list, tuple)): 54 | return type(arrays)(nested_new_like(x, num_samples) for x in arrays) 55 | return np.full_like(arrays, padding_index, shape=(num_samples, *arrays.shape[1:])) 56 | 57 | 58 | def nested_truncate(tensors, limit): 59 | "Truncate `tensors` at `limit` (even if it's a nested list/tuple of tensors)." 60 | if isinstance(tensors, (list, tuple)): 61 | return type(tensors)(nested_truncate(t, limit) for t in tensors) 62 | return tensors[:limit] 63 | 64 | 65 | def nested_expand_like(arrays, new_seq_length, padding_index=-100): 66 | """Expand the `arrays` so that the second dimension grows to `new_seq_length`. 67 | Uses `padding_index` for padding.""" 68 | if isinstance(arrays, (list, tuple)): 69 | return type(arrays)( 70 | nested_expand_like(x, new_seq_length, padding_index=padding_index) 71 | for x in arrays 72 | ) 73 | 74 | result = np.full_like( 75 | arrays, 76 | padding_index, 77 | shape=(arrays.shape[0], new_seq_length) + arrays.shape[2:], 78 | ) 79 | result[:, : arrays.shape[1]] = arrays 80 | return result 81 | 82 | 83 | def _get_first_shape(arrays): 84 | """Return the shape of the first array found in the nested struct `arrays`.""" 85 | if isinstance(arrays, (list, tuple)): 86 | return _get_first_shape(arrays[0]) 87 | return arrays.shape 88 | 89 | 90 | class DistributedTensorGatherer: 91 | """ 92 | A class responsible for properly gathering tensors (or nested list/tuple of tensors) on the CPU 93 | by chunks. 94 | If our dataset has 16 samples with a batch size of 2 on 3 processes and we gather then transfer on 95 | CPU at every step, our sampler will generate the following indices: 96 | :obj:`[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1]` 97 | to get something of size a multiple of 3 (so that each process gets the same dataset length). Then 98 | process 0, 1 and 2 will be responsible of making predictions for the following samples: 99 | - P0: :obj:`[0, 1, 2, 3, 4, 5]` 100 | - P1: :obj:`[6, 7, 8, 9, 10, 11]` 101 | - P2: :obj:`[12, 13, 14, 15, 0, 1]` 102 | The first batch treated on each process will be 103 | - P0: :obj:`[0, 1]` 104 | - P1: :obj:`[6, 7]` 105 | - P2: :obj:`[12, 13]` 106 | So if we gather at the end of the first batch, we will get a tensor (nested list/tuple of tensor) 107 | corresponding to the following indices: 108 | :obj:`[0, 1, 6, 7, 12, 13]` 109 | If we directly concatenate our results without taking any precautions, the user will then get 110 | the predictions for the indices in this order at the end of the prediction loop: 111 | :obj:`[0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1]` 112 | For some reason, that's not going to roll their boat. This class is there to solve that problem. 113 | Args: 114 | world_size (:obj:`int`): 115 | The number of processes used in the distributed training. 116 | num_samples (:obj:`int`): 117 | The number of samples in our dataset. 118 | make_multiple_of (:obj:`int`, `optional`): 119 | If passed, the class assumes the datasets passed to each process are made to be a multiple of this argument 120 | (by adding samples). 
121 | """ 122 | 123 | def __init__( 124 | self, world_size, num_samples, make_multiple_of=None, padding_index=-100 125 | ): 126 | self.world_size = world_size 127 | self.num_samples = num_samples 128 | total_size = ( 129 | world_size if make_multiple_of is None else world_size * make_multiple_of 130 | ) 131 | self.total_samples = int(np.ceil(num_samples / total_size)) * total_size 132 | self.process_length = self.total_samples // world_size 133 | self._storage = None 134 | self._offsets = None 135 | self.padding_index = padding_index 136 | 137 | def add_arrays(self, arrays): 138 | """ 139 | Add :obj:`arrays` to the internal storage, Will initialize the storage to the full size at the first arrays 140 | passed so that if we're bound to get an OOM, it happens at the beginning. 141 | """ 142 | if arrays is None: 143 | return 144 | if self._storage is None: 145 | self._storage = nested_new_like( 146 | arrays, self.total_samples, padding_index=self.padding_index 147 | ) 148 | self._offsets = list(range(0, self.total_samples, self.process_length)) 149 | else: 150 | storage_shape = _get_first_shape(self._storage) 151 | arrays_shape = _get_first_shape(arrays) 152 | if len(storage_shape) > 1 and storage_shape[1] < arrays_shape[1]: 153 | # If we get new arrays that are too big too fit, we expand the shape fo the storage 154 | self._storage = nested_expand_like( 155 | self._storage, arrays_shape[1], padding_index=self.padding_index 156 | ) 157 | slice_len = self._nested_set_tensors(self._storage, arrays) 158 | for i in range(self.world_size): 159 | self._offsets[i] += slice_len 160 | 161 | def _nested_set_tensors(self, storage, arrays): 162 | if isinstance(arrays, (list, tuple)): 163 | for x, y in zip(storage, arrays): 164 | slice_len = self._nested_set_tensors(x, y) 165 | return slice_len 166 | assert ( 167 | arrays.shape[0] % self.world_size == 0 168 | ), f"Arrays passed should all have a first dimension multiple of {self.world_size}, found {arrays.shape[0]}." 169 | 170 | slice_len = arrays.shape[0] // self.world_size 171 | for i in range(self.world_size): 172 | if len(arrays.shape) == 1: 173 | storage[self._offsets[i] : self._offsets[i] + slice_len] = arrays[ 174 | i * slice_len : (i + 1) * slice_len 175 | ] 176 | else: 177 | storage[ 178 | self._offsets[i] : self._offsets[i] + slice_len, : arrays.shape[1] 179 | ] = arrays[i * slice_len : (i + 1) * slice_len] 180 | return slice_len 181 | 182 | def finalize(self): 183 | """ 184 | Return the properly gathered arrays and truncate to the number of samples (since the sampler added some extras 185 | to get each process a dataset of the same length). 186 | """ 187 | if self._storage is None: 188 | return 189 | if self._offsets[0] != self.process_length: 190 | logger.warn( 191 | "Not all data has been set. Are you sure you passed all values?" 
192 | ) 193 | return nested_truncate(self._storage, self.num_samples) 194 | 195 | 196 | def torch_pad_and_concatenate( 197 | tensor1: Tensor, tensor2: Tensor, padding_index: int = -100 198 | ) -> Tensor: 199 | """Concatenates `tensor1` and `tensor2` on first axis, applying padding on the second if necessary.""" 200 | if len(tensor1.shape) == 1 or tensor1.shape[1] == tensor2.shape[1]: 201 | return torch.cat((tensor1, tensor2), dim=0) 202 | 203 | # Let's figure out the new shape 204 | new_shape = ( 205 | tensor1.shape[0] + tensor2.shape[0], 206 | max(tensor1.shape[1], tensor2.shape[1]), 207 | ) + tensor1.shape[2:] 208 | 209 | # Now let's fill the result tensor 210 | result = tensor1.new_full(new_shape, padding_index) 211 | result[: tensor1.shape[0], : tensor1.shape[1]] = tensor1 212 | result[tensor1.shape[0] :, : tensor2.shape[1]] = tensor2 213 | return result.detach() 214 | 215 | 216 | def numpy_pad_and_concatenate( 217 | array1: np.ndarray, array2: np.ndarray, padding_index: int = -100 218 | ) -> np.ndarray: 219 | """Concatenates `array1` and `array2` on first axis, applying padding on the second if necessary.""" 220 | if len(array1.shape) == 1 or array1.shape[1] == array2.shape[1]: 221 | return np.concatenate((array1, array2), axis=0) 222 | 223 | # Let's figure out the new shape 224 | new_shape = ( 225 | array1.shape[0] + array2.shape[0], 226 | max(array1.shape[1], array2.shape[1]), 227 | ) + array1.shape[2:] 228 | 229 | # Now let's fill the result tensor 230 | result = np.full_like(array1, padding_index, shape=new_shape) 231 | result[: array1.shape[0], : array1.shape[1]] = array1 232 | result[array1.shape[0] :, : array2.shape[1]] = array2 233 | return result 234 | 235 | 236 | def nested_concat(tensors, new_tensors, padding_index=-100): 237 | """ 238 | Concat the `new_tensors` to `tensors` on the first dim and pad them on the second if needed. Works for tensors or 239 | nested list/tuples of tensors. 240 | """ 241 | assert type(tensors) == type( 242 | new_tensors 243 | ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}."
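As a small illustration of the padding behaviour of the two helpers defined above (a sketch for reference only, not part of the file):

import torch

a = torch.ones(2, 3, dtype=torch.long)
b = torch.zeros(1, 5, dtype=torch.long)
out = torch_pad_and_concatenate(a, b)
assert out.shape == (3, 5)            # ragged second dimensions are reconciled
assert (out[:2, 3:] == -100).all()    # rows coming from `a` are right-padded with -100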
244 | if isinstance(tensors, (list, tuple)): 245 | return type(tensors)( 246 | nested_concat(t, n, padding_index=padding_index) 247 | for t, n in zip(tensors, new_tensors) 248 | ) 249 | elif isinstance(tensors, torch.Tensor): 250 | return torch_pad_and_concatenate( 251 | tensors, new_tensors, padding_index=padding_index 252 | ) 253 | elif isinstance(tensors, np.ndarray): 254 | return numpy_pad_and_concatenate( 255 | tensors, new_tensors, padding_index=padding_index 256 | ) 257 | else: 258 | raise TypeError(f"Unsupported type for concatenation: got {type(tensors)}") 259 | 260 | 261 | def distributed_concat( 262 | tensor: "torch.Tensor", num_total_examples: Optional[int] = None 263 | ) -> torch.Tensor: 264 | try: 265 | if isinstance(tensor, (tuple, list)): 266 | return type(tensor)( 267 | distributed_concat(t, num_total_examples) for t in tensor 268 | ) 269 | output_tensors = [ 270 | tensor.clone() for _ in range(torch.distributed.get_world_size()) 271 | ] 272 | torch.distributed.all_gather(output_tensors, tensor) 273 | concat = torch.cat(output_tensors, dim=0) 274 | 275 | # truncate the dummy elements added by SequentialDistributedSampler 276 | if num_total_examples is not None: 277 | concat = concat[:num_total_examples] 278 | return concat 279 | except AssertionError: 280 | raise AssertionError("Not currently using distributed training") 281 | 282 | 283 | def nested_numpify(tensors): 284 | "Numpify `tensors` (even if it's a nested list/tuple of tensors)." 285 | if isinstance(tensors, (list, tuple)): 286 | return type(tensors)(nested_numpify(t) for t in tensors) 287 | return tensors.cpu().numpy() 288 | -------------------------------------------------------------------------------- /terminator/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import subprocess as sp 4 | import sys 5 | from typing import List 6 | 7 | import numpy as np 8 | import psutil 9 | import rdkit.rdBase as rkrb 10 | import rdkit.RDLogger as rkl 11 | import torch 12 | 13 | logger = logging.getLogger(__name__) 14 | logging.basicConfig(stream=sys.stdout, level=logging.INFO) 15 | 16 | 17 | def get_gpu_memory(): 18 | if not cuda(): 19 | return 0, 0, 0 20 | command = "nvidia-smi --query-gpu=memory.free --format=csv" 21 | memory_free_info = ( 22 | sp.check_output(command.split()).decode("ascii").split("\n")[:-1][1:] 23 | ) 24 | memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)] 25 | 26 | tot_m, used_m, free_m = map(int, os.popen("free -t -m").readlines()[-1].split()[1:]) 27 | return memory_free_values, used_m, tot_m 28 | 29 | 30 | def get_cpu_memory(): 31 | mem = psutil.virtual_memory() 32 | return mem.total / 1000**3, mem.percent, psutil.cpu_percent() 33 | 34 | 35 | def get_process_mmeory(): 36 | process = psutil.Process(os.getpid()) 37 | return process.memory_percent() 38 | 39 | 40 | def get_device(): 41 | return torch.device("cuda" if cuda() else "cpu") 42 | 43 | 44 | def cuda(): 45 | return torch.cuda.is_available() 46 | 47 | 48 | def get_latest_checkpoint(model_path: str, must_contain: str = "best") -> str: 49 | """ 50 | Given a path to the model folder it searches the latest saved checkpoint 51 | and returns the path to it. 52 | Args: 53 | model_path (str): Path to model folder. Has to contain folders called 54 | 'checkpoint-best-STEP' and 'checkpoint-latest-STEP' where STEP is 55 | a positive integer. 56 | must_contain (str, optional): Subselect checkpoints that contain a 57 | certain query. Defaults to 'best'. 
58 | Returns: 59 | str: Path to latest checkpoint 60 | """ 61 | 62 | # Finding checkpoints 63 | checkpoints = [f for f in os.listdir(model_path) if f.startswith("checkpoint")] 64 | if must_contain is not None: 65 | checkpoints = list(filter(lambda x: must_contain in x, checkpoints)) 66 | 67 | if len(checkpoints) == 0: 68 | logger.warning( 69 | f"No checkpoints found that contain {must_contain} in {model_path}." 70 | ) 71 | # Relax criteria and retry 72 | next_try = "checkpoint" if must_contain != "checkpoint" else "" 73 | return get_latest_checkpoint(model_path, must_contain=next_try) 74 | 75 | # Sorting 76 | try: 77 | idx = np.argsort([int(c.split("-")[-1]) for c in checkpoints])[-1] 78 | except ValueError: 79 | raise ValueError(f"Checkpoints dont seem to follow format: {checkpoints}.") 80 | 81 | return os.path.join(model_path, checkpoints[idx]) 82 | 83 | 84 | def disable_rdkit_logging(): 85 | """ 86 | Disables RDKit whiny logging. 87 | """ 88 | logger = rkl.logger() 89 | logger.setLevel(rkl.ERROR) 90 | rkrb.DisableLog("rdApp.error") 91 | 92 | 93 | def find_safe_path(path: str) -> str: 94 | """Method to find a safe path that does not exist yet. 95 | Args: 96 | path (str): Desired path. 97 | Returns: 98 | str: Non existing path. 99 | """ 100 | safe_path = path 101 | c = 0 102 | while os.path.exists(safe_path): 103 | c += 1 104 | safe_path = ".".join( 105 | [ 106 | s if i != path.count(".") - 1 else f"{s}_v{c}" 107 | for i, s in enumerate(path.split(".")) 108 | ] 109 | ) 110 | return safe_path 111 | 112 | 113 | def get_equispaced_ranges( 114 | data_path: str, properties: List[str], n: int = 10, precisions: List[int] = [2] 115 | ) -> List[List[float]]: 116 | """ 117 | Given a path to a data file it returns the ranges of the properties. 118 | Args: 119 | data_path : Path to data file. 120 | properties: List of properties to consider. 121 | n: number of points per property (will be equally spaced). 122 | precisions: number of decimal places to round to (one per property). 123 | Returns: 124 | List of ranges for each property. 
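The docstring above leaves the data format implicit; judging from the parsing in the function body that follows, each line pairs a property token, its value, and the sequence. A hedged sketch (the property name and line layout are assumptions, in the spirit of examples/qed_property_example.txt):

# Assumed line layout: <property>value|sequence
line = "<qed>0.543|CC(C)CC1=CC=C(C=C1)C(C)C(=O)O"
value = float(line.split("<qed>")[-1].split("|")[0])  # -> 0.543, the same parse used below
# ranges = get_equispaced_ranges("examples/qed_property_example.txt", ["<qed>"], n=10, precisions=[2])
# would then yield 10 equispaced, 2-decimal query values spanning the observed <qed> range.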
125 | """ 126 | with open(data_path, "r") as f: 127 | data = f.readlines() 128 | 129 | ranges = [] 130 | 131 | for prop, pre in zip(properties, precisions): 132 | 133 | values = [float(line.split(prop)[-1].split("|")[0]) for line in data] 134 | _range = [] 135 | for x in np.linspace(np.min(values), np.max(values), n): 136 | if pre == 1: 137 | _range.append(f"{x:.1f}") 138 | elif pre == 2: 139 | _range.append(f"{x:.2f}") 140 | elif pre == 3: 141 | _range.append(f"{x:.3f}") 142 | elif pre == 4: 143 | _range.append(f"{x:.4f}") 144 | ranges.append(_range) 145 | return ranges 146 | -------------------------------------------------------------------------------- /training_configs/qed_alternated_cc.json: -------------------------------------------------------------------------------- 1 | { 2 | "reset_training_loss": true, 3 | "alternate_tasks": true, 4 | "cc_loss": true, 5 | "property_tokens": [ 6 | "" 7 | ], 8 | "alternate_steps": 50, 9 | "checkpoint-str": "best", 10 | "cg_collator": "vanilla_cg", 11 | "cg_collator_params": { 12 | "do_sample": false, 13 | "property_tokens": [ 14 | "" 15 | ], 16 | "plm_probability": 0.4, 17 | "max_span_length": 12 18 | } 19 | } -------------------------------------------------------------------------------- /training_configs/qed_proponly.json: -------------------------------------------------------------------------------- 1 | { 2 | "reset_training_loss": true, 3 | "alternate_tasks": false, 4 | "task": "proponly", 5 | "checkpoint-str": "pearson", 6 | "property_tokens": [ 7 | "" 8 | ] 9 | } -------------------------------------------------------------------------------- /training_configs/reactions_alternating_cc.json: -------------------------------------------------------------------------------- 1 | { 2 | "reset_training_loss": true, 3 | "alternate_tasks": true, 4 | "cg_collator": "multientity_cg", 5 | "cc_loss": true, 6 | "property_tokens": [ 7 | "" 8 | ], 9 | "alternate_steps": 50, 10 | "cg_collator_params": { 11 | "do_sample": false, 12 | "property_tokens": [ 13 | "" 14 | ], 15 | "plm_probability": 1.0, 16 | "max_span_length": 7, 17 | "entity_to_mask": -1, 18 | "entity_separator_token": "" 19 | } 20 | } -------------------------------------------------------------------------------- /vocabs/proteins.txt: -------------------------------------------------------------------------------- 1 | 2 | [PAD] 3 | [unused1] 4 | [unused2] 5 | [unused3] 6 | [unused4] 7 | [unused5] 8 | [unused6] 9 | [unused7] 10 | [unused8] 11 | [unused9] 12 | 13 | [UNK] 14 | [CLS] 15 | [SEP] 16 | [MASK] 17 | | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | _._ 28 | _0_-0_ 29 | _0_-1_ 30 | _0_-2_ 31 | _0_-3_ 32 | _0_-4_ 33 | _0_-5_ 34 | _0_0_ 35 | _0_1_ 36 | _0_2_ 37 | _0_3_ 38 | _0_4_ 39 | _0_5_ 40 | _1_-0_ 41 | _1_-1_ 42 | _1_-2_ 43 | _1_-3_ 44 | _1_-4_ 45 | _1_-5_ 46 | _1_-6_ 47 | _1_0_ 48 | _1_1_ 49 | _1_2_ 50 | _1_3_ 51 | _1_4_ 52 | _1_5_ 53 | _2_-0_ 54 | _2_-1_ 55 | _2_-2_ 56 | _2_-3_ 57 | _2_-4_ 58 | _2_-5_ 59 | _2_-6_ 60 | _2_0_ 61 | _2_1_ 62 | _2_2_ 63 | _2_3_ 64 | _2_4_ 65 | _2_5_ 66 | _3_-0_ 67 | _3_-1_ 68 | _3_-2_ 69 | _3_-3_ 70 | _3_-4_ 71 | _3_-5_ 72 | _3_-6_ 73 | _3_0_ 74 | _3_1_ 75 | _3_2_ 76 | _3_3_ 77 | _3_4_ 78 | _3_5_ 79 | _4_-0_ 80 | _4_-1_ 81 | _4_-2_ 82 | _4_-3_ 83 | _4_-4_ 84 | _4_-5_ 85 | _4_-6_ 86 | _4_0_ 87 | _4_1_ 88 | _4_2_ 89 | _4_3_ 90 | _4_4_ 91 | _4_5_ 92 | _5_-0_ 93 | _5_-1_ 94 | _5_-2_ 95 | _5_-3_ 96 | _5_-4_ 97 | _5_-5_ 98 | _5_-6_ 99 | _5_0_ 100 | _5_1_ 101 | _5_2_ 102 | _5_3_ 103 | _5_4_ 104 | _5_5_ 105 | _6_-0_ 106 | _6_-1_ 107 | _6_-2_ 108 | _6_-3_ 
109 | _6_-4_ 110 | _6_-5_ 111 | _6_-6_ 112 | _6_0_ 113 | _6_1_ 114 | _6_2_ 115 | _6_3_ 116 | _6_4_ 117 | _6_5_ 118 | _7_-0_ 119 | _7_-1_ 120 | _7_-2_ 121 | _7_-3_ 122 | _7_-4_ 123 | _7_-5_ 124 | _7_-6_ 125 | _7_0_ 126 | _7_1_ 127 | _7_2_ 128 | _7_3_ 129 | _7_4_ 130 | _7_5_ 131 | _8_-0_ 132 | _8_-1_ 133 | _8_-2_ 134 | _8_-3_ 135 | _8_-4_ 136 | _8_-5_ 137 | _8_-6_ 138 | _8_0_ 139 | _8_1_ 140 | _8_2_ 141 | _8_3_ 142 | _8_4_ 143 | _8_5_ 144 | _9_-0_ 145 | _9_-1_ 146 | _9_-2_ 147 | _9_-3_ 148 | _9_-4_ 149 | _9_-5_ 150 | _9_-6_ 151 | _9_0_ 152 | _9_1_ 153 | _9_2_ 154 | _9_3_ 155 | _9_4_ 156 | _9_5_ 157 | A 158 | B 159 | C 160 | D 161 | E 162 | F 163 | G 164 | H 165 | I 166 | J 167 | K 168 | L 169 | M 170 | N 171 | O 172 | P 173 | Q 174 | R 175 | S 176 | T 177 | U 178 | V 179 | W 180 | X 181 | Y 182 | Z 183 | a 184 | b 185 | c 186 | d 187 | e 188 | f 189 | g 190 | h 191 | i 192 | j 193 | k 194 | l 195 | m 196 | n 197 | o 198 | p 199 | q 200 | r 201 | s 202 | t 203 | u 204 | v 205 | w 206 | x 207 | y 208 | z 209 | [Branch2_1] 210 | [=O] 211 | [epsilon] 212 | [Ring1] 213 | [=C] 214 | [Ring2] 215 | [Branch1_3] 216 | [N] 217 | [Branch1_1] 218 | [C] 219 | [=N] 220 | [Branch2_3] 221 | [Branch1_2] 222 | [#N] 223 | [Br] 224 | [O] 225 | [Branch2_2] 226 | [F] 227 | [S] 228 | [=S] 229 | [#C] 230 | [Cl] 231 | [O-expl] 232 | [N+expl] 233 | [P] 234 | [.] 235 | [I] 236 | [c] 237 | [-c] 238 | [s] 239 | [nHexpl] 240 | [\c] 241 | [n] 242 | [\C] 243 | [o] 244 | [C@@Hexpl] 245 | [C@expl] 246 | [C@@expl] 247 | [C@Hexpl] 248 | [/C] 249 | [/c] 250 | [Ptexpl] 251 | [\N] 252 | [\C@@Hexpl] 253 | [/C@Hexpl] 254 | [\C@Hexpl] 255 | [-n] 256 | [=c] 257 | [B] 258 | [\S] 259 | [/n] 260 | [=N+expl] 261 | [Expl\Ring2] 262 | [Expl/Ring1] 263 | [n+expl] 264 | [Expl\Ring1] 265 | [Asexpl] 266 | [N@@expl] 267 | [S@@expl] 268 | [/O] 269 | [Expl-Ring1] 270 | [/N] 271 | [S+expl] 272 | [/S] 273 | [Pexpl] 274 | [=Nexpl] 275 | [#O+expl] 276 | [C-expl] 277 | [Iexpl] 278 | [O+expl] 279 | [Brexpl] 280 | [Clexpl] 281 | [=N-expl] 282 | [N-expl] 283 | [P+expl] 284 | [Oexpl] 285 | [#C-expl] 286 | [=Oexpl] 287 | [#N+expl] 288 | [=Iexpl] 289 | [CH-expl] 290 | [P-expl] 291 | [s+expl] 292 | [=P] 293 | [=I++expl] 294 | [o+expl] 295 | [=O+expl] 296 | [CH2expl] 297 | [=S+expl] 298 | [I+expl] 299 | [IHexpl] 300 | [CHexpl] 301 | [-n+expl] 302 | [=CHexpl] 303 | [=O-expl] 304 | [c-expl] 305 | [S-expl] 306 | [p] 307 | [Nexpl] 308 | [Cexpl] 309 | [=P+expl] 310 | [n-expl] 311 | [cH-expl] 312 | [B-expl] 313 | [Expl-Ring2] 314 | [C+expl] 315 | [c+expl] 316 | [=n+expl] 317 | [NH-expl] 318 | [NH2+expl] 319 | [Expl/Ring2] 320 | [Expl=Ring1] 321 | [Cl-expl] 322 | [Na+expl] 323 | [Hexpl] 324 | [NH4+expl] 325 | [Hgexpl] 326 | [\O] 327 | [Br-expl] 328 | [N@expl] 329 | [Ca++expl] 330 | [Snexpl] 331 | [I-expl] 332 | [Co+expl] 333 | [N@@+expl] 334 | [K+expl] 335 | [Fe--expl] 336 | [\Hexpl] 337 | [N@+expl] 338 | [Fe+3expl] 339 | [Gd+3expl] 340 | [/N+expl] 341 | [NH+expl] 342 | [=NH+expl] 343 | [Zn++expl] 344 | [/Br] 345 | [/Cl] 346 | [/C@@Hexpl] 347 | [\N+expl] 348 | [NH3+expl] 349 | [Alexpl] 350 | [Hg++expl] 351 | [Cu++expl] 352 | [Znexpl] 353 | [Au-expl] 354 | [Auexpl] 355 | [Crexpl] 356 | [Cd++expl] 357 | [Cdexpl] 358 | [Siexpl] 359 | [Sbexpl] 360 | [Seexpl] 361 | [=Seexpl] 362 | [Cuexpl] 363 | [Li+expl] 364 | [Tl+expl] 365 | [Biexpl] 366 | [Inexpl] 367 | [/Hexpl] 368 | [Caexpl] 369 | [Dyexpl] 370 | [Co++expl] 371 | [Cr+3expl] 372 | [Fe++expl] 373 | [Pt-2expl] 374 | [Sb+3expl] 375 | [Be++expl] 376 | [Mg++expl] 377 | [Tiexpl] 378 | [Fe-expl] 379 | [Ndexpl] 380 | [Pdexpl] 
381 | [#Inexpl] 382 | [Ba++expl] 383 | [H+expl] 384 | [Mn+expl] 385 | [Mn++expl] 386 | [SiHexpl] 387 | [\Cl] 388 | [Ni++expl] 389 | [Zrexpl] 390 | [Niexpl] 391 | [PbH2++expl] 392 | [Ybexpl] 393 | [Naexpl] 394 | [=Moexpl] 395 | [=Cdexpl] 396 | [Cu+expl] 397 | [Geexpl] 398 | [Baexpl] 399 | [=Crexpl] 400 | [Cr++expl] 401 | [OH-expl] 402 | [SnH2++expl] 403 | [Mg+2expl] 404 | [=Siexpl] 405 | [\Br] 406 | [\C@expl] 407 | [Vexpl] 408 | [Ag+expl] 409 | [\C@@expl] 410 | [Pt+2expl] 411 | [2Hexpl] 412 | [Ti++expl] 413 | [Sr++expl] 414 | [=Auexpl] 415 | [Ruexpl] 416 | [\O-expl] 417 | [P@expl] 418 | [Liexpl] 419 | [/C@@expl] 420 | [As+expl] 421 | [\Siexpl] 422 | [/Alexpl] 423 | [\O+expl] 424 | [/Crexpl] 425 | [/Feexpl] 426 | [Euexpl] 427 | [Scexpl] 428 | [Zn+2expl] 429 | [Ca+2expl] 430 | [Hg+2expl] 431 | [=Zrexpl] 432 | [nH+expl] 433 | [Cl+3expl] 434 | [Ba+2expl] 435 | [TlH2+expl] 436 | [Fe+2expl] 437 | [AlH3expl] 438 | [=PHexpl] 439 | [Co+2expl] 440 | [Cu+2expl] 441 | [PbH2+2expl] 442 | [\s] 443 | [Ni+2expl] 444 | [Cd+2expl] 445 | [SnH2+2expl] 446 | [Ti+2expl] 447 | [PHexpl] 448 | [Mn+2expl] 449 | [Sr+2expl] 450 | [Be+2expl] 451 | [seexpl] 452 | [Cr+2expl] 453 | [=Biexpl] 454 | [=C-expl] 455 | [SbH6+3expl] 456 | [\n] 457 | [Fe-2expl] 458 | [=OH+expl] 459 | [-c-expl] 460 | [/s] 461 | [=NH2+expl] 462 | [#S+expl] 463 | [/F] 464 | [F-expl] 465 | [SH-expl] 466 | [CH+expl] 467 | [\NH+expl] 468 | [\CH-expl] 469 | [\c-expl] 470 | [/o] 471 | [CH2-expl] 472 | [\N-expl] 473 | [/n-expl] 474 | [\C-expl] 475 | [/NH+expl] 476 | [/N-expl] 477 | [\F] 478 | [Gd-4expl] 479 | [Gd-5expl] 480 | [N@@H+expl] 481 | [SiH3expl] 482 | [Branch3_3] 483 | [Sexpl] 484 | [\I] 485 | [BiH3expl] 486 | [SeHexpl] 487 | [SiH2expl] 488 | [Feexpl] 489 | [S@expl] 490 | [\P] 491 | [/nHexpl] 492 | [SH+expl] 493 | [-oexpl] 494 | [-sexpl] 495 | [Kexpl] 496 | [=S@@expl] 497 | [*expl] 498 | [CH2+expl] 499 | [S@+expl] 500 | [S@@+expl] 501 | [-nexpl] 502 | [P@@expl] 503 | [/I] 504 | [Reexpl] 505 | [=SH+expl] 506 | [/CH-expl] 507 | [\nHexpl] 508 | [=C@@expl] 509 | [N@H+expl] -------------------------------------------------------------------------------- /vocabs/reactions.txt: -------------------------------------------------------------------------------- 1 | [PAD] 2 | _-_ 3 | _+_ 4 | 5 | 6 | 7 | 8 | [Agexpl] 9 | 10 | 11 | 12 | [UNK] 13 | [CLS] 14 | [SEP] 15 | [MASK] 16 | | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | _._ 27 | _0_-0_ 28 | _0_-1_ 29 | _0_-2_ 30 | _0_-3_ 31 | _0_-4_ 32 | _0_-5_ 33 | _0_0_ 34 | _0_1_ 35 | _0_2_ 36 | _0_3_ 37 | _0_4_ 38 | _0_5_ 39 | _1_-0_ 40 | _1_-1_ 41 | _1_-2_ 42 | _1_-3_ 43 | _1_-4_ 44 | _1_-5_ 45 | _1_-6_ 46 | _1_0_ 47 | _1_1_ 48 | _1_2_ 49 | _1_3_ 50 | _1_4_ 51 | _1_5_ 52 | _2_-0_ 53 | _2_-1_ 54 | _2_-2_ 55 | _2_-3_ 56 | _2_-4_ 57 | _2_-5_ 58 | _2_-6_ 59 | _2_0_ 60 | _2_1_ 61 | _2_2_ 62 | _2_3_ 63 | _2_4_ 64 | _2_5_ 65 | _3_-0_ 66 | _3_-1_ 67 | _3_-2_ 68 | _3_-3_ 69 | _3_-4_ 70 | _3_-5_ 71 | _3_-6_ 72 | _3_0_ 73 | _3_1_ 74 | _3_2_ 75 | _3_3_ 76 | _3_4_ 77 | _3_5_ 78 | _4_-0_ 79 | _4_-1_ 80 | _4_-2_ 81 | _4_-3_ 82 | _4_-4_ 83 | _4_-5_ 84 | _4_-6_ 85 | _4_0_ 86 | _4_1_ 87 | _4_2_ 88 | _4_3_ 89 | _4_4_ 90 | _4_5_ 91 | _5_-0_ 92 | _5_-1_ 93 | _5_-2_ 94 | _5_-3_ 95 | _5_-4_ 96 | _5_-5_ 97 | _5_-6_ 98 | _5_0_ 99 | _5_1_ 100 | _5_2_ 101 | _5_3_ 102 | _5_4_ 103 | _5_5_ 104 | _6_-0_ 105 | _6_-1_ 106 | _6_-2_ 107 | _6_-3_ 108 | _6_-4_ 109 | _6_-5_ 110 | _6_-6_ 111 | _6_0_ 112 | _6_1_ 113 | _6_2_ 114 | _6_3_ 115 | _6_4_ 116 | _6_5_ 117 | _7_-0_ 118 | _7_-1_ 119 | _7_-2_ 120 | _7_-3_ 121 | _7_-4_ 122 | _7_-5_ 123 | _7_-6_ 
124 | _7_0_ 125 | _7_1_ 126 | _7_2_ 127 | _7_3_ 128 | _7_4_ 129 | _7_5_ 130 | _8_-0_ 131 | _8_-1_ 132 | _8_-2_ 133 | _8_-3_ 134 | _8_-4_ 135 | _8_-5_ 136 | _8_-6_ 137 | _8_0_ 138 | _8_1_ 139 | _8_2_ 140 | _8_3_ 141 | _8_4_ 142 | _8_5_ 143 | _9_-0_ 144 | _9_-1_ 145 | _9_-2_ 146 | _9_-3_ 147 | _9_-4_ 148 | _9_-5_ 149 | _9_-6_ 150 | _9_0_ 151 | _9_1_ 152 | _9_2_ 153 | _9_3_ 154 | _9_4_ 155 | _9_5_ 156 | [Branch2_1] 157 | [=O] 158 | [epsilon] 159 | [Ring1] 160 | [=C] 161 | [Ring2] 162 | [Branch1_3] 163 | [N] 164 | [Branch1_1] 165 | [C] 166 | [=N] 167 | [Branch2_3] 168 | [Branch1_2] 169 | [#N] 170 | [Br] 171 | [O] 172 | [Branch2_2] 173 | [F] 174 | [S] 175 | [=S] 176 | [#C] 177 | [Cl] 178 | [O-expl] 179 | [N+expl] 180 | [P] 181 | [.] 182 | [I] 183 | [c] 184 | [-c] 185 | [s] 186 | [nHexpl] 187 | [\c] 188 | [n] 189 | [\C] 190 | [o] 191 | [C@@Hexpl] 192 | [C@expl] 193 | [C@@expl] 194 | [C@Hexpl] 195 | [/C] 196 | [/c] 197 | [Ptexpl] 198 | [\N] 199 | [\C@@Hexpl] 200 | [/C@Hexpl] 201 | [\C@Hexpl] 202 | [-n] 203 | [=c] 204 | [B] 205 | [\S] 206 | [/n] 207 | [=N+expl] 208 | [Expl\Ring2] 209 | [Expl/Ring1] 210 | [n+expl] 211 | [Expl\Ring1] 212 | [Asexpl] 213 | [N@@expl] 214 | [S@@expl] 215 | [/O] 216 | [Expl-Ring1] 217 | [/N] 218 | [S+expl] 219 | [/S] 220 | [Pexpl] 221 | [=Nexpl] 222 | [#O+expl] 223 | [C-expl] 224 | [Iexpl] 225 | [O+expl] 226 | [Brexpl] 227 | [Clexpl] 228 | [=N-expl] 229 | [N-expl] 230 | [P+expl] 231 | [Oexpl] 232 | [#C-expl] 233 | [=Oexpl] 234 | [#N+expl] 235 | [=Iexpl] 236 | [CH-expl] 237 | [P-expl] 238 | [s+expl] 239 | [=P] 240 | [=I++expl] 241 | [o+expl] 242 | [=O+expl] 243 | [CH2expl] 244 | [=S+expl] 245 | [I+expl] 246 | [IHexpl] 247 | [CHexpl] 248 | [-n+expl] 249 | [=CHexpl] 250 | [=O-expl] 251 | [c-expl] 252 | [S-expl] 253 | [p] 254 | [Nexpl] 255 | [Cexpl] 256 | [=P+expl] 257 | [n-expl] 258 | [cH-expl] 259 | [B-expl] 260 | [Expl-Ring2] 261 | [C+expl] 262 | [c+expl] 263 | [=n+expl] 264 | [NH-expl] 265 | [NH2+expl] 266 | [Expl/Ring2] 267 | [Expl=Ring1] 268 | [Cl-expl] 269 | [Na+expl] 270 | [Hexpl] 271 | [NH4+expl] 272 | [Hgexpl] 273 | [\O] 274 | [Br-expl] 275 | [N@expl] 276 | [Ca++expl] 277 | [Snexpl] 278 | [I-expl] 279 | [Co+expl] 280 | [N@@+expl] 281 | [K+expl] 282 | [Fe--expl] 283 | [\Hexpl] 284 | [N@+expl] 285 | [Fe+3expl] 286 | [Gd+3expl] 287 | [/N+expl] 288 | [NH+expl] 289 | [=NH+expl] 290 | [Zn++expl] 291 | [/Br] 292 | [/Cl] 293 | [/C@@Hexpl] 294 | [\N+expl] 295 | [NH3+expl] 296 | [Alexpl] 297 | [Hg++expl] 298 | [Cu++expl] 299 | [Znexpl] 300 | [Au-expl] 301 | [Auexpl] 302 | [Crexpl] 303 | [Cd++expl] 304 | [Cdexpl] 305 | [Siexpl] 306 | [Sbexpl] 307 | [Seexpl] 308 | [=Seexpl] 309 | [Cuexpl] 310 | [Li+expl] 311 | [Tl+expl] 312 | [Biexpl] 313 | [Inexpl] 314 | [/Hexpl] 315 | [Caexpl] 316 | [Dyexpl] 317 | [Co++expl] 318 | [Cr+3expl] 319 | [Fe++expl] 320 | [Pt-2expl] 321 | [Sb+3expl] 322 | [Be++expl] 323 | [Mg++expl] 324 | [Tiexpl] 325 | [Fe-expl] 326 | [Ndexpl] 327 | [Pdexpl] 328 | [#Inexpl] 329 | [Ba++expl] 330 | [H+expl] 331 | [Mn+expl] 332 | [Mn++expl] 333 | [SiHexpl] 334 | [\Cl] 335 | [Ni++expl] 336 | [Zrexpl] 337 | [Niexpl] 338 | [PbH2++expl] 339 | [Ybexpl] 340 | [Naexpl] 341 | [=Moexpl] 342 | [=Cdexpl] 343 | [Cu+expl] 344 | [Geexpl] 345 | [Baexpl] 346 | [=Crexpl] 347 | [Cr++expl] 348 | [OH-expl] 349 | [SnH2++expl] 350 | [Mg+2expl] 351 | [=Siexpl] 352 | [\Br] 353 | [\C@expl] 354 | [Vexpl] 355 | [Ag+expl] 356 | [\C@@expl] 357 | [Pt+2expl] 358 | [2Hexpl] 359 | [Ti++expl] 360 | [Sr++expl] 361 | [=Auexpl] 362 | [Ruexpl] 363 | [\O-expl] 364 | [P@expl] 365 | [Liexpl] 366 | [/C@@expl] 
367 | [As+expl] 368 | [\Siexpl] 369 | [/Alexpl] 370 | [\O+expl] 371 | [/Crexpl] 372 | [/Feexpl] 373 | [Euexpl] 374 | [Scexpl] 375 | [Zn+2expl] 376 | [Ca+2expl] 377 | [Hg+2expl] 378 | [=Zrexpl] 379 | [nH+expl] 380 | [Cl+3expl] 381 | [Ba+2expl] 382 | [TlH2+expl] 383 | [Fe+2expl] 384 | [AlH3expl] 385 | [=PHexpl] 386 | [Co+2expl] 387 | [Cu+2expl] 388 | [PbH2+2expl] 389 | [\s] 390 | [Ni+2expl] 391 | [Cd+2expl] 392 | [SnH2+2expl] 393 | [Ti+2expl] 394 | [PHexpl] 395 | [Mn+2expl] 396 | [Sr+2expl] 397 | [Be+2expl] 398 | [seexpl] 399 | [Cr+2expl] 400 | [=Biexpl] 401 | [=C-expl] 402 | [SbH6+3expl] 403 | [\n] 404 | [Fe-2expl] 405 | [=OH+expl] 406 | [-c-expl] 407 | [/s] 408 | [=NH2+expl] 409 | [#S+expl] 410 | [/F] 411 | [F-expl] 412 | [SH-expl] 413 | [CH+expl] 414 | [\NH+expl] 415 | [\CH-expl] 416 | [\c-expl] 417 | [/o] 418 | [CH2-expl] 419 | [\N-expl] 420 | [/n-expl] 421 | [\C-expl] 422 | [/NH+expl] 423 | [/N-expl] 424 | [\F] 425 | [Gd-4expl] 426 | [Gd-5expl] 427 | [N@@H+expl] 428 | [SiH3expl] 429 | [Branch3_3] 430 | [Sexpl] 431 | [\I] 432 | [BiH3expl] 433 | [SeHexpl] 434 | [SiH2expl] 435 | [Feexpl] 436 | [S@expl] 437 | [\P] 438 | [/nHexpl] 439 | [SH+expl] 440 | [-oexpl] 441 | [-sexpl] 442 | [Kexpl] 443 | [=S@@expl] 444 | [*expl] 445 | [CH2+expl] 446 | [S@+expl] 447 | [S@@+expl] 448 | [-nexpl] 449 | [P@@expl] 450 | [/I] 451 | [Reexpl] 452 | [=SH+expl] 453 | [/CH-expl] 454 | [\nHexpl] 455 | [=C@@expl] 456 | [N@H+expl] 457 | [Teexpl] 458 | [Osexpl] 459 | [=Ru-expl] 460 | [Re-expl] 461 | [Zn+expl] 462 | [nexpl] 463 | [Mnexpl] 464 | [BH-expl] 465 | [\NH-expl] 466 | [BH3-expl] 467 | [11CH3expl] 468 | [/O-expl] 469 | [PH+expl] 470 | [Wexpl] 471 | [OH+expl] 472 | [/P] 473 | [Nb--expl] 474 | [Pt--expl] 475 | [Fe-3expl] 476 | [Al-3expl] 477 | [Cu-expl] 478 | [Ag-expl] 479 | [As-expl] 480 | [Pd--expl] 481 | [Se-expl] 482 | [cexpl] 483 | [3Hexpl] 484 | [\B] 485 | [Ring3] 486 | [=B] 487 | [Coexpl] 488 | [BH2-expl] 489 | [125Iexpl] 490 | [18Fexpl] 491 | [=CH+expl] 492 | [/C@expl] 493 | [=Ruexpl] 494 | [\n+expl] 495 | [\CH+expl] 496 | [Moexpl] 497 | [cH+expl] 498 | 499 | 500 | 501 | 502 | [NHexpl] 503 | [Expl=Ring2] 504 | . 
505 | [Branch3_1] 506 | [Branch3_2] 507 | [Expl=Ring3] 508 | [Pr+3expl] 509 | [=Pbexpl] 510 | [\NHexpl] 511 | [=Ag+expl] 512 | [P+3expl] 513 | [=Reexpl] 514 | [Pt-expl] 515 | [=V+3expl] 516 | [SH2+expl] 517 | [=Tiexpl] 518 | [Ag+2expl] 519 | [U+6expl] 520 | [=SH2expl] 521 | [PH2+expl] 522 | [Sm+2expl] 523 | [Hf+3expl] 524 | [=PH3expl] 525 | [=Mo+2expl] 526 | [Y+3expl] 527 | [=V+2expl] 528 | [Ga+3expl] 529 | [=SiHexpl] 530 | [\S@@expl] 531 | [Fe+5expl] 532 | [Cr+4expl] 533 | [=Se+expl] 534 | [SeH+expl] 535 | [#Sbexpl] 536 | [Fe+4expl] 537 | [PH2-expl] 538 | [Ru+2expl] 539 | [=Al-expl] 540 | [AlH-expl] 541 | [Zr+4expl] 542 | [Ag+3expl] 543 | [#Si+expl] 544 | [=Zr+2expl] 545 | [Hf+4expl] 546 | [=Mnexpl] 547 | [Ceexpl] 548 | [=PH2expl] 549 | [SH2expl] 550 | [=As+3expl] 551 | [AsH2expl] 552 | [Ce+3expl] 553 | [I+3expl] 554 | [=Pd-2expl] 555 | [Taexpl] 556 | [131Iexpl] 557 | [32PH3expl] 558 | [Sn+2expl] 559 | [Nb+5expl] 560 | [=Agexpl] 561 | [=Sbexpl] 562 | [Ga+2expl] 563 | [=Bi+expl] 564 | [SnHexpl] 565 | [=Au-expl] 566 | [Bi+2expl] 567 | [Br+2expl] 568 | [=Niexpl] 569 | [229Thexpl] 570 | [P+2expl] 571 | [=Hfexpl] 572 | [Ti+6expl] 573 | [PH2expl] 574 | [11CH4expl] 575 | [V+5expl] 576 | [Ta+2expl] 577 | [Cd+expl] 578 | [Ir+3expl] 579 | [=Pd-3expl] 580 | [/Siexpl] 581 | [=SiH2expl] 582 | [BH4-expl] 583 | [=Ptexpl] 584 | [/S@expl] 585 | [=Geexpl] 586 | [GeH3expl] 587 | [=Cuexpl] 588 | [Al+3expl] 589 | [HeHexpl] 590 | [=Vexpl] 591 | [Ru+expl] 592 | [Fe+expl] 593 | [/S-expl] 594 | [Zr+3expl] 595 | [PH3expl] 596 | [Pb+2expl] 597 | [Gaexpl] 598 | [Sb+expl] 599 | [=Teexpl] 600 | [Rh+4expl] 601 | [AsH+expl] 602 | [=Pt+3expl] 603 | [=I] 604 | [#CH3+3expl] 605 | [\S@expl] 606 | [=Re+5expl] 607 | [Sb-expl] 608 | [=Pd-expl] 609 | [Mn+4expl] 610 | [=Znexpl] 611 | [IH2expl] 612 | [33PH3expl] 613 | [=AsH2expl] 614 | [=Hgexpl] 615 | [Pd-expl] 616 | [Se+expl] 617 | [=Wexpl] 618 | [Si@Hexpl] 619 | [=Irexpl] 620 | [AlH2-expl] 621 | [Sn+6expl] 622 | [Cu+4expl] 623 | [TaH3expl] 624 | [TeHexpl] 625 | [Er+3expl] 626 | [CH3+expl] 627 | [P+5expl] 628 | [=Sb+expl] 629 | [BrH+expl] 630 | [Ga+expl] 631 | [CuH2-expl] 632 | [=Pdexpl] 633 | [=Au-2expl] 634 | [=C+expl] 635 | [Fe-4expl] 636 | [S-2expl] 637 | [Cs+expl] 638 | [Ti+5expl] 639 | [Co+3expl] 640 | [Mn+6expl] 641 | [AsH3expl] 642 | [ClH+expl] 643 | [SnH4expl] 644 | [60Coexpl] 645 | [Pd-4expl] 646 | [Au+3expl] 647 | [#B] 648 | [=Hf+2expl] 649 | [GeHexpl] 650 | [=Cu-2expl] 651 | [Ni+4expl] 652 | [=SHexpl] 653 | [SiH4expl] 654 | [13CH4expl] 655 | [P-3expl] 656 | [Pd+2expl] 657 | [=Coexpl] 658 | [13NH3expl] 659 | [=Al+expl] 660 | [Au+expl] 661 | [Tc+6expl] 662 | [=Alexpl] 663 | [SH3+expl] 664 | [Mn+5expl] 665 | [PH4+expl] 666 | [\Mgexpl] 667 | [/B-expl] 668 | [Rh+expl] 669 | [Heexpl] 670 | [Ni+3expl] 671 | [SnH3expl] 672 | [Sm+3expl] 673 | [Tlexpl] 674 | [Smexpl] 675 | [Al-expl] 676 | [In+3expl] 677 | [Arexpl] 678 | [Sn+3expl] 679 | [SeH-expl] 680 | [V+expl] 681 | [Nd+expl] 682 | [=SnH2expl] 683 | [=Thexpl] 684 | [Pt+expl] 685 | [Cu+3expl] 686 | [I+2expl] 687 | [=Re+expl] 688 | [CH3-expl] 689 | [=Pt-expl] 690 | [Xeexpl] 691 | [GeH2expl] 692 | [18FHexpl] 693 | [IH2+expl] 694 | [Cl+expl] 695 | [Br+expl] 696 | [=Ti+2expl] 697 | [Sn+5expl] 698 | [=S@expl] 699 | [=99Tcexpl] 700 | [Si@@Hexpl] 701 | [Sc+3expl] 702 | [Cr-expl] 703 | [Yexpl] 704 | [13Cexpl] 705 | [Mn+3expl] 706 | [PH5expl] 707 | [Ir+expl] 708 | [Ti+3expl] 709 | [Hfexpl] 710 | [Ir-4expl] 711 | [/B] 712 | [Eu+3expl] 713 | [AlHexpl] 714 | [Ti+expl] 715 | >> 716 | [Csexpl] 717 | [=Rhexpl] 718 | [Thexpl] 719 | [/S+expl] 
720 | [=Feexpl] 721 | [Al+2expl] 722 | [/NHexpl] 723 | [Al+expl] 724 | [=Ceexpl] 725 | [Neexpl] 726 | [=Mgexpl] 727 | [Ni+expl] 728 | [Mg+expl] 729 | [Mo+4expl] 730 | [=Se-expl] 731 | [Rhexpl] 732 | [Sb+5expl] 733 | [B+expl] 734 | [Zr+2expl] 735 | [N+3expl] 736 | [Ce+2expl] 737 | [=I+3expl] 738 | [MgHexpl] 739 | [Yb+2expl] 740 | [OHexpl] 741 | [Mgexpl] 742 | [Pd+expl] 743 | [Pb+3expl] 744 | [Si-expl] 745 | [AlH2+expl] 746 | [=Ag-expl] 747 | [Laexpl] 748 | [Sn+4expl] 749 | [=U+2expl] 750 | [=P+3expl] 751 | [Si@@expl] 752 | [U+4expl] 753 | [Srexpl] 754 | [Bi+3expl] 755 | [Rh+3expl] 756 | [B+3expl] 757 | [#Wexpl] 758 | [=Osexpl] 759 | [=W-2expl] 760 | [=Snexpl] 761 | [Nbexpl] 762 | [Tbexpl] 763 | [Rh+2expl] 764 | [Bexpl] 765 | [=Ru-2expl] 766 | [Gdexpl] 767 | [#Tiexpl] 768 | [99Tcexpl] 769 | [=Ru-4expl] 770 | [Hf+2expl] 771 | [Fe+6expl] 772 | [Th+4expl] 773 | [C+4expl] 774 | [1HHexpl] 775 | [Tb+3expl] 776 | [/S@@expl] 777 | [In+expl] 778 | [Expl#Ring2] 779 | [Ti+4expl] 780 | [=Rh+expl] 781 | [Si+2expl] 782 | [Re+5expl] 783 | [Dy+3expl] 784 | [Expl#Ring1] 785 | [Irexpl] 786 | [/Snexpl] 787 | [Se-2expl] 788 | [=Cu-expl] 789 | [=Taexpl] 790 | [32Pexpl] 791 | [W+6expl] 792 | [Ta+5expl] 793 | [=Ag-2expl] 794 | [Si@expl] 795 | [=CH2+expl] 796 | [Uexpl] 797 | [#S] 798 | [#Zrexpl] 799 | [Tl+2expl] 800 | [I+7expl] 801 | [Pd-3expl] 802 | [Pb+4expl] 803 | [B+2expl] 804 | [SnH2expl] 805 | [Tl+3expl] 806 | [Ru-2expl] 807 | [Pd+3expl] 808 | [Cr+6expl] 809 | [#Niexpl] 810 | [=SeH-expl] 811 | [V+2expl] 812 | [V+3expl] 813 | [\SHexpl] 814 | [AlH4-expl] 815 | [OH2+expl] 816 | [H-expl] 817 | [O-2expl] 818 | [\Snexpl] 819 | [AsH4+expl] 820 | [=IH2expl] 821 | [NiH6-5expl] 822 | [Rb+expl] 823 | [As+3expl] 824 | [SiH-expl] 825 | [SHexpl] 826 | [La+3expl] 827 | [Yb+3expl] 828 | [2H-expl] 829 | [Pbexpl] 830 | [Prexpl] 831 | [Sb+2expl] 832 | [IH+expl] 833 | [Ni-expl] 834 | [PdH2expl] 835 | [Hg+expl] 836 | [PH4expl] 837 | [Nd+3expl] 838 | [15OH2expl] 839 | [V+4expl] 840 | [123I-expl] 841 | [Rh-expl] 842 | [14Cexpl] 843 | [PH3+expl] 844 | [Si+4expl] 845 | [Pt+4expl] 846 | [AsHexpl] 847 | [=Asexpl] 848 | [Ge+3expl] 849 | [Ce+4expl] 850 | [Pd-2expl] 851 | [Ru-expl] 852 | [Zr+expl] 853 | [Cl+2expl] 854 | [NH2-expl] 855 | [99Tc+4expl] 856 | [OH3+expl] 857 | [Sn+expl] 858 | [Ru+3expl] 859 | [=Tcexpl] 860 | [/Mgexpl] 861 | [Hg-2expl] -------------------------------------------------------------------------------- /vocabs/smallmolecules.txt: -------------------------------------------------------------------------------- 1 | [PAD] 2 | _-_ 3 | _+_ 4 | 5 | 6 | 7 | 8 | [Agexpl] 9 | 10 | 11 | [unused10] 12 | [UNK] 13 | [CLS] 14 | [SEP] 15 | [MASK] 16 | | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | _._ 27 | _0_-0_ 28 | _0_-1_ 29 | _0_-2_ 30 | _0_-3_ 31 | _0_-4_ 32 | _0_-5_ 33 | _0_0_ 34 | _0_1_ 35 | _0_2_ 36 | _0_3_ 37 | _0_4_ 38 | _0_5_ 39 | _1_-0_ 40 | _1_-1_ 41 | _1_-2_ 42 | _1_-3_ 43 | _1_-4_ 44 | _1_-5_ 45 | _1_-6_ 46 | _1_0_ 47 | _1_1_ 48 | _1_2_ 49 | _1_3_ 50 | _1_4_ 51 | _1_5_ 52 | _2_-0_ 53 | _2_-1_ 54 | _2_-2_ 55 | _2_-3_ 56 | _2_-4_ 57 | _2_-5_ 58 | _2_-6_ 59 | _2_0_ 60 | _2_1_ 61 | _2_2_ 62 | _2_3_ 63 | _2_4_ 64 | _2_5_ 65 | _3_-0_ 66 | _3_-1_ 67 | _3_-2_ 68 | _3_-3_ 69 | _3_-4_ 70 | _3_-5_ 71 | _3_-6_ 72 | _3_0_ 73 | _3_1_ 74 | _3_2_ 75 | _3_3_ 76 | _3_4_ 77 | _3_5_ 78 | _4_-0_ 79 | _4_-1_ 80 | _4_-2_ 81 | _4_-3_ 82 | _4_-4_ 83 | _4_-5_ 84 | _4_-6_ 85 | _4_0_ 86 | _4_1_ 87 | _4_2_ 88 | _4_3_ 89 | _4_4_ 90 | _4_5_ 91 | _5_-0_ 92 | _5_-1_ 93 | _5_-2_ 94 | _5_-3_ 95 | _5_-4_ 96 | _5_-5_ 97 | _5_-6_ 98 | _5_0_ 99 | 
_5_1_ 100 | _5_2_ 101 | _5_3_ 102 | _5_4_ 103 | _5_5_ 104 | _6_-0_ 105 | _6_-1_ 106 | _6_-2_ 107 | _6_-3_ 108 | _6_-4_ 109 | _6_-5_ 110 | _6_-6_ 111 | _6_0_ 112 | _6_1_ 113 | _6_2_ 114 | _6_3_ 115 | _6_4_ 116 | _6_5_ 117 | _7_-0_ 118 | _7_-1_ 119 | _7_-2_ 120 | _7_-3_ 121 | _7_-4_ 122 | _7_-5_ 123 | _7_-6_ 124 | _7_0_ 125 | _7_1_ 126 | _7_2_ 127 | _7_3_ 128 | _7_4_ 129 | _7_5_ 130 | _8_-0_ 131 | _8_-1_ 132 | _8_-2_ 133 | _8_-3_ 134 | _8_-4_ 135 | _8_-5_ 136 | _8_-6_ 137 | _8_0_ 138 | _8_1_ 139 | _8_2_ 140 | _8_3_ 141 | _8_4_ 142 | _8_5_ 143 | _9_-0_ 144 | _9_-1_ 145 | _9_-2_ 146 | _9_-3_ 147 | _9_-4_ 148 | _9_-5_ 149 | _9_-6_ 150 | _9_0_ 151 | _9_1_ 152 | _9_2_ 153 | _9_3_ 154 | _9_4_ 155 | _9_5_ 156 | [Branch2_1] 157 | [=O] 158 | [epsilon] 159 | [Ring1] 160 | [=C] 161 | [Ring2] 162 | [Branch1_3] 163 | [N] 164 | [Branch1_1] 165 | [C] 166 | [=N] 167 | [Branch2_3] 168 | [Branch1_2] 169 | [#N] 170 | [Br] 171 | [O] 172 | [Branch2_2] 173 | [F] 174 | [S] 175 | [=S] 176 | [#C] 177 | [Cl] 178 | [O-expl] 179 | [N+expl] 180 | [P] 181 | [.] 182 | [I] 183 | [c] 184 | [-c] 185 | [s] 186 | [nHexpl] 187 | [\c] 188 | [n] 189 | [\C] 190 | [o] 191 | [C@@Hexpl] 192 | [C@expl] 193 | [C@@expl] 194 | [C@Hexpl] 195 | [/C] 196 | [/c] 197 | [Ptexpl] 198 | [\N] 199 | [\C@@Hexpl] 200 | [/C@Hexpl] 201 | [\C@Hexpl] 202 | [-n] 203 | [=c] 204 | [B] 205 | [\S] 206 | [/n] 207 | [=N+expl] 208 | [Expl\Ring2] 209 | [Expl/Ring1] 210 | [n+expl] 211 | [Expl\Ring1] 212 | [Asexpl] 213 | [N@@expl] 214 | [S@@expl] 215 | [/O] 216 | [Expl-Ring1] 217 | [/N] 218 | [S+expl] 219 | [/S] 220 | [Pexpl] 221 | [=Nexpl] 222 | [#O+expl] 223 | [C-expl] 224 | [Iexpl] 225 | [O+expl] 226 | [Brexpl] 227 | [Clexpl] 228 | [=N-expl] 229 | [N-expl] 230 | [P+expl] 231 | [Oexpl] 232 | [#C-expl] 233 | [=Oexpl] 234 | [#N+expl] 235 | [=Iexpl] 236 | [CH-expl] 237 | [P-expl] 238 | [s+expl] 239 | [=P] 240 | [=I++expl] 241 | [o+expl] 242 | [=O+expl] 243 | [CH2expl] 244 | [=S+expl] 245 | [I+expl] 246 | [IHexpl] 247 | [CHexpl] 248 | [-n+expl] 249 | [=CHexpl] 250 | [=O-expl] 251 | [c-expl] 252 | [S-expl] 253 | [p] 254 | [Nexpl] 255 | [Cexpl] 256 | [=P+expl] 257 | [n-expl] 258 | [cH-expl] 259 | [B-expl] 260 | [Expl-Ring2] 261 | [C+expl] 262 | [c+expl] 263 | [=n+expl] 264 | [NH-expl] 265 | [NH2+expl] 266 | [Expl/Ring2] 267 | [Expl=Ring1] 268 | [Cl-expl] 269 | [Na+expl] 270 | [Hexpl] 271 | [NH4+expl] 272 | [Hgexpl] 273 | [\O] 274 | [Br-expl] 275 | [N@expl] 276 | [Ca++expl] 277 | [Snexpl] 278 | [I-expl] 279 | [Co+expl] 280 | [N@@+expl] 281 | [K+expl] 282 | [Fe--expl] 283 | [\Hexpl] 284 | [N@+expl] 285 | [Fe+3expl] 286 | [Gd+3expl] 287 | [/N+expl] 288 | [NH+expl] 289 | [=NH+expl] 290 | [Zn++expl] 291 | [/Br] 292 | [/Cl] 293 | [/C@@Hexpl] 294 | [\N+expl] 295 | [NH3+expl] 296 | [Alexpl] 297 | [Hg++expl] 298 | [Cu++expl] 299 | [Znexpl] 300 | [Au-expl] 301 | [Auexpl] 302 | [Crexpl] 303 | [Cd++expl] 304 | [Cdexpl] 305 | [Siexpl] 306 | [Sbexpl] 307 | [Seexpl] 308 | [=Seexpl] 309 | [Cuexpl] 310 | [Li+expl] 311 | [Tl+expl] 312 | [Biexpl] 313 | [Inexpl] 314 | [/Hexpl] 315 | [Caexpl] 316 | [Dyexpl] 317 | [Co++expl] 318 | [Cr+3expl] 319 | [Fe++expl] 320 | [Pt-2expl] 321 | [Sb+3expl] 322 | [Be++expl] 323 | [Mg++expl] 324 | [Tiexpl] 325 | [Fe-expl] 326 | [Ndexpl] 327 | [Pdexpl] 328 | [#Inexpl] 329 | [Ba++expl] 330 | [H+expl] 331 | [Mn+expl] 332 | [Mn++expl] 333 | [SiHexpl] 334 | [\Cl] 335 | [Ni++expl] 336 | [Zrexpl] 337 | [Niexpl] 338 | [PbH2++expl] 339 | [Ybexpl] 340 | [Naexpl] 341 | [=Moexpl] 342 | [=Cdexpl] 343 | [Cu+expl] 344 | [Geexpl] 345 | [Baexpl] 346 | [=Crexpl] 347 | 
[Cr++expl] 348 | [OH-expl] 349 | [SnH2++expl] 350 | [Mg+2expl] 351 | [=Siexpl] 352 | [\Br] 353 | [\C@expl] 354 | [Vexpl] 355 | [Ag+expl] 356 | [\C@@expl] 357 | [Pt+2expl] 358 | [2Hexpl] 359 | [Ti++expl] 360 | [Sr++expl] 361 | [=Auexpl] 362 | [Ruexpl] 363 | [\O-expl] 364 | [P@expl] 365 | [Liexpl] 366 | [/C@@expl] 367 | [As+expl] 368 | [\Siexpl] 369 | [/Alexpl] 370 | [\O+expl] 371 | [/Crexpl] 372 | [/Feexpl] 373 | [Euexpl] 374 | [Scexpl] 375 | [Zn+2expl] 376 | [Ca+2expl] 377 | [Hg+2expl] 378 | [=Zrexpl] 379 | [nH+expl] 380 | [Cl+3expl] 381 | [Ba+2expl] 382 | [TlH2+expl] 383 | [Fe+2expl] 384 | [AlH3expl] 385 | [=PHexpl] 386 | [Co+2expl] 387 | [Cu+2expl] 388 | [PbH2+2expl] 389 | [\s] 390 | [Ni+2expl] 391 | [Cd+2expl] 392 | [SnH2+2expl] 393 | [Ti+2expl] 394 | [PHexpl] 395 | [Mn+2expl] 396 | [Sr+2expl] 397 | [Be+2expl] 398 | [seexpl] 399 | [Cr+2expl] 400 | [=Biexpl] 401 | [=C-expl] 402 | [SbH6+3expl] 403 | [\n] 404 | [Fe-2expl] 405 | [=OH+expl] 406 | [-c-expl] 407 | [/s] 408 | [=NH2+expl] 409 | [#S+expl] 410 | [/F] 411 | [F-expl] 412 | [SH-expl] 413 | [CH+expl] 414 | [\NH+expl] 415 | [\CH-expl] 416 | [\c-expl] 417 | [/o] 418 | [CH2-expl] 419 | [\N-expl] 420 | [/n-expl] 421 | [\C-expl] 422 | [/NH+expl] 423 | [/N-expl] 424 | [\F] 425 | [Gd-4expl] 426 | [Gd-5expl] 427 | [N@@H+expl] 428 | [SiH3expl] 429 | [Branch3_3] 430 | [Sexpl] 431 | [\I] 432 | [BiH3expl] 433 | [SeHexpl] 434 | [SiH2expl] 435 | [Feexpl] 436 | [S@expl] 437 | [\P] 438 | [/nHexpl] 439 | [SH+expl] 440 | [-oexpl] 441 | [-sexpl] 442 | [Kexpl] 443 | [=S@@expl] 444 | [*expl] 445 | [CH2+expl] 446 | [S@+expl] 447 | [S@@+expl] 448 | [-nexpl] 449 | [P@@expl] 450 | [/I] 451 | [Reexpl] 452 | [=SH+expl] 453 | [/CH-expl] 454 | [\nHexpl] 455 | [=C@@expl] 456 | [N@H+expl] 457 | [Teexpl] 458 | [Osexpl] 459 | [=Ru-expl] 460 | [Re-expl] 461 | [Zn+expl] 462 | [nexpl] 463 | [Mnexpl] 464 | [BH-expl] 465 | [\NH-expl] 466 | [BH3-expl] 467 | [11CH3expl] 468 | [/O-expl] 469 | [PH+expl] 470 | [Wexpl] 471 | [OH+expl] 472 | [/P] 473 | [Nb--expl] 474 | [Pt--expl] 475 | [Fe-3expl] 476 | [Al-3expl] 477 | [Cu-expl] 478 | [Ag-expl] 479 | [As-expl] 480 | [Pd--expl] 481 | [Se-expl] 482 | [cexpl] 483 | [3Hexpl] 484 | [\B] 485 | [Ring3] 486 | [=B] 487 | [Coexpl] 488 | [BH2-expl] 489 | [125Iexpl] 490 | [18Fexpl] 491 | [=CH+expl] 492 | [/C@expl] 493 | [=Ruexpl] 494 | [\n+expl] 495 | [\CH+expl] 496 | [Moexpl] 497 | [cH+expl] 498 | 499 | 500 | 501 | 502 | [NHexpl] 503 | [Expl=Ring2] 504 | . 505 | [Branch3_1] 506 | [Branch3_2] 507 | [Expl=Ring3] --------------------------------------------------------------------------------
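The three vocabularies above share the same numerical sub-vocabulary: one token per digit and decimal place, written `_<digit>_<place>_`, plus `_._` (and, for the reaction and small-molecule vocabularies, the `_-_`/`_+_` sign tokens). As a sketch for illustration, the block can be regenerated in a few lines that match the listings above:

# Rebuild the numerical token block that appears in all three vocab files.
numeric_tokens = ["_._"]
for digit in range(10):
    negative_places = 6 if digit == 0 else 7   # digit 0 stops at place -5, the others at -6
    numeric_tokens += [f"_{digit}_-{p}_" for p in range(negative_places)]
    numeric_tokens += [f"_{digit}_{p}_" for p in range(6)]
print(len(numeric_tokens))  # 130 tokens: digits 0-9 across decimal places from 1e-6 up to 1e5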