├── model └── .gitkeep ├── Img2Mol.png ├── benchmark_data ├── STAKER_map.pkl ├── Img2Mol_map.pkl └── README.md ├── examples ├── digital_example1.png ├── digital_example2.png ├── handwritten_example1.png └── handwritten_example2.jpg ├── environment.yml ├── environment.local-cddd.yml ├── download_model.sh ├── img2mol ├── README.md ├── cddd_server.py ├── model.py └── inference.py ├── setup.py ├── .gitignore ├── README.md ├── LICENSE └── example_inference.ipynb /model/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Img2Mol.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayer-science-for-a-better-life/Img2Mol/HEAD/Img2Mol.png -------------------------------------------------------------------------------- /benchmark_data/STAKER_map.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayer-science-for-a-better-life/Img2Mol/HEAD/benchmark_data/STAKER_map.pkl -------------------------------------------------------------------------------- /examples/digital_example1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayer-science-for-a-better-life/Img2Mol/HEAD/examples/digital_example1.png -------------------------------------------------------------------------------- /examples/digital_example2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayer-science-for-a-better-life/Img2Mol/HEAD/examples/digital_example2.png -------------------------------------------------------------------------------- /benchmark_data/Img2Mol_map.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bayer-science-for-a-better-life/Img2Mol/HEAD/benchmark_data/Img2Mol_map.pkl -------------------------------------------------------------------------------- /examples/handwritten_example1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayer-science-for-a-better-life/Img2Mol/HEAD/examples/handwritten_example1.png -------------------------------------------------------------------------------- /examples/handwritten_example2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayer-science-for-a-better-life/Img2Mol/HEAD/examples/handwritten_example2.jpg -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: img2mol 2 | 3 | channels: 4 | - rdkit 5 | - pytorch 6 | - anaconda 7 | - conda-forge 8 | - defaults 9 | dependencies: 10 | - python=3.8.5 11 | - pip=20.2.4 12 | - notebook=6.4.2 13 | - pillow=8.0.1 14 | - numpy=1.19.2 15 | - rdkit=2020.03.1 16 | - cudatoolkit=11.0 17 | - torchvision=0.8.0 18 | - torchaudio=0.7.0 19 | - pytorch=1.7.0 20 | - pytorch-lightning=1.0.8 -------------------------------------------------------------------------------- /environment.local-cddd.yml: -------------------------------------------------------------------------------- 1 | name: img2mol 2 | 3 | channels: 4 | - rdkit 5 | - pytorch 6 | - anaconda 7 | - conda-forge 8 | - defaults 9 | dependencies: 10 | - python=3.6 11 | - pip=20.2.4 12 | - pandas<=1.0.3 13 | - notebook=6.4.2 14 | - pillow=8.0.1 15 | - scikit-learn 16 | - rdkit=2020.03.1 17 | - cudatoolkit=11.0 18 | - torchvision=0.8.0 19 | - torchaudio=0.7.0 20 | - pytorch=1.7.0 21 | - pytorch-lightning=1.0.8 22 | - pip: 23 | - https://github.com/jrwnter/cddd/archive/refs/tags/1.0.tar.gz 24 | - tensorflow==1.10.1 25 | - 
tensorboard==1.15 26 | - numpy==1.19.2 27 | - . 28 | -------------------------------------------------------------------------------- /download_model.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | gURL=https://drive.google.com/file/d/1pk21r4Zzb9ZJkszJwP9SObTlfTaRMMtF/view?usp=sharing 4 | # match a run of 26 or more word characters or hyphens 5 | ggID=$(echo "$gURL" | egrep -o '(\w|-){26,}') 6 | # alternative, just hardcode the id 7 | ggID='1pk21r4Zzb9ZJkszJwP9SObTlfTaRMMtF' 8 | ggURL='https://drive.google.com/uc?export=download' 9 | 10 | curl -sc /tmp/gcokie "${ggURL}&id=${ggID}" >/dev/null 11 | getcode="$(awk '/_warning_/ {print $NF}' /tmp/gcokie)" 12 | 13 | FILE=/model/model.ckpt 14 | if test -f "$FILE"; then 15 | echo "$FILE exists." 16 | else 17 | echo "$FILE does not exist." 18 | echo -e "Downloading from "$gURL"...\n" 19 | cmd='curl --insecure -C - -LOJb /tmp/gcokie "${ggURL}&confirm=${getcode}&id=${ggID}"' 20 | eval $cmd 21 | mv 'model.ckpt' 'model/' 22 | fi -------------------------------------------------------------------------------- /img2mol/README.md: -------------------------------------------------------------------------------- 1 | # `img2mol` module structure 2 | This directory contains the Python scripts needed to perform inference tasks with the `img2mol` model. 3 | The list below summarizes each module: 4 | 5 | 6 | * `cddd_server.py` 7 | * class for utilizing the CDDD encoder-decoder described by [Winter et al. (2019)](https://pubs.rsc.org/en/content/articlelanding/2019/sc/c8sc04175j#!divAbstract) 8 | * note that the implemented model class is licensed under the CC BY-NC 4.0 license and only applicable in non-commercial settings 9 | * `model.py` 10 | * Model implementation of the `img2mol` as described in our paper.
We use PyTorch Lightning for model training, but using plain PyTorch is also possible 11 | * `inference.py` 12 | * inference class that can be used for predicting the SMILES representation based on an image representation. By default, the model weights are randomly initialized; when instantiating the inference class, a model checkpoint can be passed to load trained weights. 13 | * The provided model weights are licensed under CC BY-NC 4.0 and only applicable for non-commercial usage 14 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Machine Learning Research @ Bayer AG 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | """Install script for setuptools.""" 16 | 17 | from setuptools import setup 18 | 19 | 20 | setup( 21 | name='img2mol', 22 | version='0.1', 23 | packages=['img2mol'], 24 | url='https://github.com/bayer-science-for-a-better-life/Img2Mol', 25 | license='Apache License, Version 2.0', 26 | author='Djork-Arné Clevert, Tuan Le, Robin Winter and Floriane Montanari', 27 | author_email='djork-arne.clevert@bayer.com', 28 | description='Inferring molecules from images' 29 | ) 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | model/* 2 | !model/.gitkeep 3 | 4 | # Editors 5 | .vscode/ 6 | .idea/ 7 | 8 | # Vagrant 9 | .vagrant/ 10 | 11 | # Mac/OSX 12 | .DS_Store 13 | 14 | # Windows 15 | Thumbs.db 16 | 17 | # Source for the following rules: https://raw.githubusercontent.com/github/gitignore/master/Python.gitignore 18 | # Byte-compiled / optimized / DLL files 19 | __pycache__/ 20 | *.py[cod] 21 | *$py.class 22 | 23 | # C extensions 24 | *.so 25 | 26 | # Distribution / packaging 27 | .Python 28 | build/ 29 | develop-eggs/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | .eggs/ 34 | lib/ 35 | lib64/ 36 | parts/ 37 | sdist/ 38 | var/ 39 | wheels/ 40 | *.egg-info/ 41 | .installed.cfg 42 | *.egg 43 | MANIFEST 44 | 45 | # PyInstaller 46 | # Usually these files are written by a python script from a template 47 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
48 | *.manifest 49 | *.spec 50 | 51 | # Installer logs 52 | pip-log.txt 53 | pip-delete-this-directory.txt 54 | 55 | # Unit test / coverage reports 56 | htmlcov/ 57 | .tox/ 58 | .nox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *.cover 65 | .hypothesis/ 66 | .pytest_cache/ 67 | 68 | # Translations 69 | *.mo 70 | *.pot 71 | 72 | # Django stuff: 73 | *.log 74 | local_settings.py 75 | db.sqlite3 76 | 77 | # Flask stuff: 78 | instance/ 79 | .webassets-cache 80 | 81 | # Scrapy stuff: 82 | .scrapy 83 | 84 | # Sphinx documentation 85 | docs/_build/ 86 | 87 | # PyBuilder 88 | target/ 89 | 90 | # Jupyter Notebook 91 | .ipynb_checkpoints 92 | 93 | # IPython 94 | profile_default/ 95 | ipython_config.py 96 | 97 | # pyenv 98 | .python-version 99 | 100 | # celery beat schedule file 101 | celerybeat-schedule 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | -------------------------------------------------------------------------------- /img2mol/cddd_server.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Machine Learning Research @ Bayer AG 2 | # 3 | # Licensed for non-commercial use only, under the terms of the 4 | # Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) license. 5 | # You can find details at: https://creativecommons.org/licenses/by-nc/4.0/legalcode 6 | 7 | 8 | import json 9 | import requests 10 | requests.packages.urllib3.disable_warnings() 11 | 12 | """ 13 | CDDD Server to encode SMILES string to molecular embeddings and decode the molecular embeddings to SMILES string. 
14 | 15 | For further details, please refer to: 16 | [1] R. Winter, F. Montanari, F. Noe and D. Clevert, Chem. Sci, 2019, 17 | https://pubs.rsc.org/en/content/articlelanding/2019/sc/c8sc04175j#!divAbstract 18 | 19 | and: https://github.com/jrwnter/cddd 20 | """ 21 | 22 | # Note that the DEFAULT_HOST is accessing the AWS instance deployed by Machine Learning Research Group of Bayer. 23 | DEFAULT_HOST = "http://ec2-18-157-240-87.eu-central-1.compute.amazonaws.com" 24 | 25 | """ 26 | The CDDD server is applicable for non-commercial use only, under the terms of the 27 | Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) license. 28 | You can find details at: https://creativecommons.org/licenses/by-nc/4.0/legalcode 29 | """ 30 | 31 | 32 | class CDDDRequest: 33 | def __init__(self, host=DEFAULT_HOST, port=8892): 34 | self.host = host 35 | self.port = port 36 | self.headers = {'content-type': 'application/json'} 37 | 38 | def smiles_to_cddd(self, smiles): 39 | url = "{}:{}/smiles_to_cddd/".format(self.host, self.port) 40 | req = json.dumps({"smiles": smiles}) 41 | response = requests.post(url, data=req, headers=self.headers, verify=False) 42 | return json.loads(response.content.decode("utf-8")) 43 | 44 | def seq_to_emb(self, smiles): 45 | return self.smiles_to_cddd(smiles) 46 | 47 | def cddd_to_smiles(self, embedding): 48 | url = "{}:{}/cddd_to_smiles/".format(self.host, self.port) 49 | req = json.dumps({"cddd": embedding}) 50 | response = requests.post(url, data=req, headers=self.headers, verify=False) 51 | return json.loads(response.content.decode("utf-8")) 52 | 53 | def emb_to_seq(self, embedding): 54 | return self.cddd_to_smiles(embedding) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Img2Mol: inferring molecules from pictures 2 | ========================================== 3 | ![Img2Mol](Img2Mol.png) 4 | Welcome 
to Img2Mol! :wave:. 5 | 6 | :point_right: For the Img2Mol web app, switch to the "deployment-example" branch. 7 | 8 | ## Overview 9 | Here we provide the implementation of the `img2mol` model using [PyTorch](https://github.com/pytorch/pytorch) and [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning) for training and inference, along with an example Jupyter notebook. 10 | 11 | This repository is organized as follows: 12 | * `examples/`: contains example images to apply our proposed model to 13 | * `img2mol/`: contains the necessary Python modules for our proposed model 14 | * `model/`: stores the trained model weights as pickled files. The download link is given in the "Download Model Weights" section below 15 | 16 | ## Installation 17 | #### Requirements 18 | ``` 19 | python=3.8.5 20 | pip=20.2.4 21 | notebook=6.4.2 22 | pillow=8.0.1 23 | numpy=1.19.2 24 | rdkit=2020.03.1 25 | cudatoolkit=11.0 26 | torchvision=0.8.0 27 | torchaudio=0.7.0 28 | pytorch=1.7.0 29 | pytorch-lightning=1.0.8 30 | ``` 31 | 32 | #### Environment 33 | Create a new environment: 34 | ```bash 35 | git clone git@github.com:bayer-science-for-a-better-life/Img2Mol.git 36 | cd Img2Mol 37 | conda env create -f environment.yml 38 | conda activate img2mol 39 | pip install . 40 | ``` 41 | *If you want to run Img2Mol as a standalone version with a locally loaded CDDD model instead of sending requests to our CDDD server, install the environment from `environment.local-cddd.yml` instead of `environment.yml`* 42 | ## Download Model Weights 43 | You can download the trained parameters for the default model (~2.4GB) as described in our paper using the following link: 44 | https://drive.google.com/file/d/1pk21r4Zzb9ZJkszJwP9SObTlfTaRMMtF/view . 45 | Please move the downloaded file `model.ckpt` into the `model/` directory.
46 | 47 | If you are working with the local CDDD installation, please [download and unzip the CDDD model](https://drive.google.com/u/0/uc?id=1oyknOulq_j0w9kzOKKIHdTLo5HphT99h&export=download) and move the directory *default_model* to `path/to/anaconda3/envs/img2mol/lib/python3.6/site-packages/cddd/data/` 48 | 49 | Alternatively, we provide a bash script that will download and move the file automatically. 50 | ```bash 51 | bash download_model.sh 52 | ``` 53 | If you have problems downloading the file using the bash script, please manually download the file using the browser. 54 | 55 | ## Examples 56 | Check the example notebook `example_inference.ipynb` to see how the inference class can be used. Usage with the local CDDD model is demonstrated in `example_inference_local_cddd.ipynb`. 57 | 58 | ## Reference 59 | Please cite our manuscript if you use our model in your work. 60 | 61 | D.-A. Clevert, T. Le, R. Winter, F. Montanari, Chem. Sci., 2021, [DOI: 10.1039/D1SC01839F](https://doi.org/10.1039/D1SC01839F) 62 | 63 | ## Img2Mol Code License 64 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at https://www.apache.org/licenses/LICENSE-2.0. 65 | 66 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 67 | 68 | ## Model Parameters License 69 | The Img2Mol parameters are made available for non-commercial use only, under the terms of the Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) license.
You can find details at: https://creativecommons.org/licenses/by-nc/4.0/legalcode 70 | -------------------------------------------------------------------------------- /img2mol/model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Machine Learning Research @ Bayer AG 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | from torch import nn 17 | import torch.nn.functional as F 18 | import pytorch_lightning as pl 19 | from typing import Union, List, Optional 20 | 21 | 22 | MODEL_CONFIGS: List = [[128, 7, 3, 4], 23 | [256, 5, 1, 1], 24 | [384, 5, 1, 1], 25 | 'M', 26 | [384, 3, 1, 1], 27 | [384, 3, 1, 1], 28 | 'M', 29 | [512, 3, 1, 1], 30 | [512, 3, 1, 1], 31 | [512, 3, 1, 1], 32 | 'M'] 33 | 34 | 35 | def make_layers(cfg: Optional[List[Union[str, int]]] = None, 36 | batch_norm: bool = False) -> nn.Sequential: 37 | """ 38 | Helper function to create the convolutional layers for the Img2Mol model to be passed into a nn.Sequential module. 39 | :param cfg: list populated with either a str or a list, where the str object refers to the pooling method and the 40 | list object will be unrolled to obtain the convolutional-filter parameters. 41 | Defaults to the `MODEL_CONFIGS` list. 42 | :param batch_norm: boolean of batch normalization should be used in-between conv2d and relu activation. 
43 | Defaults to False 44 | :return: torch.nn.Sequential module as feature-extractor 45 | """ 46 | if cfg is None: 47 | cfg = MODEL_CONFIGS 48 | 49 | layers: List[nn.Module] = [] 50 | 51 | in_channels = 1 52 | for v in cfg: 53 | if v == 'A': 54 | layers += [nn.AvgPool2d(kernel_size=2, stride=2)] 55 | else: 56 | if v == 'M': 57 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 58 | else: 59 | units, kern_size, stride, padding = v 60 | conv2d = nn.Conv2d(in_channels, units, kernel_size=kern_size, stride=stride, padding=padding) 61 | if batch_norm: 62 | layers += [conv2d, nn.BatchNorm2d(units), nn.ReLU(inplace=True)] 63 | else: 64 | layers += [conv2d, nn.ReLU(inplace=True)] 65 | in_channels = units 66 | 67 | model = nn.Sequential(*layers) 68 | return model 69 | 70 | 71 | class Img2MolPlModel(pl.LightningModule): 72 | """ 73 | Wraps the Img2Mol model into pytorch lightning for easy training and inference 74 | """ 75 | def __init__(self, learning_rate: float = 1e-4, batch_norm: bool = False): 76 | super().__init__() 77 | self.learning_rate = learning_rate 78 | 79 | # convolutional NN for feature extraction 80 | self.features = make_layers(cfg=MODEL_CONFIGS, batch_norm=batch_norm) 81 | # fully-connected network for classification based on CNN feature extractor 82 | self.classifier = nn.Sequential( 83 | nn.Linear(512 * 9 * 9, 4096), 84 | nn.ReLU(True), 85 | nn.Dropout(p=0.0), 86 | nn.Linear(4096, 4096), 87 | nn.ReLU(True), 88 | nn.Dropout(p=0.0), 89 | nn.Linear(4096, 512), 90 | nn.Tanh(), 91 | ) 92 | 93 | self._initialize_weights() 94 | 95 | def forward(self, x: torch.Tensor) -> torch.Tensor: 96 | x = self.features(x) 97 | x = torch.flatten(x, 1) 98 | x = self.classifier(x) 99 | return x 100 | 101 | def _initialize_weights(self) -> None: 102 | for m in self.modules(): 103 | if isinstance(m, nn.Conv2d): 104 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 105 | if m.bias is not None: 106 | nn.init.constant_(m.bias, 0) 107 | elif isinstance(m, 
nn.BatchNorm2d): 108 | nn.init.constant_(m.weight, 1) 109 | nn.init.constant_(m.bias, 0) 110 | elif isinstance(m, nn.Linear): 111 | nn.init.normal_(m.weight, 0, 0.01) 112 | nn.init.constant_(m.bias, 0) 113 | 114 | def training_step(self, batch, batch_idx): 115 | x, cddd = batch 116 | cddd_hat = self(x) 117 | loss = F.mse_loss(cddd_hat, cddd) 118 | self.log('train_loss', loss, on_epoch=True, prog_bar=True, logger=True) 119 | return loss 120 | 121 | def validation_step(self, batch, batch_idx): 122 | x, cddd = batch 123 | cddd_hat = self(x) 124 | loss = F.mse_loss(cddd_hat, cddd) 125 | self.log('valid_loss', loss, on_epoch=True, prog_bar=True, logger=True) 126 | 127 | def test_step(self, batch, batch_idx): 128 | x, cddd = batch 129 | cddd_hat = self(x) 130 | loss = F.mse_loss(cddd_hat, cddd) 131 | self.log('test_loss', loss) 132 | 133 | def configure_optimizers(self): 134 | return torch.optim.AdamW(self.parameters(), lr=self.learning_rate) 135 | 136 | 137 | if __name__ == "__main__": 138 | pl_model = Img2MolPlModel() 139 | print(pl_model) 140 | -------------------------------------------------------------------------------- /benchmark_data/README.md: -------------------------------------------------------------------------------- 1 | Here we provide the benchmark datasets that was used to evaluate the performance of Img2Mol and compare it with that of state-of-the-art molecular recognition methods. The following benchmark datasets (all 8-bit grayscale images) were used. 2 | For the smaller benchmark datasets (USPTO, UoB, CLEF and JPO), we applied a slight input perturbation by adding rotation (randomly drawn from [−5°, 5°]) and shearing (xy-shearing factor randomly drawn from [−0.1, 0.1]). Every input image of those benchmarks is perturbed five times randomly. This is done in order to detect potential overfitting of the baseline methods to those small, well known datasets. 3 | 4 | #### Img2Mol 5 | Test set collection of 25,000 images and molecule descriptions. 
Images were generated as described in subsection 3.3 of the paper. The resolution of the images is 224 × 224 px. Only half of our original test set is used due to the computational time of the baseline methods. The data set consists of typical small molecules with an average size of 25 atoms, ranging between 6 and 44 atoms. Please load the pickled dataframe object to get the mapping images<>smiles. 6 | 7 | You can download the tgz-file (~114MB) of the images here: 8 | https://drive.google.com/file/d/1FZxjcncEQ-aK4Gl5obepNxAJCFOcEc8W/view 10 | 11 | #### STAKER 12 | The validation set collection of 30,000 images and molecule descriptions provided by Staker et al. The images are based on US Patent Office (USPTO) data. The image resolution is 256 × 256 px. Molecules are composed of 24 atoms on average, ranging from 7 at the minimum to 51 at the maximum. Please load the pickled dataframe object to get the mapping images<>smiles. 13 | 14 | You can download the tgz-file (~110MB) of the images here: 15 | https://drive.google.com/file/d/1rYPMSF6C7AbHubll8BZZJF2zvd7UYzp6/view. 16 | #### USPTO 17 | A collection of 4852 images and molecule descriptions based on US Patent Office (USPTO) data, obtained from Rajan et al. The average resolution of the images is 649 × 417 px. The dataset consists of molecules with an average size of 28 atoms, ranging between 10 and 96 atoms. 18 | 19 | You can download the tgz-file (~12MB) of the images here: 20 | https://drive.google.com/file/d/15h1c50AmcJ3jCuQOdLjkVhcFkqe7slLn/view. 22 | #### UoB 23 | 5716 images and molecule descriptions of chemical structures developed by the University of Birmingham, obtained from Rajan et al. The average resolution of the images is 762 × 412 px. The molecules in this data set are quite small, consisting on average of only 13 atoms, ranging between 4 and 34 atoms. 24 | 25 | You can download the tgz-file of the images (~124MB) here: 26 | https://drive.google.com/file/d/13Ul94f6hUEpDbUKLUP_e7xEfSRZIqFuy/view. 
28 | #### CLEF 29 | A collection of 711 images and molecule descriptions based on the Conference and Labs of the Evaluation Forum (CLEF) test set, obtained from Rajan et al. The average resolution of the images is 1243 × 392 px. The dataset consists of molecules with an average size of 26 atoms, ranging between 4 and 42 atoms. 30 | 31 | You can download the tgz-file (~12MB) of the images here: 32 | https://drive.google.com/file/d/1fqMg0N582ti9ij71Pbntbq6vMw8z1BJI/view. 34 | #### JPO 35 | A collection of 365 images and molecule descriptions based on Japanese Patent Office (JPO) data, obtained from Rajan et al. Note that this data set contains many textual labels, including Japanese characters, and irregular features, including line thickness variations. In addition, some images are characterised by poor quality. The average resolution of the images is 607 × 373 px. Molecules are composed of 20 atoms on average, ranging from 5 at the minimum to 43 at the maximum. 36 | 37 | You can download the tgz-file (~12MB) of the images here: 38 | https://drive.google.com/file/d/11GxOLvQn_TanDAW8u7oCvSA_FJ-SXU4F/view. 40 | 41 | 42 | #### Influence of the depiction library 43 | 44 | To investigate how the rendering library (RDKit, OEChem TK, or Indigo) used to create input images affects the performance of chemical structure recognition models, we compiled the following benchmark dataset. It contains a subset of 5000 compounds from the Img2Mol test set, each depicted five times by each of the three libraries. Please use the Img2Mol mapping information to link images to the smiles. 45 | 46 | You can download the tgz-file (~1GB) of the images here: 47 | https://drive.google.com/file/d/1ixGj51F5NnhRfHFydpuCBYvYKexfaX3E/view. 49 | 50 | 51 | #### Influence of the image resolution 52 | 53 | To investigate how the image resolution used to create input images affects the performance of chemical structure recognition models, we compiled the following benchmark dataset.
A subset of 5000 compounds from the Img2Mol test set depicted each five times with 256, 512, 1024 and 2048 px resolution. Please use the Img2Mol mapping information to link images to the smiles. 54 | 55 | You can download the tgz-file (~2.5GB) of the images here: 56 | https://drive.google.com/file/d/1uMZ2FGNON4k6vxrldkEJZPRbyrISNR6K/view. 58 | 59 | 60 | -------------------------------------------------------------------------------- /img2mol/inference.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Machine Learning Research @ Bayer AG 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | from torchvision import transforms 17 | 18 | from typing import Optional 19 | import random 20 | import numpy as np 21 | from PIL import Image, ImageOps, ImageEnhance 22 | 23 | from img2mol.model import Img2MolPlModel 24 | from img2mol.cddd_server import CDDDRequest 25 | 26 | from rdkit import Chem 27 | 28 | import warnings 29 | # CDDD import only works if the suitable environment has been installed 30 | try: 31 | with warnings.catch_warnings(): 32 | warnings.simplefilter("ignore", FutureWarning) 33 | from cddd.inference import InferenceModel as CDDDInferenceModel 34 | except ImportError: 35 | print("Local CDDD installation has not been found.") 36 | 37 | 38 | """ 39 | Inference Class for Img2Mol Model. 
40 | By default, the class instantiation will not use any model checkpoint. 41 | The Img2Mol model parameters are made available for non-commercial use only, under the terms of the 42 | Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) license. 43 | You can find details at: https://creativecommons.org/licenses/by-nc/4.0/legalcode 44 | """ 45 | 46 | 47 | class Img2MolInference(object): 48 | """ 49 | Inference Class 50 | """ 51 | def __init__( 52 | self, 53 | model_ckpt: Optional[str] = None, 54 | device: str = "cuda:0" if torch.cuda.is_available() else "cpu", 55 | local_cddd: bool = None 56 | ): 57 | super(Img2MolInference, self).__init__() 58 | if local_cddd: 59 | self.cddd_inference_model = CDDDInferenceModel() 60 | else: 61 | self.cddd_inference_model = None 62 | self.device = device 63 | print("Initializing Img2Mol Model with random weights.") 64 | self.model = Img2MolPlModel() 65 | if model_ckpt is not None: 66 | print(f"Loading checkpoint: {model_ckpt}") 67 | self.model = self.model.load_from_checkpoint(model_ckpt) 68 | 69 | print("Setting to `self.eval()`-mode.") 70 | self.model.eval() 71 | print(f"Sending model to `{self.device}` device.") 72 | self.model.to(self.device) 73 | print("Succesfully created Img2Mol Inference class.") 74 | 75 | """ 76 | Class methods for image preprocessing 77 | """ 78 | @classmethod 79 | def read_imagefile(cls, filepath: str) -> Image.Image: 80 | img = Image.open(filepath, "r") 81 | 82 | if img.mode == "RGBA": 83 | bg = Image.new('RGB', img.size, (255, 255, 255)) 84 | # Paste image to background image 85 | bg.paste(img, (0, 0), img) 86 | return bg.convert('L') 87 | else: 88 | return img.convert('L') 89 | 90 | @classmethod 91 | def fit_image(cls, img: Image): 92 | old_size = img.size 93 | desired_size = 224 94 | ratio = float(desired_size) / max(old_size) 95 | new_size = tuple([int(x * ratio) for x in old_size]) 96 | img = img.resize(new_size, Image.BICUBIC) 97 | new_img = Image.new("L", (desired_size, 
desired_size), "white") 98 | new_img.paste(img, ((desired_size - new_size[0]) // 2, 99 | (desired_size - new_size[1]) // 2)) 100 | 101 | new_img = ImageOps.expand(new_img, int(np.random.randint(5, 25, size=1)), "white") 102 | return new_img 103 | 104 | @classmethod 105 | def transform_image(cls, image: Image): 106 | image = cls.fit_image(image) 107 | img_PIL = transforms.RandomRotation((-15, 15), resample=3, expand=True, center=None, fill=255)(image) 108 | img_PIL = transforms.ColorJitter(brightness=[0.75, 2.0], contrast=0, saturation=0, hue=0)(img_PIL) 109 | shear_value = np.random.uniform(0.1, 7.0) 110 | shear = random.choice([[0, 0, -shear_value, shear_value], [-shear_value, shear_value, 0, 0], 111 | [-shear_value, shear_value, -shear_value, shear_value]]) 112 | img_PIL = transforms.RandomAffine(0, translate=None, scale=None, 113 | shear=shear, resample=3, fillcolor=255)(img_PIL) 114 | img_PIL = ImageEnhance.Contrast(ImageOps.autocontrast(img_PIL)).enhance(2.0) 115 | img_PIL = transforms.Resize((224, 224), interpolation=3)(img_PIL) 116 | img_PIL = ImageOps.autocontrast(img_PIL) 117 | img_PIL = transforms.ToTensor()(img_PIL) 118 | return img_PIL 119 | 120 | def read_image_to_tensor(self, filepath: str, 121 | repeats: int = 50): 122 | extension = filepath.split(".")[-1] in ("jpg", "jpeg", "png") 123 | if not extension: 124 | return "Image must be jpg or png format!" 
125 | image = self.read_imagefile(filepath) 126 | images = torch.cat([torch.unsqueeze(self.transform_image(image), 0) 127 | for _ in range(repeats)], dim=0) 128 | images = images.to(self.device) 129 | return images 130 | 131 | def __call__(self, 132 | filepath: str, 133 | cddd_server: CDDDRequest = None, 134 | return_cddd: bool = False, 135 | ) -> dict: 136 | images = self.read_image_to_tensor(filepath, repeats=50) 137 | with torch.no_grad(): 138 | cddd = self.model(images).detach().cpu().numpy() 139 | 140 | # take the median cddd prediction out of `repeats` predictions 141 | cddd = np.median(cddd, axis=0) 142 | 143 | if self.cddd_inference_model: 144 | smiles = self.cddd_inference_model.emb_to_seq(cddd) 145 | else: 146 | smiles = cddd_server.cddd_to_smiles(cddd.tolist()) 147 | mol = Chem.MolFromSmiles(smiles, sanitize=True) 148 | # if the molecule is valid, i.e. can be parsed with the rdkit 149 | if mol: 150 | can_smiles = Chem.MolToSmiles(mol) 151 | can_mol = Chem.MolFromSmiles(can_smiles) 152 | else: 153 | print("Image translation failed.") 154 | can_smiles = None 155 | can_mol = None 156 | 157 | if not return_cddd: 158 | cddd = None 159 | 160 | return {"filepath": filepath, 161 | "cddd": cddd, "smiles": can_smiles, "mol": can_mol 162 | } 163 | 164 | def predict(self, filepath: str, 165 | cddd_server: CDDDRequest, 166 | return_cddd: bool = False) -> dict: 167 | return self.__call__(filepath, cddd_server, return_cddd) 168 | 169 | 170 | if __name__ == "__main__": 171 | 172 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 173 | img2mol = Img2MolInference(model_ckpt=None, 174 | device=device) 175 | cddd_server = CDDDRequest(host="http://ec2-18-157-240-87.eu-central-1.compute.amazonaws.com") 176 | 177 | example = "examples/example1.png" 178 | 179 | res = img2mol(filepath=example, cddd_server=cddd_server) 180 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2021 Machine Learning Research @ Bayer AG 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /example_inference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Copyright 2021 Machine Learning Research @ Bayer AG\n", 8 | "\n", 9 | "Licensed for non-commercial use only, under the terms of the\n", 10 | "Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) license.\n", 11 | "\n", 12 | "You can find details at: https://creativecommons.org/licenses/by-nc/4.0/legalcode" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "id": "b9b48452", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import torch\n", 23 | "from img2mol.inference import *" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "id": "dca01497", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "from IPython.display import display" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "id": "30a2191a", 
40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "from PIL import Image" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "id": "30d0b65b", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "import os" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 5, 59 | "id": "2d9605c0", 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "['.gitkeep', 'model.ckpt']" 66 | ] 67 | }, 68 | "execution_count": 5, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "os.listdir(\"model/\")" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 6, 80 | "id": "99236cf4", 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "Initializing Img2Mol Model with random weights.\n", 88 | "Loading checkpoint: model/model.ckpt\n", 89 | "Setting to `self.eval()`-mode.\n", 90 | "Sending model to `cuda:0` device.\n", 91 | "Succesfully created Img2Mol Inference class.\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", 97 | "img2mol = Img2MolInference(model_ckpt=\"model/model.ckpt\",\n", 98 | " device=device)\n", 99 | "cddd_server = CDDDRequest()" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 7, 105 | "id": "24a20cfa", 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "data": { 110 | "text/plain": [ 111 | "['digital_example1.png',\n", 112 | " 'digital_example2.png',\n", 113 | " 'handwritten_example1.png',\n", 114 | " 'handwritten_example2.jpg']" 115 | ] 116 | }, 117 | "execution_count": 7, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | } 121 | ], 122 | "source": [ 123 | "os.listdir(\"examples/\")" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 8, 129 | "id": "902c1057", 130 | "metadata": {}, 131 | "outputs": [], 132 
| "source": [ 133 | "res = img2mol(filepath=\"examples/digital_example1.png\", cddd_server=cddd_server)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 9, 139 | "id": "bd748334", 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/plain": [ 145 | "{'filepath': 'examples/digital_example1.png',\n", 146 | " 'cddd': None,\n", 147 | " 'smiles': 'Cn1c(=O)c2c(nc(Sc3ccccc3)n2C)n(C)c1=O',\n", 148 | " 'mol': }" 149 | ] 150 | }, 151 | "execution_count": 9, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "res" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 10, 163 | "id": "af273a49", 164 | "metadata": { 165 | "scrolled": false 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "input_img = Image.open(res[\"filepath\"], \"r\")" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 11, 175 | "id": "2657ad09", 176 | "metadata": { 177 | "scrolled": false 178 | }, 179 | "outputs": [ 180 | { 181 | "data": { 182 | "image/png": "\n", 183 | "text/plain": [ 184 | "" 185 | ] 186 | }, 187 | "metadata": {}, 188 | "output_type": "display_data" 189 | } 190 | ], 191 | "source": [ 192 | "display(input_img)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "id": "80a91bd6", 198 | "metadata": {}, 199 | "source": [ 200 | "# show prediction" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 12, 206 | "id": "ebf67ab2", 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | "Cn1c(=O)c2c(nc(Sc3ccccc3)n2C)n(C)c1=O\n", 214 | "\n" 215 | ] 216 | }, 217 | { 218 | "data": { 219 | "image/png": "\n", 220 | "text/plain": [ 221 | "" 222 | ] 223 | }, 224 | "execution_count": 12, 225 | "metadata": {}, 226 | "output_type": "execute_result" 227 | } 228 | ], 229 | "source": [ 230 | "print(res[\"smiles\"])\n", 231 | "print()\n", 232 | 
"res[\"mol\"]" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "id": "e68f0281", 238 | "metadata": {}, 239 | "source": [ 240 | "# Different Example" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 13, 246 | "id": "9655bd18", 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "example = \"examples/example2.png\"\n", 251 | "res = img2mol(filepath=\"examples/digital_example2.png\", cddd_server=cddd_server)\n", 252 | "input_img = Image.open(res[\"filepath\"], \"r\")" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 14, 258 | "id": "4b294b23", 259 | "metadata": {}, 260 | "outputs": [ 261 | { 262 | "data": { 263 | "image/png": "\n", 264 | "text/plain": [ 265 | "" 266 | ] 267 | }, 268 | "metadata": {}, 269 | "output_type": "display_data" 270 | } 271 | ], 272 | "source": [ 273 | "display(input_img)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 15, 279 | "id": "9afc6ad8", 280 | "metadata": {}, 281 | "outputs": [ 282 | { 283 | "name": "stdout", 284 | "output_type": "stream", 285 | "text": [ 286 | "CN(C)C(=O)COC(=O)Cc1ccc(OC(=O)c2ccc(N=C(N)N)cc2)cc1\n", 287 | "\n" 288 | ] 289 | }, 290 | { 291 | "data": { 292 | "image/png": "\n", 293 | "text/plain": [ 294 | "" 295 | ] 296 | }, 297 | "execution_count": 15, 298 | "metadata": {}, 299 | "output_type": "execute_result" 300 | } 301 | ], 302 | "source": [ 303 | "print(res[\"smiles\"])\n", 304 | "print()\n", 305 | "res[\"mol\"]" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "id": "f2274b9f", 311 | "metadata": {}, 312 | "source": [ 313 | "# Another example on a handwritten image" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 16, 319 | "id": "2d63aae9", 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "res = img2mol(filepath=\"examples/handwritten_example1.png\", cddd_server=cddd_server)\n", 324 | "input_img = Image.open(res[\"filepath\"], \"r\")" 325 | ] 
326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 17, 330 | "id": "c5a2704f", 331 | "metadata": {}, 332 | "outputs": [ 333 | { 334 | "data": { 335 | "image/png": "\n", 336 | "text/plain": [ 337 | "" 338 | ] 339 | }, 340 | "metadata": {}, 341 | "output_type": "display_data" 342 | } 343 | ], 344 | "source": [ 345 | "display(input_img)" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 18, 351 | "id": "76ec658c", 352 | "metadata": {}, 353 | "outputs": [ 354 | { 355 | "name": "stdout", 356 | "output_type": "stream", 357 | "text": [ 358 | "CCOC(=O)c1[nH]c2c(S(=O)(=O)N3CCN(C)CC3)ccc(OCC)c2c1C1CCNCC1\n", 359 | "\n" 360 | ] 361 | }, 362 | { 363 | "data": { 364 | "image/png": "\n", 365 | "text/plain": [ 366 | "" 367 | ] 368 | }, 369 | "execution_count": 18, 370 | "metadata": {}, 371 | "output_type": "execute_result" 372 | } 373 | ], 374 | "source": [ 375 | "print(res[\"smiles\"])\n", 376 | "print()\n", 377 | "res[\"mol\"]" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "# Next example " 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 20, 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "res = img2mol(filepath=\"examples/handwritten_example2.jpg\", cddd_server=cddd_server)\n", 394 | "input_img = Image.open(res[\"filepath\"], \"r\")" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 21, 400 | "metadata": {}, 401 | "outputs": [ 402 | { 403 | "data": { 404 | "image/png": "\n", 405 | "text/plain": [ 406 | "" 407 | ] 408 | }, 409 | "metadata": {}, 410 | "output_type": "display_data" 411 | } 412 | ], 413 | "source": [ 414 | "display(input_img)" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 22, 420 | "metadata": {}, 421 | "outputs": [ 422 | { 423 | "name": "stdout", 424 | "output_type": "stream", 425 | "text": [ 426 | "O=C1C(=C2Nc3ccccc3C2=O)Nc2ccccc21\n", 427 | "\n" 428 | ] 429 
| }, 430 | { 431 | "data": { 432 | "image/png": "\n", 433 | "text/plain": [ 434 | "" 435 | ] 436 | }, 437 | "execution_count": 22, 438 | "metadata": {}, 439 | "output_type": "execute_result" 440 | } 441 | ], 442 | "source": [ 443 | "print(res[\"smiles\"])\n", 444 | "print()\n", 445 | "res[\"mol\"]" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [] 454 | } 455 | ], 456 | "metadata": { 457 | "kernelspec": { 458 | "display_name": "Python 3", 459 | "language": "python", 460 | "name": "python3" 461 | }, 462 | "language_info": { 463 | "codemirror_mode": { 464 | "name": "ipython", 465 | "version": 3 466 | }, 467 | "file_extension": ".py", 468 | "mimetype": "text/x-python", 469 | "name": "python", 470 | "nbconvert_exporter": "python", 471 | "pygments_lexer": "ipython3", 472 | "version": "3.8.5" 473 | } 474 | }, 475 | "nbformat": 4, 476 | "nbformat_minor": 5 477 | } 478 | --------------------------------------------------------------------------------