├── .gitignore ├── Examples.ipynb ├── LICENSE ├── README.md ├── molvecgen ├── __init__.py ├── generators.py └── vectorizers.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 
-------------------------------------------------------------------------------- /Examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Standard Imports" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from rdkit import Chem\n", 17 | "from rdkit.Chem.Draw import IPythonConsole\n", 18 | "import numpy as np\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "%matplotlib inline" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "#For debugging, reimport modules when executing cells\n", 30 | "%load_ext autoreload\n", 31 | "%autoreload 2" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "# Working with the SMILES based vectorizer\n", 39 | "\n", 40 | "The SMILES based vectorizer uses the SMILES format to produce a sequence of one hot encoded characters suited for modelling with sequence oriented neural network architectures such as transformers and RNNs. 
Data augmentation is done via atom order permutation and generation of non-canonical SMILES.\n", 41 | "\n", 42 | "Reference: https://arxiv.org/abs/1703.07076\n", 43 | "\n" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "Import the SmilesVectorizer" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": { 57 | "scrolled": true 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "from molvecgen import SmilesVectorizer" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "Work with some molecules" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "smiles = [ \"CCC(=O)O[C@@]1(CC[NH+](C[C@H]1CC=C)C)c2ccccc2\",\n", 78 | " \"CCC[S@@](=O)c1ccc2c(c1)[nH]/c(=N/C(=O)OC)/[nH]2\"]*10\n", 79 | " \n", 80 | "mols = [Chem.MolFromSmiles(smile) for smile in smiles]" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 5, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "image/png": "\n", 91 | "text/plain": [ 92 | "" 93 | ] 94 | }, 95 | "execution_count": 5, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "mols[0]" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "Create the object and fit the character set and length. The object is called a SMILES vectorizer, but currently only works directly from lists of RDKit molecules. It works by generating the SMILES of the molecule with subsequent one hot encoding into a numpy array. The .fit() function analyses the dataset for which characters are used by the SMILES and updates the character set of the vectorizer as well as adjusting the length of the embedding." 
109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "Default Charset @C)(=cOn1S2/H[N]\\^$?\n", 121 | "Default Maximum allowed SMILES length 120\n", 122 | "\n", 123 | "After fitting\n", 124 | "Charset after fit ]\\(S)[2=ONHCc@n/1+^$?\n", 125 | "Maximum allowed SMILES length 45\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "sm_en = SmilesVectorizer(canonical=True, augment=False)\n", 131 | "\n", 132 | "print(\"Default Charset %s\"%sm_en.charset)\n", 133 | "print(\"Default Maximum allowed SMILES length %s\"%sm_en.maxlength)\n", 134 | "\n", 135 | "sm_en.fit(mols, extra_chars=[\"\\\\\"])\n", 136 | "print()\n", 137 | "print(\"After fitting\")\n", 138 | "print(\"Charset after fit %s\"%sm_en.charset)\n", 139 | "print(\"Maximum allowed SMILES length %s\"%sm_en.maxlength)\n" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "The molecules can be transformed to vectors. 
The first one is plotted as \"piano roll\"" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 7, 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "data": { 156 | "text/plain": [ 157 | "" 158 | ] 159 | }, 160 | "execution_count": 7, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | }, 164 | { 165 | "data": { 166 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjkAAAECCAYAAAAcvsaeAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvDW2N/gAADdtJREFUeJzt3V+MpXdZB/Dv4+62FbChC7TpPwVNNfQCl2TTkuBFpWILGouJJBA1vSBZLyCBBGMqN6iJCV4I3BCTKg29QJQISGOMtVkx1cSsLLBCSYVWglB30wVWsiixtOXxYg4wbafsmTn/f/P5JJNz3nfeOe8z5zk7893feef3q+4OAMBofmTVBQAALIKQAwAMScgBAIYk5AAAQxJyAIAhCTkAwJBWGnKq6taq+kJVPVxVd6yyFn6gqu6qqrNV9cC2fYer6r6qemhye9kqaySpqmur6hNV9WBVfb6q3jrZr1drpqouqap/rap/m/Tq9yf7X1JVJya9+suqumjVtZJU1YGq+kxV/c1kW5821MpCTlUdSPK+JK9Jcn2SN1bV9auqh6f4QJJbn7bvjiTHu/u6JMcn26zWE0ne3t0vTfKKJG+e/BvSq/XzWJJXdffPJjmS5NaqekWSP0rynkmv/jvJm1ZYIz/w1iQPbtvWpw21ypGcG5I83N1f6u7vJPmLJLetsB4muvv+JOeetvu2JHdP7t+d5HVLLYpn6O4z3f3pyf1vZeuH8tXRq7XTW/5nsnlo8tFJXpXkryb79WoNVNU1SX4pyZ9Ntiv6tLFWGXKuTvLVbduPTPaxnq7o7jPJ1i/XJJevuB62qaoXJ3l5khPRq7U0eQvkVJKzSe5L8h9JvtndT0wO8TNwPbw3ye8k+e5k+wXRp421ypBTO+yzxgTsUlU9L8lHkrytu8+vuh521t1PdveRJNdkayT7pTsdttyq2K6qfjnJ2e7+1PbdOxyqTxvi4ArP/UiSa7dtX5Pk9Ipq4cIeraoru/tMVV2Zrf+NsmJVdShbAeeD3f3RyW69WmPd/c2q+sdsXUf1/Ko6OBkl8DNw9V6Z5Feq6rVJLklyabZGdvRpQ61yJOeTSa6bXLV+UZI3JLlnhfXww92T5PbJ/duTfHyFtZDvXyvw/iQPdve7t31Kr9ZMVb2oqp4/uf+jSX4hW9dQfSLJr00O06sV6+7f7e5ruvvF2fqd9A/d/evRp41Vq1yFfJKW35vkQJK7uvsPV1YM31dVH0pyU5IXJnk0yTuT/HWSDyf58SRfSfL67n76xcksUVX9XJJ/SvK5/OD6gXdk67ocvVojVfWybF2weiBb/7n8cHf/QVX9ZLb+6OJwks8k+Y3ufmx1lfI9VXVTkt/u7l/Wp8210pADALAoZjwGAIYk5AAAQxJyAIAhCTkAwJCEHABgSCsPOVV1bNU1MB292gz6tBn0aTPo02ZbechJ4gW0OfRqM+jTZtCnzaBPG2wdQg4AwNwtdTLAi+riviTPfcq+x/NYDuXipdXA3i2rVz/9sm9PfewXP/ucBVay/nZ6rr72jSfzohcceMq+/f48Lcq0r9Wdnn8/+zaDPq2f/8v/5jv92E4Lpz7DUkPOpXW4b6ybl3Y+Nt
O9p09NfewtVx1ZYCXrb9rnar8/T4vi+YflO9HHc77PTRVyZnq7qqpuraovVNXDVXXHLI8FADBPew45VXUgyfuSvCbJ9UneWFXXz6swAIBZzDKSc0OSh7v7S939nWyt0HrbfMoCAJjNLCHn6iRf3bb9yGTfU1TVsao6WVUnH4+V6QGA5Zgl5Ox00c8zrmLu7ju7+2h3H3WFOgCwLLOEnEeSXLtt+5okp2crBwBgPmYJOZ9Mcl1VvaSqLkryhiT3zKcsAIDZHNzrF3b3E1X1liT3JjmQ5K7u/vzcKgMAmIHJAAGAjbG0yQABANaVkAMADEnIAQCGJOQAAEMScgCAIQk5AMCQhBwAYEhCDgAwJCEHABjSnpd1ADbHvadPTX3sLVcdWWAlXMi0vVpUn1Z9fpgnIzkAwJCEHABgSEIOADAkIQcAGJKQAwAMScgBAIYk5AAAQxJyAIAhCTkAwJCEHABgSNXdSzvZpXW4b6ybl3Y+AGAsJ/p4zve5muZYIzkAwJCEHABgSEIOADAkIQcAGJKQAwAMScgBAIYk5AAAQxJyAIAhCTkAwJCEHABgSAdXXQCwue49fWqq42656siCK2EUq35Nrfr8zJeRHABgSEIOADAkIQcAGJKQAwAMScgBAIYk5AAAQxJyAIAhCTkAwJCEHABgSNXdSzvZpXW4b6ybl3a+/WDa2TkTM3ROy3O6OcxOC/vPiT6e832upjnWSA4AMCQhBwAY0kwLdFbVl5N8K8mTSZ7o7qPzKAoAYFbzWIX857v763N4HACAufF2FQAwpFlDTif5+6r6VFUdm0dBAADzMOvbVa/s7tNVdXmS+6rq37v7/u0HTMLPsSS5JM+Z8XQAANOZaSSnu09Pbs8m+ViSG3Y45s7uPtrdRw/l4llOBwAwtT2HnKp6blX92PfuJ/nFJA/MqzAAgFnM8nbVFUk+VlXfe5w/7+6/m0tVAAAzsqwDALAxLOsAAOx7Qg4AMCQhBwAYkpADAAxJyAEAhiTkAABDEnIAgCEJOQDAkIQcAGBIQg4AMKRZ1q4iyb2nT0197C1XHVlgJQDL42cfm8BIDgAwJCEHABiSkAMADEnIAQCGJOQAAEMScgCAIQk5AMCQhBwAYEhCDgAwJDMez8hMnsAqrHrG4d085qprZf8ykgMADEnIAQCGJOQAAEMScgCAIQk5AMCQhBwAYEhCDgAwJCEHABiSkAMADEnIAQCGZFkHmIHp6lmVTXo9bVKtjMVIDgAwJCEHABiSkAMADEnIAQCGJOQAAEMScgCAIQk5AMCQhBwAYEhCDgAwJCEHABiSZR1YmmmXQNikKeB3U+uI3z/AOjOSAwAM6YIhp6ruqqqzVfXAtn2Hq+q+qnpocnvZYssEANidaUZyPpDk1qftuyPJ8e6+LsnxyTYAwNq4YMjp7vuTnHva7tuS3D25f3eS1825LgCAmez1mpwruvtMkkxuL59fSQAAs1v4X1dV1bEkx5Lkkjxn0acDAEiy95GcR6vqyiSZ3J59tgO7+87uPtrdRw/l4j2eDgBgd/Yacu5Jcvvk/u1JPj6fcgAA5mOaPyH/UJJ/SfIzVfVIVb0pybuSvLqqHkry6sk2AMDaqO5e2skurcN9Y928tPPNYhGz0077mLthxt3p7ffvf1q7eZ1O+1wt4rW/qPOP2P8Rv/9Vv6ZYnRN9POf7XE1zrBmPAYAhCTkAwJCEHABgSEIOADAkIQcAGJKQAwAMScgBAIYk5AAAQxJyAIAhCTkAwJAs6wAAu7SopTIWtVzFKs17qQzLOgAA+56QAwAMScgBAIYk5AAAQxJyAIAhCTkAwJCEHABgSEIOADAkIQcAGJKQAwAM6eCqC2D9LGpa8XlP7Q2LsKjp+hnLonrvNTVfRnIAgCEJOQDAkIQcAGBIQg4AMCQhBwAYkpADAAxJyAEAhiTkAABDEnIAgCGZ8ZhnMOMm+5nXP4zDSA4AMCQhBwAYkpADAAxJyAEAhiTkAABDEnIAgCEJOQDAkI
QcAGBIQg4AMCQhBwAYkpADAAxJyAEAhnTBkFNVd1XV2ap6YNu+36uq/6qqU5OP1y62TACA3ZlmJOcDSW7dYf97uvvI5ONv51sWAMBsLhhyuvv+JOeWUAsAwNzMck3OW6rqs5O3sy6bW0UAAHOw15DzJ0l+KsmRJGeS/PGzHVhVx6rqZFWdfDyP7fF0AAC7s6eQ092PdveT3f3dJH+a5IYfcuyd3X20u48eysV7rRMAYFf2FHKq6sptm7+a5IFnOxYAYBUOXuiAqvpQkpuSvLCqHknyziQ3VdWRJJ3ky0l+a4E1AgDs2gVDTne/cYfd719ALQAAc3PBkAPr7N7Tp6Y+9parjiywEnh2Xqf7237v/7Tf/yK+d8s6AABDEnIAgCEJOQDAkIQcAGBIQg4AMCQhBwAYkpADAAxJyAEAhiTkAABDqu5e2skurcN9Y928tPPBJtrvs6MC/DAn+njO97ma5lgjOQDAkIQcAGBIQg4AMCQhBwAYkpADAAxJyAEAhiTkAABDEnIAgCEJOQDAkIQcAGBIB1ddwDKZLp9N4LUHMB9GcgCAIQk5AMCQhBwAYEhCDgAwJCEHABiSkAMADEnIAQCGJOQAAEMScgCAIQk5AMCQ9tWyDqbLB4D1NO3SSzfc8u2pH9NIDgAwJCEHABiSkAMADEnIAQCGJOQAAEMScgCAIQk5AMCQhBwAYEhCDgAwpH014zEAsJ6mXZXgi/2NqR/TSA4AMKQLhpyquraqPlFVD1bV56vqrZP9h6vqvqp6aHJ72eLLBQCYzjQjOU8keXt3vzTJK5K8uaquT3JHkuPdfV2S45NtAIC1cMGQ091nuvvTk/vfSvJgkquT3Jbk7slhdyd53aKKBADYrV1dk1NVL07y8iQnklzR3WeSrSCU5PJ5FwcAsFdTh5yqel6SjyR5W3ef38XXHauqk1V18vE8tpcaAQB2baqQU1WHshVwPtjdH53sfrSqrpx8/sokZ3f62u6+s7uPdvfRQ7l4HjUDAFzQNH9dVUnen+TB7n73tk/dk+T2yf3bk3x8/uUBAOzNNJMBvjLJbyb5XFWdmux7R5J3JflwVb0pyVeSvH4xJQIA7N4FQ053/3OSepZP3zzfcgAA5qO6e3knq/pakv982u4XJvn60opgFnq1GfRpM+jTZtCn9fMT3f2iaQ5casjZsYCqk919dKVFMBW92gz6tBn0aTPo02azdhUAMCQhBwAY0jqEnDtXXQBT06vNoE+bQZ82gz5tsJVfkwMAsAjrMJIDADB3Qg4AMCQhBwAYkpADAAxJyAEAhvT/DX9UkUe3ToYAAAAASUVORK5CYII=\n", 167 | "text/plain": [ 168 | "
" 169 | ] 170 | }, 171 | "metadata": { 172 | "needs_background": "light" 173 | }, 174 | "output_type": "display_data" 175 | } 176 | ], 177 | "source": [ 178 | "mol_vects = sm_en.transform(mols)\n", 179 | "plt.matshow(mol_vects[0].T)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "It is also possible to translate the vector back into SMILES as long as the character set is the one that was used to encode the molecule. The start and end tokens are stripped by default, but can be kept if wanted." 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 8, 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "data": { 196 | "text/plain": [ 197 | "array(['C=CC[C@@H]1C[NH+](C)CC[C@]1(OC(=O)CC)c1ccccc1',\n", 198 | " 'CCC[S@@](=O)c1ccc2[nH]/c(=N\\\\C(=O)OC)[nH]c2c1'], dtype='" 372 | ] 373 | }, 374 | "execution_count": 13, 375 | "metadata": {}, 376 | "output_type": "execute_result" 377 | }, 378 | { 379 | "data": { 380 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAjkAAAECCAYAAAAcvsaeAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvDW2N/gAADb1JREFUeJzt3VGMZmdZB/D/Y7ttBWzoCm1qWwVNNfQCS7JpSfCiUpGCxGIiCURNL0jWC0ggwZjKDWpighcCN8SkSkMvECUC0hgjNktNNTErC1QoqdBKEGo3XXElixJLC48Xc4Bh2WW/me+bOd/3zu+XTOY7Z87Meb595sz89/3OvG91dwAARvNDcxcAALAXhBwAYEhCDgAwJCEHABiSkAMADEnIAQCGNGvIqarbqupzVfVoVd05Zy18V1XdXVWnquqhbfsOV9V9VfXI9P6KOWskqarrqur+qnq4qj5bVW+a9uvVmqmqy6rqn6vqX6Ze/d60//lVdXzq1V9U1SVz10pSVRdV1aeq6q+nbX3aULOFnKq6KMm7k7wiyQ1JXldVN8xVD9/jvUluO2vfnUmOdff1SY5N28zr6SRv6e4XJHlxkjdM15BerZ8nk7y0u382yY1JbquqFyf5wyTvnHr130leP2ONfNebkjy8bVufNtScIzk3JXm0u7/Q3d9I8udJbp+xHibd/UCS02ftvj3JPdPje5K8el+L4vt098nu/uT0+GvZ+qF8TfRq7fSW/5k2D01vneSlSf5y2q9Xa6Cqrk3yS0n+dNqu6NPGmjPkXJPky9u2H5v2sZ6u6u6TydYv1yRXzlwP21TV85K8KMnx6NVaml4CeTDJqST3Jfm3JF/t7qenQ/wMXA/vSvLbSb41bf9o9GljzRly6hz7rDEBO1RVz0rywSRv7u4zc9fDuXX3N7v7xiTXZmsk+wXnOmx/q2K7qnpVklPd/Yntu89xqD5tiItnPPdjSa7btn1tksdnqoULe6Kqru7uk1V1dbb+N8rMqupQtgLO+7r7Q9NuvVpj3f3Vqvr7bN1H9eyqungaJfAzcH4vSfLLVfXKJJcluTxbIzv6tKHmHMn5eJLrp7vWL0ny2iT3zlgPP9i9Se6YHt+R5CMz1kK+c6/Ae5I83N3v2PYhvVozVfXcqnr29PiHk/xCtu6huj/Jr06H6dXMuvt3uvva7n5etn4nfay7fy36tLFqzlXIp7T8riQXJbm7u/9gtmL4jqp6f5JbkjwnyRNJ3pbkr5J8IMmPJ/lSktd099k3J7OPqurnkvxDks/ku/cPvDVb9+Xo1Rqpqhdm64bVi7L1n8sPdPfvV9VPZuuPLg4n+VSSX+/uJ+erlG+rqluS/FZ3v0qfNtesIQcAYK+Y8RgAGJKQAwAMScgBAIYk5AAAQxJyAIAhzR5yquro3DWwGL3aDPq0GfRpM+jTZps95CTxDbQ59Goz6NNm0KfNoE8bbB1CDgDAyu3rZICX1KV9WZ75PfueypM5lEv3rYY5/fQLv77QcZ//9DP2uJLdOUi92mSb3qdFr5Nkfa+VRejTZtj0Po3o//K/+UY/ea6FU7/Pvoacy+tw31y37tv51s1HH39woeNe/mM37nElsL4WvU4S18qc9Im5HO9jOdOnFwo5S71cVVW3VdXnqurRqrpzma8FALBKuw45VXVRkncneUWSG5K8rqpuWFVhAADLWGYk56Ykj3b3F7r7G9laofX21ZQFALCcZULONUm+vG37sWnf96iqo1V1oqpOPBUr0wMA+2OZkHOum36+7y7m7r6ru4909xF3qAMA+2WZkPNYkuu2bV+b5PHlygEAWI1lQs7Hk1xfVc+vqkuSvDbJvaspCwBgORfv9hO7++mqemOSjya5KMnd3f3ZlVUGALAEkwGydkwyBsD57NtkgAAA60rIAQCGJOQAAEMScgCAIQk5AMCQhBwAYEhCDgAwJCEHABiSkAMADGnXyzowrrlnHN6rWYwXfV5mUWb
V5r6m5nbQnz/zMZIDAAxJyAEAhiTkAABDEnIAgCEJOQDAkIQcAGBIQg4AMCQhBwAYkpADAAxJyAEAhlTdvW8nu7wO9811676dD2BUlkrgoDrex3KmT9cixxrJAQCGJOQAAEMScgCAIQk5AMCQhBwAYEhCDgAwJCEHABiSkAMADEnIAQCGJOQAAEO6eO4C4KBYdBr+vZiC3xIA49GnvTHndboO5x+NkRwAYEhCDgAwJCEHABiSkAMADEnIAQCGJOQAAEMScgCAIQk5AMCQhBwAYEjV3ft2ssvrcN9ct+7b+c5m1le4MNfJvMx4Cz/Y8T6WM326FjnWSA4AMCQhBwAY0lILdFbVF5N8Lck3kzzd3UdWURQAwLJWsQr5z3f3V1bwdQAAVsbLVQDAkJYNOZ3k76rqE1V1dBUFAQCswrIvV72kux+vqiuT3FdV/9rdD2w/YAo/R5PksjxjydMBACxmqZGc7n58en8qyYeT3HSOY+7q7iPdfeRQLl3mdAAAC9t1yKmqZ1bVj3z7cZJfTPLQqgoDAFjGMi9XXZXkw1X17a/zZ939tyupCgBgSQdqWQfgYLJUBcxn1UuVWNYBADjwhBwAYEhCDgAwJCEHABiSkAMADEnIAQCGJOQAAEMScgCAIQk5AMCQhBwAYEjLrF0FQ06XP+JzOuj0iVXzc2Jxcz5/IzkAwJCEHABgSEIOADAkIQcAGJKQAwAMScgBAIYk5AAAQxJyAIAhCTkAwJCqu/ftZJfX4b65bt2388Fe24tZT82kCmNxTa/W8T6WM326FjnWSA4AMCQhBwAYkpADAAxJyAEAhiTkAABDEnIAgCEJOQDAkIQcAGBIQg4AMCQhBwAYkmUd2DeLTm1uWnMAzseyDgDAgSfkAABDEnIAgCEJOQDAkIQcAGBIQg4AMCQhBwAYkpADAAxJyAEAhiTkAABDunjuAjbdoksVJJYrOOjP/yBznQBzMJIDAAzpgiGnqu6uqlNV9dC2fYer6r6qemR6f8XelgkAsDOLjOS8N8ltZ+27M8mx7r4+ybFpGwBgbVww5HT3A0lOn7X79iT3TI/vSfLqFdcFALCU3d6Tc1V3n0yS6f2VqysJAGB5e/7XVVV1NMnRJLksz9jr0wEAJNn9SM4TVXV1kkzvT53vwO6+q7uPdPeRQ7l0l6cDANiZ3Yace5PcMT2+I8lHVlMOAMBqLPIn5O9P8k9JfqaqHquq1yd5e5KXVdUjSV42bQMArI3q7n072eV1uG+uW/ftfIzvoM+ku+jzH/G5M6adXNN7wbWy/o73sZzp07XIsWY8BgCGJOQAAEMScgCAIQk5AMCQhBwAYEhCDgAwJCEHABiSkAMADEnIAQCGJOQAAEOyrMMBYgkARrMX39N7sayAa4pVO8jLX1jWAQA48IQcAGBIQg4AMCQhBwAYkpADAAxJyAEAhiTkAABDEnIAgCEJOQDAkIQcAGBIlnUAGJylKhiJZR0AgANPyAEAhiTkAABDEnIAgCEJOQDAkIQcAGBIQg4AMCQhBwAYkpADAAzp4rkLWFeLzhBq1k8OMtfJZvDvz0FlJAcAGJKQAwAMScgBAIYk5AAAQxJyAIAhCTkAwJCEHABgSEIOADAkIQcAGJKQAwAMybIO57Ep06AvOq1+sjnPic3hewpYZ0ZyAIAhXTDkVNXdVXWqqh7atu93q+o/qurB6e2Ve1smAMDOLDKS894kt51j/zu7+8bp7W9WWxYAwHIuGHK6+4Ekp/ehFgCAlVnmnpw3VtWnp5ezrlhZRQAAK7DbkPPHSX4qyY1JTib5o/MdWFVHq+pEVZ14Kk/u8nQAADuzq5DT3U909ze7+1tJ/iTJTT/g2Lu6+0h3HzmUS3dbJwDAjuwq5FTV1ds2fyXJQ+c7FgBgDhecDLCq3p/kliTPqarHkrwtyS1VdWOSTvLFJL+5hzUCAOzYBUNOd7/uHLvfswe1AACsjGUd1pClGgBYZ5vye8qyDgDAkIQcAGBIQg4AMCQhBwA
YkpADAAxJyAEAhiTkAABDEnIAgCEJOQDAkMx4vIbMYgzAOtuU31NGcgCAIQk5AMCQhBwAYEhCDgAwJCEHABiSkAMADEnIAQCGJOQAAEMScgCAIQk5AMCQNn5Zh48+/uDCx27KNNQAwPKM5AAAQxJyAIAhCTkAwJCEHABgSEIOADAkIQcAGJKQAwAMScgBAIYk5AAAQxJyAIAhbfyyDpZqAIDNt+gyTTe9/OsLf00jOQDAkIQcAGBIQg4AMCQhBwAYkpADAAxJyAEAhiTkAABDEnIAgCEJOQDAkDZ+xmMAYPMtuoLB5/u/Fv6aRnIAgCFdMORU1XVVdX9VPVxVn62qN037D1fVfVX1yPT+ir0vFwBgMYuM5Dyd5C3d/YIkL07yhqq6IcmdSY519/VJjk3bAABr4YIhp7tPdvcnp8dfS/JwkmuS3J7knumwe5K8eq+KBADYqR3dk1NVz0vyoiTHk1zV3SeTrSCU5MpVFwcAsFsLh5yqelaSDyZ5c3ef2cHnHa2qE1V14qk8uZsaAQB2bKGQU1WHshVw3tfdH5p2P1FVV08fvzrJqXN9bnff1d1HuvvIoVy6ipoBAC5okb+uqiTvSfJwd79j24fuTXLH9PiOJB9ZfXkAALuzyGSAL0nyG0k+U1UPTvvemuTtST5QVa9P8qUkr9mbEgEAdu6CIae7/zFJnefDt662HACA1aju3r+TVf1nkn8/a/dzknxl34pgGXq1GfRpM+jTZtCn9fMT3f3cRQ7c15BzzgKqTnT3kVmLYCF6tRn0aTPo02bQp81m7SoAYEhCDgAwpHUIOXfNXQAL06vNoE+bQZ82gz5tsNnvyQEA2AvrMJIDALByQg4AMCQhBwAYkpADAAxJyAEAhvT/QuVwhyln7hEAAAAASUVORK5CYII=\n", 381 | "text/plain": [ 382 | "
" 383 | ] 384 | }, 385 | "metadata": { 386 | "needs_background": "light" 387 | }, 388 | "output_type": "display_data" 389 | } 390 | ], 391 | "source": [ 392 | "print(batch_y)\n", 393 | "plt.matshow(batch_x[0].T)" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": {}, 399 | "source": [ 400 | "In a for or while loop it will continue yielding new batches." 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 14, 406 | "metadata": {}, 407 | "outputs": [ 408 | { 409 | "name": "stdout", 410 | "output_type": "stream", 411 | "text": [ 412 | "[2 1 2 2]\n", 413 | "[2 1 1 2]\n", 414 | "[1 2 1 1]\n", 415 | "[2 1 1 2]\n", 416 | "[1 1 1 1]\n", 417 | "[1 2 2 1]\n", 418 | "[2 1 2 1]\n", 419 | "[2 1 2 2]\n", 420 | "[2 1 2 2]\n", 421 | "[1 2 2 1]\n" 422 | ] 423 | } 424 | ], 425 | "source": [ 426 | "for i in range(10):\n", 427 | " batch_x, batch_y = sm_gn.next()\n", 428 | " print(batch_y)" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "# Hetero Generator\n", 436 | "The heterogenerator is a special SMILES generator that returns SMILES for both the input to the encoder and teacher forcing of the decoder, as well as the output from the decoder.\n", 437 | "\n", 438 | "Reference: https://www.mdpi.com/2218-273X/8/4/131\n", 439 | "\n", 440 | "Blog-post: https://www.wildcardconsulting.dk/learn-how-to-improve-smiles-based-molecular-autoencoders-with-heteroencoders/" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 15, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "from molvecgen.generators import HetSmilesGenerator" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 16, 455 | "metadata": {}, 456 | "outputs": [ 457 | { 458 | "name": "stdout", 459 | "output_type": "stream", 460 | "text": [ 461 | "True\n", 462 | "False\n" 463 | ] 464 | } 465 | ], 466 | "source": [ 467 | "#If settings on generator is the same, it can be reused, otherwise 
recreate\n", 468 | "import copy\n", 469 | "vect1 = sm_en\n", 470 | "vect2 = copy.deepcopy(sm_en)\n", 471 | "vect2.augment = False # Set the augment to be false for testing purposes\n", 472 | "vect2.leftpad = False # Set the order of the SMILES to be from left to right\n", 473 | "\n", 474 | "print(vect1.augment)\n", 475 | "print(vect2.augment) " 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 17, 481 | "metadata": {}, 482 | "outputs": [ 483 | { 484 | "data": { 485 | "text/plain": [ 486 | "False" 487 | ] 488 | }, 489 | "execution_count": 17, 490 | "metadata": {}, 491 | "output_type": "execute_result" 492 | } 493 | ], 494 | "source": [ 495 | "vect2.leftpad" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": 18, 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [ 504 | "batchgen = HetSmilesGenerator(mols, None, vect1, vect2, batch_size=3) #Y is None" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 19, 510 | "metadata": {}, 511 | "outputs": [], 512 | "source": [ 513 | "_input, _output = batchgen.next()" 514 | ] 515 | }, 516 | { 517 | "cell_type": "markdown", 518 | "metadata": {}, 519 | "source": [ 520 | "The first input is one non-canonical form of the SMILES string. The second input and output are another non-canonical SMILES string of the same molecule, offset by a single character." 
521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": 20, 526 | "metadata": {}, 527 | "outputs": [ 528 | { 529 | "data": { 530 | "text/plain": [ 531 | "" 532 | ] 533 | }, 534 | "execution_count": 20, 535 | "metadata": {}, 536 | "output_type": "execute_result" 537 | }, 538 | { 539 | "data": { 540 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjkAAAECCAYAAAAcvsaeAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvDW2N/gAADcxJREFUeJzt3V+MpXdZB/DvY7ttBWzoCm36T0FTDb3AJdm0JHhRqdiCxGIiCURNL0jWC0ggwZjKDWpighcCN8SkSkMvEG0EbGOMtVkx1cSsLLBCSYVWglB30xVXUpRYWni8mBcYypY5M+fMvHN+8/kkmznvO++c95nznDnz3d955/er7g4AwGh+aO4CAAB2g5ADAAxJyAEAhiTkAABDEnIAgCEJOQDAkGYNOVV1a1V9rqoerao75qyF76qqu6rqbFU9tGnf4ap6oKoemT5eNmeNJFV1bVV9rKoerqrPVtVbp/16tc9U1SVV9c9V9S9Tr3532v/iqjox9erPq+qiuWslqaoLqupTVfVX07Y+ranZQk5VXZDkfUleneT6JG+squvnqofv8YEktz5j3x1Jjnf3dUmOT9vM6+kkb+/ulyR5eZI3Tz9DerX/PJnkld39M0mOJLm1ql6e5A+SvGfq1X8nedOMNfJdb03y8KZtfVpTc47k3JDk0e7+Qnd/I8mfJbltxnqYdPeDSc49Y/dtSe6ebt+d5HV7WhTfp7vPdPcnp9tfy8aL8tXRq32nN/zPtHlo+tdJXpnkL6b9erUPVNU1SX4xyZ9M2xV9Wltzhpyrk3x50/Zj0z72pyu6+0yy8cs1yeUz18MmVfWiJC9LciJ6tS9Nb4GcSnI2yQNJ/i3JV7v76ekQr4H7w3uT/FaSb03bPxp9Wltzhpw6zz5rTMA2VdXzknw4ydu6+4m56+H8uvub3X0kyTXZGMl+yfkO29uq2KyqXpvkbHd/YvPu8xyqT2viwhnP/ViSazdtX5Pk9Ey1sLXHq+rK7j5TVVdm43+jzKyqDmUj4Hywuz8y7darfay7v1pVf5+N66ieX1UXTqMEXgPn94okv1RVr0lySZJLszGyo09ras6RnI8nuW66av2iJG9Ict+M9fCD3Zfk9un27UnunbEW8p1rBd6f5OHufvemT+nVPlNVL6yq50+3fzjJz2fjGqqPJfmV6TC9mll3/3Z3X9PdL8rG76S/6+5fjT6trZpzFfIpLb83yQVJ7uru35+tGL6jqj6U5KYkL0jyeJJ3JvnLJPck+bEkX0ry+u5+5sXJ7KGq+tkk/5DkM/nu9QPvyMZ1OXq1j1TVS7NxweoF2fjP5T3d/XtV9RPZ+KOLw0k+leTXuvvJ+Srl26rqpiS/2d2v1af1NWvIAQDYLWY8BgCGJOQAAEMScgCAIQk5AMCQhBwAYEizh5yqOjZ3DSxGr9aDPq0HfVoP+rTeZg85STyB1oderQd9Wg/6tB70aY3th5ADALByezoZ4EV1cV+S537PvqfyZA7l4j2rgZ07KL36qZd+feFjP//p5+xiJTtzUPq07vZrnxZ9/u/H5/5u2Os+efy39n/533yjnzzfwqnfZ09DzqV1uG+sm/fsfLAT958+tfCxt1x1ZBcrgb236PPfc393ePy3dqKP54k+t1DIWertqqq6tao+
V1WPVtUdy9wXAMAq7TjkVNUFSd6X5NVJrk/yxqq6flWFAQAsY5mRnBuSPNrdX+jub2RjhdbbVlMWAMBylgk5Vyf58qbtx6Z936OqjlXVyao6+VSsTA8A7I1lQs75Lvr5vquYu/vO7j7a3Uf3418SAABjWibkPJbk2k3b1yQ5vVw5AACrsUzI+XiS66rqxVV1UZI3JLlvNWUBACznwp1+YXc/XVVvSXJ/kguS3NXdn11ZZQAASzAZIACwNvZsMkAAgP1KyAEAhiTkAABDEnIAgCEJOQDAkIQcAGBIQg4AMCQhBwAYkpADAAxpx8s6jO7+06cWOu6Wq47Mep+sD/1nJIs+nxPPaeZjJAcAGJKQAwAMScgBAIYk5AAAQxJyAIAhCTkAwJCEHABgSEIOADAkIQcAGJKQAwAMqbp7z052aR3uG+vmPTsfADCWE308T/S5WuRYIzkAwJCEHABgSEIOADAkIQcAGJKQAwAMScgBAIYk5AAAQxJyAIAhCTkAwJCEHABgSBfOXQB75/7TpxY67parjgx5fg6uRZ97ieff3LxOsEpGcgCAIQk5AMCQhBwAYEhCDgAwJCEHABiSkAMADEnIAQCGJOQAAEMScgCAIVV379nJLq3DfWPdvGfnYz2ZnZbReE7Py+M/lhN9PE/0uVrkWCM5AMCQhBwAYEhLLdBZVV9M8rUk30zydHcfXUVRAADLWsUq5D/X3V9Zwf0AAKyMt6sAgCEtG3I6yd9W1Seq6tgqCgIAWIVl3656RXefrqrLkzxQVf/a3Q9uPmAKP8eS5JI8Z8nTAQAsZqmRnO4+PX08m+SjSW44zzF3dvfR7j56KBcvczoAgIXtOORU1XOr6ke+fTvJLyR5aFWFAQAsY5m3q65I8tGq+vb9/Gl3/81KqgIAWJJlHWCNLTpd/ahT1R/07x8OIss6AAAHnpADAAxJyAEAhiTkAABDEnIAgCEJOQDAkIQcAGBIQg4AMCQhBwAYkpADAAxpmbWrGNSiU+Unpsuf20F//A/69w9zWZffE0ZyAIAhCTkAwJCEHABgSEIOADAkIQcAGJKQAwAMScgBAIYk5AAAQxJyAIAhHagZj9dlhsa5bed795gCHDzr8npuJAcAGJKQAwAMScgBAIYk5AAAQxJyAIAhCTkAwJCEHABgSEIOADAkIQcAGJKQAwAM6UAt67Au01CvE48prNaiS6X42YOtGckBAIYk5AAAQxJyAIAhCTkAwJCEHABgSEIOADAkIQcAGJKQAwAMScgBAIYk5AAAQzpQyzoA7HeLLtew6PIP27lPdodezcdIDgAwpC1DTlXdVVVnq+qhTfsOV9UDVfXI9PGy3S0TAGB7FhnJ+UCSW5+x744kx7v7uiTHp20AgH1jy5DT3Q8mOfeM3bcluXu6fXeS1624LgCApez0mpwruvtMkkwfL19dSQAAy9v1v66qqmNJjiXJJXnObp8OACDJzkdyHq+qK5Nk+nj22Q7s7ju7+2h3Hz2Ui3d4OgCA7dlpyLkvye3T7duT3LuacgAAVmORPyH/UJJ/SvLTVfVYVb0pybuSvKqqHknyqmkbAGDfqO7es5NdWof7xrp5z863F3ZjJsvt3Od2jDiTpplEWbXd+Pk76M+9uV8nt/P4L3q/69TTuZ/Tq35MT/TxPNHnapFjzXgMAAxJyAEAhiTkAABDEnIAgCEJOQDAkIQcAGBIQg4AMCQhBwAYkpADAAxJyAEAhmRZB9hn5p4Cf52s09T6wGpY1gEAOPCEHABgSEIOADAkIQcAGJKQAwAMScgBAIYk5AAAQxJyAIAhCTkAwJCEHABgSBfOXQDslUWXNph7qYDdOP/c3xOMZjeWX2H1jOQAAEMScgCAIQk5AMCQhBwAYEhCDgAwJCEHABiSkAMADEnIAQCGJOQAAEMy4zEHhllHgVXxerIejOQAAEMScgCAIQk5AMCQhBwAYEhCDgAwJCEHABiSkAMADEnIAQCGJOQAAEMScgCAIQk5AMCQhBwA
YEhbhpyququqzlbVQ5v2/U5V/UdVnZr+vWZ3ywQA2J5FRnI+kOTW8+x/T3cfmf799WrLAgBYzpYhp7sfTHJuD2oBAFiZZa7JeUtVfXp6O+uylVUEALACOw05f5TkJ5McSXImyR8+24FVdayqTlbVyafy5A5PBwCwPTsKOd39eHd/s7u/leSPk9zwA469s7uPdvfRQ7l4p3UCAGzLjkJOVV25afOXkzz0bMcCAMzhwq0OqKoPJbkpyQuq6rEk70xyU1UdSdJJvpjkN3axRgCAbdsy5HT3G8+z+/27UAsAwMpsGXKAZ3f/6VMLH3vLVUd2sRLgIPHasxjLOgAAQxJyAIAhCTkAwJCEHABgSEIOADAkIQcAGJKQAwAMScgBAIYk5AAAQzLjMSzhIM8kyvowO+549GkxRnIAgCEJOQDAkIQcAGBIQg4AMCQhBwAYkpADAAxJyAEAhiTkAABDEnIAgCEJOQDAkNZ+WQfTlQP8YF77OKiM5AAAQxJyAIAhCTkAwJCEHABgSEIOADAkIQcAGJKQAwAMScgBAIYk5AAAQxJyAIAhrf2yDqYrB4D1t+gyTTfc8vWF79NIDgAwJCEHABiSkAMADEnIAQCGJOQAAEMScgCAIQk5AMCQhBwAYEhCDgAwpLWf8RgAWH+LrmDw+f6vhe/TSA4AMKQtQ05VXVtVH6uqh6vqs1X11mn/4ap6oKoemT5etvvlAgAsZpGRnKeTvL27X5Lk5UneXFXXJ7kjyfHuvi7J8WkbAGBf2DLkdPeZ7v7kdPtrSR5OcnWS25LcPR12d5LX7VaRAADbta1rcqrqRUleluREkiu6+0yyEYSSXL7q4gAAdmrhkFNVz0vy4SRv6+4ntvF1x6rqZFWdfCpP7qRGAIBtWyjkVNWhbAScD3b3R6bdj1fVldPnr0xy9nxf2913dvfR7j56KBevomYAgC0t8tdVleT9SR7u7ndv+tR9SW6fbt+e5N7VlwcAsDOLTAb4iiS/nuQzVXVq2veOJO9Kck9VvSnJl5K8fndKBADYvi1DTnf/Y5J6lk/fvNpyAABWo7p7705W9Z9J/v0Zu1+Q5Ct7VgTL0Kv1oE/rQZ/Wgz7tPz/e3S9c5MA9DTnnLaDqZHcfnbUIFqJX60Gf1oM+rQd9Wm/WrgIAhiTkAABD2g8h5865C2BherUe9Gk96NN60Kc1Nvs1OQAAu2E/jOQAAKyckAMADEnIAQCGJOQAAEMScgCAIf0/PUJR254mWFIAAAAASUVORK5CYII=\n", 541 | "text/plain": [ 542 | "
" 543 | ] 544 | }, 545 | "metadata": { 546 | "needs_background": "light" 547 | }, 548 | "output_type": "display_data" 549 | }, 550 | { 551 | "data": { 552 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAi4AAAECCAYAAADZzFwPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvDW2N/gAADbBJREFUeJzt3V+M5WdZB/Dv43bbWrChC22z/aOgKYZe4JJMWhK8qFRsQWJrIgmNml6QrBeQQIIxlRvUxAQvBG6ISZWGXiBKCkhjjGtbaqqJqWxhLSUVWglC3U2XWgg1JKUtjxdzoNPd2Z2zM+ffO/v5JJM5v3femd+z+5458533nHl+1d0BABjBTy27AACAaQkuAMAwBBcAYBiCCwAwDMEFABiG4AIADGOpwaWqbqyqr1XV41V12zJr4UVVdUdVHa+qRzaM7auqe6rqscn7i5ZZI0lVXVlV91fVo1X11ap672TcWq2Yqjq/qv69qv5jslZ/PBl/TVU9OFmrv62qc5ddK0lV7amqL1fV30+OrdMKWVpwqao9ST6W5K1Jrk5yS1Vdvax6eIlPJLnxhLHbktzX3VcluW9yzHI9n+T93f26JG9M8u7J95C1Wj3PJnlzd/9SkgNJbqyqNyb5syQfmazVd5O8a4k18qL3Jnl0w7F1WiHL3HG5Jsnj3f2N7v5hkr9JctMS62Giux9I8vQJwzcluXNy+84kNy+0KE7S3ce6+0uT289k/YH28lirldPr/m9yuHfy1knenOSuybi1WgFVdUWSX0/yV5PjinVaKcsMLpcn+faG4ycmY6ymS7v7WLL+AzPJJUuuhw2q6tVJ3pDkwVirlTR5+uFIkuNJ7knyX0m+193PT6Z4DFwNH03yB0l+NDl+ZazTSllmcKlNxlx/AM5QVb08yWeSvK+7v7/sethcd7/Q3QeSXJH1HefXbTZtsVWxUVW9Pcnx7n5o4/AmU63TEp2zxHM/keTKDcdXJDm6pFrY2pNVtb+7j1XV/qz/1siSVdXerIeWT3b3ZyfD1mqFdff3quqfs/66pFdU1TmT3+Y9Bi7fm5L8RlW9Lcn5SS7M+g6MdVohy9xx+WKSqyav1j43yTuT3L3Eeji9u5PcOrl9a5LPL7EW8pPn3j+e5NHu/vCGD1mrFVNVF1fVKya3fzrJr2b9NUn3J/mtyTRrtWTd/YfdfUV3vzrrP5O+0N2/Heu0UmqZV4eepNqPJtmT5I7u/tOlFcNPVNWnklyX5FVJnkzywSR/l+TTSX42ybeSvKO7T3wBLwtUVb+c5F+SfCUvPh//gay/zsVarZCqen3WX9S5J+u/MH66u/+kqn4+63+YsC/Jl5P8Tnc/u7xK+bGqui7J73f3263TallqcAEAOBM65wIAwxBcAIBhCC4AwDAEFwBgGIILADCMpQeXqjq47BqYjrUag3Uag3Uag3VaPUsPLkncKcZhrcZgncZgncZgnVbMKgQXAICpLLQB3bl1Xp+fl71k7Lk8m705b2E1sH07XavXvv4HU837+sMXbPsc+J5aRZvd97/zvy/k4lfuOWnc/X/2dvLY4/tpcZ7Jd5/q7ou3mrfQ4HJh7etr6/qFnY/Vcujokanm3XDZgTlXAos17X0/cf+fB489Y7i373qou9e2mrejp4qq6saq+lpVPV5Vt+3kawEAbGXbwaWq9iT5WJK3Jrk6yS1VdfWsCgMAONFOdlyuSfJ4d3+ju3+Y9Stn3jSbsgAATraT4HJ5km9vOH5iMvYSVXWwqg5X1eHn4irgAMD27SS41CZjJ73St7tv7+617l7zymwAYCd2ElyeSHLlh
uMrkhzdWTkAAKe2k+DyxSRXVdVrqurcJO9McvdsygIAONk52/3E7n6+qt6T5FCSPUnu6O6vzqwyAIATaEAHACzdQhrQAQAskuACAAxDcAEAhiG4AADDEFwAgGEILgDAMAQXAGAYggsAMAzBBQAYxrZb/u92h44emWreDZcdmHMlnI51Yrdxn4bTs+MCAAxDcAEAhiG4AADDEFwAgGEILgDAMAQXAGAYggsAMAzBBQAYhuACAAxDcAEAhlHdvbCTXVj7+tq6fmHnAwDGcG/f9VB3r201z44LADAMwQUAGIbgAgAMQ3ABAIYhuAAAwxBcAIBhCC4AwDAEFwBgGIILADAMwQUAGMY5yy6Akx06emTquTdcdmCOlcDpTXtfdT9dLo8p7CZ2XACAYQguAMAwBBcAYBiCCwAwDMEFABiG4AIADENwAQCGIbgAAMMQXACAYVR3L+xkF9a+vrauX9j5eCldTtlNdINdLv//zNq9fddD3b221Tw7LgDAMAQXAGAYO7rIYlV9M8kzSV5I8vw0WzwAANs1i6tD/0p3PzWDrwMAcFqeKgIAhrHT4NJJ/qmqHqqqg7MoCADgVHb6VNGbuvtoVV2S5J6q+s/ufmDjhEmgOZgk5+eCHZ4OADib7WjHpbuPTt4fT/K5JNdsMuf27l7r7rW9OW8npwMAznLbDi5V9bKq+pkf307ya0kemVVhAAAn2slTRZcm+VxV/fjr/HV3/+NMqgIA2ISW/3CCs72V+dn+7weWQ8t/AGDXEVwAgGEILgDAMAQXAGAYggsAMAzBBQAYhuACAAxDcAEAhiG4AADDEFwAgGHs5FpFRHv03ehsX6ez/d8Py+LnyXTsuAAAwxBcAIBhCC4AwDAEFwBgGIILADAMwQUAGIbgAgAMQ3ABAIYhuAAAw9A5d4fOpHvhtF0R5/E1z/TrArBYHqOnY8cFABiG4AIADENwAQCGIbgAAMMQXACAYQguAMAwBBcAYBiCCwAwDMEFABiG4AIADEPL/wWaRztnLaJhay6NAbuHHRcAYBiCCwAwDMEFABiG4AIADENwAQCGIbgAAMMQXACAYQguAMAwBBcAYBiCCwAwDC3/gV3vTNr4T3t5AJcGWC6XcTh72XEBAIaxZXCpqjuq6nhVPbJhbF9V3VNVj03eXzTfMgEApttx+USSG08Yuy3Jfd19VZL7JscAAHO1ZXDp7geSPH3C8E1J7pzcvjPJzTOuCwDgJNt9jcul3X0sSSbvL5ldSQAAm5v7XxVV1cEkB5Pk/Fww79MBALvYdndcnqyq/UkyeX/8VBO7+/buXuvutb05b5unAwDYfnC5O8mtk9u3Jvn8bMoBADi1af4c+lNJ/i3JL1bVE1X1riQfSvKWqnosyVsmxwAAc1XdvbCTXVj7+tq6fqq5Z9IVcR7O5k6LOlIya/P4fj7b73vz+j6dR+fg3diNeNn36d34f3pv3/VQd69tNU/nXABgGIILADAMwQUAGIbgAgAMQ3ABAIYhuAAAwxBcAIBhCC4AwDAEFwBgGIILADCMlW35D7O07PboIxmpRTiwe2j5DwDsOoILADAMwQUAGIbgAgAMQ3ABAIYhuAAAwxBcAIBhCC4AwDAEFwBgGIILADCMc5ZdAGObVyv9WZvXubXHh9kZ5fGE5bLjAgAMQ3ABAIYhuAAAwxBcAIBhCC4AwDAEFwBgGIILADAMwQUAGIbgAgAMQ+dcdkT3SmBWPJ4wDTsuAMAwBBcAYBiCCwAwDMEFABiG4AIADENwAQCGIbgAAMMQXACAYQguAMAwBBcAYBiCCwAwDMEFABjGlsGlqu6oquNV9ciGsT+qqv+pqiOTt7fNt0wAgOl2XD6R5MZNxj/S3Qcmb/8w27IAAE62ZXDp7geSPL2AWgAATmsnr3F5T1U9PHkq6aKZVQQAcArbDS5/keQXkhxIcizJn59qYlUdrKrDVXX4uTy7zdMBAGwzuHT3k939Qnf/KMlfJrnmNHNv7+617l7bm/O2WycAw
PaCS1Xt33D4m0keOdVcAIBZOWerCVX1qSTXJXlVVT2R5INJrquqA0k6yTeT/N4cawQASDJFcOnuWzYZ/vgcagEAOK0tgwss2qGjR6aee8NlB+ZYCXA28dgzBi3/AYBhCC4AwDAEFwBgGIILADAMwQUAGIbgAgAMQ3ABAIYhuAAAwxBcAIBh6JzLytGRkhHosrr7WKcx2HEBAIYhuAAAwxBcAIBhCC4AwDAEFwBgGIILADAMwQUAGIbgAgAMQ3ABAIYhuAAAw1hoy//Xvv4HOXRoujbZWi8Dq8xjFCyHHRcAYBiCCwAwDMEFABiG4AIADENwAQCGIbgAAMMQXACAYQguAMAwBBcAYBiCCwAwjIW2/P/6wxdokw0AC3Lo6HSX2VkFe/ZPN8+OCwAwDMEFABiG4AIADENwAQCGIbgAAMMQXACAYQguAMAwBBcAYBiCCwAwjIV2zgUAFmesbvWPTzXLjgsAMIwtg0tVXVlV91fVo1X11ap672R8X1XdU1WPTd5fNP9yAYCz2TQ7Ls8neX93vy7JG5O8u6quTnJbkvu6+6ok902OAQDmZsvg0t3HuvtLk9vPJHk0yeVJbkpy52TanUlunleRAADJGb7GpapeneQNSR5Mcml3H0vWw02SS2ZdHADARlMHl6p6eZLPJHlfd3//DD7vYFUdrqrDz+XZ7dQIAJBkyuBSVXuzHlo+2d2fnQw/WVX7Jx/fn+T4Zp/b3bd391p3r+3NebOoGQA4S03zV0WV5ONJHu3uD2/40N1Jbp3cvjXJ52dfHgDAi6ZpQPemJL+b5CtVdWQy9oEkH0ry6ap6V5JvJXnHfEoEAFi3ZXDp7n9NUqf48PWzLQcA4NSquxd3sqrvJPnvE4ZfleSphRXBTlirMVinMVinMVinxfm57r54q0kLDS6bFlB1uLvXlloEU7FWY7BOY7BOY7BOq8e1igCAYQguAMAwViG43L7sApiatRqDdRqDdRqDdVoxS3+NCwDAtFZhxwUAYCqCCwAwDMEFABiG4AIADENwAQCG8f9bslFtp/cjagAAAABJRU5ErkJggg==\n", 553 | "text/plain": [ 554 | "
" 555 | ] 556 | }, 557 | "metadata": { 558 | "needs_background": "light" 559 | }, 560 | "output_type": "display_data" 561 | }, 562 | { 563 | "data": { 564 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAi4AAAECCAYAAADZzFwPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvDW2N/gAADaFJREFUeJzt3V+MpXdZB/Dv43bbCtjQFdps/yhoqqEXsCSTlgQvKhVbkFhMJIGo6QXJegEJJBhTuUFNTPBC4IaYVNu0F4iSAtIY49qWmmpigC2sUFKxlSDU3XQlQKgxKS08XsyBDruznbMzZ857fjOfTzI5533nN/M+M8+ZM9/5nXd+b3V3AABG8BNTFwAAMC/BBQAYhuACAAxDcAEAhiG4AADDEFwAgGFMGlyq6uaq+kpVPV5Vt01ZC8+pqjur6nRVPbJh36Gquq+qHpvdXjpljSRVdXVVPVhVj1bVl6vqXbP9erViquriqvpsVf3brFd/NNv/8qr6zKxXf1NVF05dK0lVHaiqL1TV38229WmFTBZcqupAkg8neUOSa5O8raqunaoefsxdSW4+Y99tSR7o7muSPDDbZlrPJnlPd78iyWuSvGP2M6RXq+fpJK/r7lclOZLk5qp6TZI/TfLBWa++neTtE9bIc96V5NEN2/q0QqaccbkuyePd/dXu/l6Sv05yy4T1MNPdDyX51hm7b0ly9+z+3UnevNSiOEt3n+ruz8/uP5X1J9oro1crp9f972zz4Oytk7wuyT2z/Xq1AqrqqiS/luQvZ9sVfVopUwaXK5N8Y8P2E7N9rKbLu/tUsv4LM8llE9fDBlX1siSvTvKZ6NVKmr38cCLJ6ST3JfnPJN/p7mdnQzwHroYPJfn9JD+Ybf909GmlTBlcapN9rj8A56mqXpTk40ne3d3fnboeNtfd3+/uI0muyvqM8ys2G7bcqtioqt6U5HR3P7xx9yZD9WlCF0x47CeSXL1h+6okJyeqha09WVWHu/tUVR3O+l+NTKyqDmY9tHykuz8x261XK6y7v1NV/5T185JeXFUXzP6a9xw4vdcm+fWqemOSi5NckvUZGH1aIVPOuHwuyTWzs7UvTPLWJPdOWA/P794kt87u35rkUxPWQn702vsdSR7t7g9seJderZiqemlVvXh2/yeT/ErWz0l6MMlvzobp1cS6+w+6+6ruflnWfyd9urt/K/q0UmrKq0PPUu2HkhxIcmd3/8lkxfAjVfXRJDckeUmSJ5O8L8nfJvlYkp9J8vUkb+nuM0/gZYmq6peS/HOSL+W51+Pfm/XzXPRqhVTVK7N+UueBrP/B+LHu/uOq+rms/2PCoSRfSPLb3f30dJXyQ1V1Q5Lf6+436dNqmTS4AACcDyvnAgDDEFwAgGEILgDAMAQXAGAYggsAMIzJg0tVHZ26BuajV2PQpzHo0xj0afVMHlySeFCMQ6/GoE9j0Kcx6NOKWYXgAgAwl6UuQHdhXdQX54U/tu+ZPJ2DuWhpNbB9m/XqF175f3N//H988QWLLolN+Jlajp0+9vVpZ5b13KNPy/NUvv3N7n7pVuOWGlwuqUN9fd24tOOx+46dPDH32JuuOLKLlcByeexPy/d/77m/73m4u9e2Grejl4qq6uaq+kpVPV5Vt+3kcwEAbGXbwaWqDiT5cJI3JLk2yduq6tpFFQYAcKadzLhcl+Tx7v5qd38v61fOvGUxZQEAnG0nweXKJN/YsP3EbN+PqaqjVXW8qo4/E1cBBwC2byfBpTbZd9aZvt19e3evdfeaM7MBgJ3YSXB5IsnVG7avSnJyZ+UAAJzbToLL55JcU
1Uvr6oLk7w1yb2LKQsA4GwXbPcDu/vZqnpnkmNJDiS5s7u/vLDKAADOYAE6AGByS1mADgBgmQQXAGAYggsAMAzBBQAYhuACAAxDcAEAhiG4AADDEFwAgGEILgDAMLa95P+Ijp08MffYm644souV8Hz0ib3GYxoWx4wLADAMwQUAGIbgAgAMQ3ABAIYhuAAAwxBcAIBhCC4AwDAEFwBgGIILADAMwQUAGEZ199IOdkkd6uvrxqUdDwAYw/19z8PdvbbVODMuAMAwBBcAYBiCCwAwDMEFABiG4AIADENwAQCGIbgAAMMQXACAYQguAMAwBBcAYBgXTF3AfnLs5Im5xt10xZFdrgTObd7HaeKxOiV9Yr8y4wIADENwAQCGIbgAAMMQXACAYQguAMAwBBcAYBiCCwAwDMEFABiG4AIADKO6e2kHu6QO9fV149KOtx9YPZO9xgrT0/L9Zyr39z0Pd/faVuPMuAAAwxBcAIBh7Ogii1X1tSRPJfl+kmfnmeIBANiuRVwd+pe7+5sL+DwAAM/LS0UAwDB2Glw6yT9W1cNVdXQRBQEAnMtOXyp6bXefrKrLktxXVf/e3Q9tHDALNEeT5OK8YIeHAwD2sx3NuHT3ydnt6SSfTHLdJmNu7+617l47mIt2cjgAYJ/bdnCpqhdW1U/98H6SX03yyKIKAwA4005eKro8ySer6oef56+6+x8WUhUAwCYs+c++sN8vjbDfv35g9VnyHwDYcwQXAGAYggsAMAzBBQAYhuACAAxDcAEAhiG4AADDEFwAgGEILgDAMAQXAGAYO7lW0Z427xLplkcfw37v037/+mFKfp8slhkXAGAYggsAMAzBBQAYhuACAAxDcAEAhiG4AADDEFwAgGEILgDAMAQXAGAYVs49h3lXMJx3RcSpPycA0/A8vVhmXACAYQguAMAwBBcAYBiCCwAwDMEFABiG4AIADENwAQCGIbgAAMMQXACAYQguAMAwLPm/Q7uxlLPloWFrLo0B+5MZFwBgGIILADAMwQUAGIbgAgAMQ3ABAIYhuAAAwxBcAIBhCC4AwDAEFwBgGIILADAMS/4DQzqfZfxdHmAM+sQ8zLgAAMPYMrhU1Z1VdbqqHtmw71BV3VdVj81uL93dMgEA5ptxuSvJzWfsuy3JA919TZIHZtsAALtqy+DS3Q8l+dYZu29Jcvfs/t1J3rzgugAAzrLdc1wu7+5TSTK7vWxxJQEAbG7X/6uoqo4mOZokF+cFu304AGAP2+6My5NVdThJZrenzzWwu2/v7rXuXjuYi7Z5OACA7QeXe5PcOrt/a5JPLaYcAIBzm+ffoT+a5F+T/GJVPVFVb0/y/iSvr6rHkrx+tg0AsKuqu5d2sLVXXdyfPXb10o53pv2+0uK8q1Lu9+8T8zuflU7ntd8ff7vxc7obK9LuxVVud+PxnOzv7+n5uL/vebi717YaZ+VcAGAYggsAMAzBBQAYhuACAAxDcAEAhiG4AADDEFwAgGEILgDAMAQXAGAYggsAMIylLvl/SR3q6+vGpR2PvW/q5dFHsReXBwf2Fkv+AwB7juACAAxDcAEAhiG4AADDEFwAgGEILgDAMAQXAGAYggsAMAzBBQAYhuACAAzjgqkLYPWcz5L3Uy8lvxvHn/prgr1mNy7Nwf5lxgUAGIbgAgAMQ3ABAIYhuAAAwxBcAIBhCC4AwDAEFwBgGIILADAMwQUAGIaVczmL1SuBRfKcwiKZcQEAhiG4AADDEFwAgGEILgDAMAQXAGAYggsAMAzBBQAYhuACAAxDcAEAhiG4AADDEFwAgGEILgDAMLYMLlV1Z1WdrqpHNuz7w6r676o6MXt74+6WCQAw34zLXUlu3mT/B7v7yOzt7xdbFgDA2bYMLt39UJJvLaEWAIDntZNzXN5ZVV+cvZR06cIqAgA4h+0Glz9P8vNJjiQ5leTPzjWwqo5W1fGqOv5Mnt7m4QAAthlcuvvJ7v5+d/8gyV8kue55xt7e3WvdvXYwF223T
gCA7QWXqjq8YfM3kjxyrrEAAItywVYDquqjSW5I8pKqeiLJ+5LcUFVHknSSryX53V2sEQAgyRzBpbvftsnuO3ahFgCA57VlcIFFOXbyxFzjbrriyC5XAuwnnnv2Fkv+AwDDEFwAgGEILgDAMAQXAGAYggsAMAzBBQAYhuACAAxDcAEAhiG4AADDsHIuS2NVSkZgldW9R6/2FjMuAMAwBBcAYBiCCwAwDMEFABiG4AIADENwAQCGIbgAAMMQXACAYQguAMAwBBcAYBiW/AfYwPLwsNrMuAAAwxBcAIBhCC4AwDAEFwBgGIILADAMwQUAGIbgAgAMQ3ABAIYhuAAAwxBcAIBhWPIfAAZz7OSJqUtYuAOH5xtnxgUAGIbgAgAMQ3ABAIYhuAAAwxBcAIBhCC4AwDAEFwBgGIILADAMwQUAGIaVcwFgMDddcWTqEnbB43ONMuMCAAxjy+BSVVdX1YNV9WhVfbmq3jXbf6iq7quqx2a3l+5+uQDAfjbPjMuzSd7T3a9I8pok76iqa5PcluSB7r4myQOzbQCAXbNlcOnuU939+dn9p5I8muTKJLckuXs27O4kb96tIgEAkvM8x6WqXpbk1Uk+k+Ty7j6VrIebJJctujgAgI3mDi5V9aIkH0/y7u7+7nl83NGqOl5Vx5/J09upEQAgyZzBpaoOZj20fKS7PzHb/WRVHZ69/3CS05t9bHff3t1r3b12MBctomYAYJ+a57+KKskdSR7t7g9seNe9SW6d3b81yacWXx4AwHPmWYDutUl+J8mXqurEbN97k7w/yceq6u1Jvp7kLbtTIgDAui2DS3f/S5I6x7tvXGw5AADnVt29vINV/U+S/zpj90uSfHNpRbATejUGfRqDPo1Bn5bnZ7v7pVsNWmpw2bSAquPdvTZpEcxFr8agT2PQpzHo0+pxrSIAYBiCCwAwjFUILrdPXQBz06sx6NMY9GkM+rRiJj/HBQBgXqsw4wIAMBfBBQAYhuACAAxDcAEAhiG4AADD+H9X6EkJQHtdjAAAAABJRU5ErkJggg==\n", 565 | "text/plain": [ 566 | "
" 567 | ] 568 | }, 569 | "metadata": { 570 | "needs_background": "light" 571 | }, 572 | "output_type": "display_data" 573 | } 574 | ], 575 | "source": [ 576 | "plt.matshow(_input[0][0].T)\n", 577 | "plt.matshow(_input[1][0].T)\n", 578 | "plt.matshow(_output[0].T)" 579 | ] 580 | }, 581 | { 582 | "cell_type": "markdown", 583 | "metadata": {}, 584 | "source": [ 585 | "# 2D embeddings with Chemception\n", 586 | "\n", 587 | "Chemception used small chemical \"images\" and is suitable for modelling using image architectures such as convolutional neural networks and Inception modules and similar\n", 588 | "\n", 589 | "Reference: https://arxiv.org/abs/1706.06689\n", 590 | "\n", 591 | "Blog-post: https://www.wildcardconsulting.dk/learn-how-to-teach-your-computer-to-see-chemistry-free-chemception-models-with-rdkit-and-keras/" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 21, 597 | "metadata": {}, 598 | "outputs": [], 599 | "source": [ 600 | "from molvecgen.vectorizers import ChemceptionVectorizer\n", 601 | "chemcepterizer = ChemceptionVectorizer()" 602 | ] 603 | }, 604 | { 605 | "cell_type": "markdown", 606 | "metadata": {}, 607 | "source": [ 608 | "### Preprocessing\n", 609 | "Molecules must have 2D coordinates and gasteiger charges computed. The preprocess function can do that" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": 22, 615 | "metadata": {}, 616 | "outputs": [], 617 | "source": [ 618 | "mols_array = chemcepterizer.preprocess_mols(mols)" 619 | ] 620 | }, 621 | { 622 | "cell_type": "markdown", 623 | "metadata": {}, 624 | "source": [ 625 | "Transform all molecules and show first three channels of the first molecule as an image. 
The augment property on the object controls if augmentation (rotation) should be active and can be overruled in the function call" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": 23, 631 | "metadata": {}, 632 | "outputs": [ 633 | { 634 | "name": "stderr", 635 | "output_type": "stream", 636 | "text": [ 637 | "Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).\n" 638 | ] 639 | }, 640 | { 641 | "data": { 642 | "text/plain": [ 643 | "" 644 | ] 645 | }, 646 | "execution_count": 23, 647 | "metadata": {}, 648 | "output_type": "execute_result" 649 | }, 650 | { 651 | "data": { 652 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAP4AAAD8CAYAAABXXhlaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvDW2N/gAADWtJREFUeJzt3V+sHOV9xvHvUxuXNAGBISALQ8GSlZCLYCKLgogqQpvIpShwUaqkreS2qOcmlYjaKoG2apOqlcpNoBdNIgtofNEGCDSAuChYLlZ7URnM35gYx4RSsOziVmAl6QWK4deLHVfHp8fePefM7p7j9/uRjnZnPLvzk2efnXln3n0nVYWktvzMtAuQNHkGX2qQwZcaZPClBhl8qUEGX2qQwZcatKTgJ9mSZH+SV5Pc3ldRksYri+3Ak2QV8APg08BB4Bng81X1/f7KkzQOq5fw2quAV6vqNYAk9wM3AScNfhK7CUpjVlUZtsxSDvUvAt6cNX2wmydpmVvKHn++b5X/t0dPMgPMLGE9knq2lOAfBC6eNb0eODR3oaraBmwDD/Wl5WIph/rPABuTXJZkDfA54LF+ypI0Tove41fVsSS/DzwBrALuq6qXe6tM0tgs+nLeolbmob40duM+qy9phTL4UoMMvtQggy81yOBLDTL4UoMMvtQggy81yOBLDTL4UoMMvtQggy81yOBLDTL4UoMMvtQggy81yOBLDTL4UoMMvtQggy81yOBLDTL4UoMMvtQggy81yOBLDRoa/CT3JTmSZO+seWuT7EhyoHs8d7xlSurTKHv8bwFb5sy7HdhZVRuBnd20pBViaPCr6l+At+fMvgnY3j3fDtzcc12SxmixbfwLq+owQPd4QX8lSRq3Rd8me1RJZoCZca9H0ugWu8d/K8k6gO7xyMkWrKptVbW5qjYvcl2SerbY4D8GbO2ebwUe7accSZOQqjr1Asm3geuA84G3gD8HHgEeBC4B3gBuqaq5JwDne69Tr0zSklVVhi0zNPh9MvjS+I0SfHvuSQ0y+FKDDL7UIIMvNcjgSw0y+FKDDL7UIIMvNcjgSw0y+FKDDL7UIIMvNcjgSw0y+FKDDL7UIIMvNcjgSw0y+FKDDL7UIIMvNcjgSw0y+FKDDL7UIIMvNcjgSw0aGvwkFyd5Ksm+JC8nua2bvzbJjiQHusdzx1+upD6Mcu+8dcC6qnouyVnAs8DNwG8Db1fVXye5HTi3qr485L28hZY0Zr3cQquqDlfVc93zHwP7gIuAm4Dt3WLbGXwZSFoBFtTGT3IpcCWwG7iwqg7D4
MsBuKDv4iSNx+pRF0zyIeBh4ItV9aNk6NHE8dfNADOLK0/SOIx0m+wkZwCPA09U1de6efuB66rqcHceYFdVfWTI+9jGl8aslzZ+Brv2e4F9x0PfeQzY2j3fCjy6mCIlTd4oZ/U/Cfwr8D3g/W72HzNo5z8IXAK8AdxSVW8PeS/3+NKYjbLHH+lQvy8GXxq/UYI/8sk9qf7y306Yzp9eM6VKtFR22ZUaZPClBtnG18jmbrzRenJo0nq5nCfp9GPwpQYZfKlBBl9qkMGXGmTwpQbZc2+Fm9ubjpP1puuh193ca0Sz120vvpXFPb7UIIMvNcjgSw2yy+4Kt9hutLNft5jXLOR1miy77Eqal8GXGuTlvBVoMYfpp3y/U10S9JLdack9vtQggy81yLP6K1DfPeZOdba+72aFxs+z+pLmZfClBhl8qUG28Ve4cffcs42/8vR177wzkzyd5MUkLyf5ajf/siS7kxxI8kCSNX0ULWn8RjnUfxe4vqquADYBW5JcDdwJ3FVVG4F3gFvHV6akPg3tuVeDtsBPuskzur8Crgd+o5u/HfgK8I3+S9RCnNAL71SX+kZc7oRLe6/87on/9tH7FlidlouRTu4lWZXkBeAIsAP4IXC0qo51ixwELhpPiZL6NlLwq+q9qtoErAeuAi6fb7H5XptkJsmeJHsWX6akPi3ocl5VHQV2AVcD5yQ53lRYDxw6yWu2VdXmqtq8lEIl9Wfo5bwkHwZ+WlVHk3wAeJLBib2twMNVdX+SbwIvVdXXh7yXl/OWCS/nnb5GuZw3SvA/zuDk3SoGRwgPVtVfJNkA3A+sBZ4Hfquq3h3yXgZ/mTD4p69egt8ng798GPzT1yjBdyCORp3sF3iAg280wL76UoMMvtQg2/iNGnVDLHbo7ZP1DBz1fMLc97DJMToH4pA0L4MvNcjgSw2yjd+IuWPnj9pmPmGDnWL8/ZHb7ot8D43ONr6keRl8qUEe6jeijzvderfclcFDfUnzMvhSgwy+1CCDLzXI4EsNMvhSgxyIoxVzeszVYnrMnaLXnVYW9/hSgwy+1CB77jXqhA1Rj5z4j7l53tfYU29lsOeepHkZfKlBBl9qkG18+au700yvbfzuVtnPJ3m8m74sye4kB5I8kGTNUoqVNDkLOdS/Ddg3a/pO4K6q2gi8A9zaZ2GSxmek4CdZD/wqcE83HeB64KFuke3A/NeAJC07o+7x7wa+BLzfTZ8HHK2qY930QeCinmuTNCZDg5/kRuBIVT07e/Y8i8574i7JTJI9SfYsskZJPRvlRzrXAp9NcgNwJnA2gyOAc5Ks7vb664FD8724qrYB28Cz+tJysaDLeUmuA/6oqm5M8h3g4aq6P8k3gZeq6utDXm/wl6HFjrmv5WncXXa/DPxBklcZtPnvXcJ7SZqgBf0ev6p2Abu6568BV/VfkqRxs+eedJrx13mS5mXwpQYZfKlBBl9qkMGXGmTwpQYZfKlBBl9qkMGXGmTwpQYZfKlBBl9qkMGXGmTwpQYZfKlBBl9qkMGXGmTwpQYZfKlBBl9qkMGXGmTwpQYZfKlBBl9q0Eh30knyOvBj4D3gWFVtTrIWeAC4FHgd+PWqemc8ZUrq00L2+J+qqk1Vtbmbvh3YWVUbgZ3dtKQVYCmH+jcB27vn24Gbl16OpEkYNfgFPJnk2SQz3bwLq+owQPd4wTgKlNS/Ue+We21VHUpyAbAjySujrqD7opgZuqCkiVnw3XKTfAX4CfB7wHVVdTjJOmBXVX1kyGu9W640Zr3cLTfJB5Ocdfw58BlgL/AYsLVbbCvw6OJLlTRJQ/f4STYA3+0mVwP/UFV/leQ84EHgEuAN4JaqenvIe7nHl8ZslD3+gg/1l8LgS+PXy6G+pNOPwZcaZPClBhl8qUEGX2qQwZcaZPClBhl8qUEGX2qQwZcaZPClBhl8qUEGX2qQwZcaZPClBhl8qUEGX2qQwZcaZPClBhl8qUEGX2qQwZcaZPClBhl8qUEGX2rQS
MFPck6Sh5K8kmRfkmuSrE2yI8mB7vHccRcrqR+j7vH/BvinqvoocAWwD7gd2FlVG4Gd3bSkFWCUm2aeDbwIbKhZCyfZj7fJlpadvu6dtwH4L+Dvkjyf5J7udtkXVtXhbkWHgQuWVK2kiRkl+KuBTwDfqKorgf9hAYf1SWaS7EmyZ5E1SurZKME/CBysqt3d9EMMvgje6g7x6R6PzPfiqtpWVZuranMfBUtauqHBr6r/BN5Mcrz9/kvA94HHgK3dvK3Ao2OpUFLvhp7cA0iyCbgHWAO8BvwOgy+NB4FLgDeAW6rq7SHv48k9acxGObk3UvD7YvCl8evrrL6k04zBlxpk8KUGGXypQQZfapDBlxpk8KUGrZ7w+v4b+A/g/O75NC2HGsA65rKOEy20jp8fZaGJduD5v5Ume6bdd3851GAd1jGtOjzUlxpk8KUGTSv426a03tmWQw1gHXNZx4nGUsdU2viSpstDfalBEw1+ki1J9id5NcnERuVNcl+SI0n2zpo38eHBk1yc5KluiPKXk9w2jVqSnJnk6SQvdnV8tZt/WZLdXR0PJFkzzjpm1bOqG8/x8WnVkeT1JN9L8sLxYeKm9BmZyFD2Ewt+klXA3wK/AnwM+HySj01o9d8CtsyZN43hwY8Bf1hVlwNXA1/o/g8mXcu7wPVVdQWwCdiS5GrgTuCuro53gFvHXMdxtzEYsv24adXxqaraNOvy2TQ+I5MZyr6qJvIHXAM8MWv6DuCOCa7/UmDvrOn9wLru+Tpg/6RqmVXDo8Cnp1kL8HPAc8AvMOgosnq+7TXG9a/vPszXA48DmVIdrwPnz5k30e0CnA38O925t3HWMclD/YuAN2dNH+zmTctUhwdPcilwJbB7GrV0h9cvMBgkdQfwQ+BoVR3rFpnU9rkb+BLwfjd93pTqKODJJM8mmenmTXq7TGwo+0kGf77hgJq8pJDkQ8DDwBer6kfTqKGq3quqTQz2uFcBl8+32DhrSHIjcKSqnp09e9J1dK6tqk8waIp+IckvTmCdcy1pKPuFmGTwDwIXz5peDxya4PrnGml48L4lOYNB6P++qv5xmrUAVNVRYBeDcw7nJDn++41JbJ9rgc8meR24n8Hh/t1TqIOqOtQ9HgG+y+DLcNLbZUlD2S/EJIP/DLCxO2O7BvgcgyG6p2Xiw4MnCXAvsK+qvjatWpJ8OMk53fMPAL/M4CTSU8CvTaqOqrqjqtZX1aUMPg//XFW/Oek6knwwyVnHnwOfAfYy4e1SkxzKftwnTeacpLgB+AGD9uSfTHC93wYOAz9l8K16K4O25E7gQPe4dgJ1fJLBYetLwAvd3w2TrgX4OPB8V8de4M+6+RuAp4FXge8APzvBbXQd8Pg06ujW92L39/Lxz+aUPiObgD3dtnkEOHccddhzT2qQPfekBhl8qUEGX2qQwZcaZPClBhl8qUEGX2qQwZca9L8nnhCYdNuL8wAAAABJRU5ErkJggg==\n", 653 | "text/plain": [ 654 | "
" 655 | ] 656 | }, 657 | "metadata": { 658 | "needs_background": "light" 659 | }, 660 | "output_type": "display_data" 661 | } 662 | ], 663 | "source": [ 664 | "outputs = chemcepterizer.transform(mols_array)\n", 665 | "plt.imshow(outputs[0,:,:,:3])" 666 | ] 667 | }, 668 | { 669 | "cell_type": "markdown", 670 | "metadata": {}, 671 | "source": [ 672 | "The SmilesGenerator are reused as generator" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": 24, 678 | "metadata": {}, 679 | "outputs": [], 680 | "source": [ 681 | "from molvecgen import ChemceptionGenerator" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": 25, 687 | "metadata": {}, 688 | "outputs": [], 689 | "source": [ 690 | "chemceptgenerator = ChemceptionGenerator(mols_array, y, chemcepterizer, batch_size=5)" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 26, 696 | "metadata": {}, 697 | "outputs": [ 698 | { 699 | "name": "stderr", 700 | "output_type": "stream", 701 | "text": [ 702 | "Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).\n" 703 | ] 704 | }, 705 | { 706 | "data": { 707 | "text/plain": [ 708 | "" 709 | ] 710 | }, 711 | "execution_count": 26, 712 | "metadata": {}, 713 | "output_type": "execute_result" 714 | }, 715 | { 716 | "data": { 717 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAP4AAAD8CAYAAABXXhlaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvDW2N/gAADW5JREFUeJzt3V2sXNV5xvH/Uzsu+QDxFYiFoYBkJeSimMiiIKKK0CZxEQ1clCpRL9wW9UhVKhGlVQJt1RIpkcpNoBdNIyvQ+KINkJAAcqWC5YLaSpXBfMZgHBNKwbKLU4GVtBdRDG8vZjs9HA6eOXNm9vh4/X/S0cze3jP7lWee2WvtWbN2qgpJbfmFWRcgqX8GX2qQwZcaZPClBhl8qUEGX2qQwZcatKzgJ9mUZG+SF5LcNKmiJE1Xxh3Ak2QV8APg48B+4DHgM1X13OTKkzQNq5fx2EuBF6rqRYAkdwHXAu8Y/CQOE5SmrKoybJvlNPXPAV6Zt7y/WyfpOLecI/5inypvO6InmQPmlrEfSRO2nODvB86dt7wOOLBwo6raAmwBm/rS8WI5Tf3HgPVJLkiyBvg08MBkypI0TWMf8avqSJI/Ah4EVgF3VtWzE6tM0tSM/XXeWDuzqS9N3bTP6ktaoQy+1CCDLzXI4EsNMvhSgwy+1CCDLzXI4EsNMvhSgwy+1CCDLzXI4EsNMvhSgwy+1CCDLzXI4EsNMvhSgwy+1CCDLzXI4EsNMvhSgwy+1CCDLzXI4EsNMvhSg4YGP8mdSQ4l2T1v3elJtifZ192eNt0yJU3SKEf8bwKbFqy7CdhRVeuBHd2ypBViaPCr6l+A1xasvhbY2t3fClw34bokTdG4ffyzq+ogQHd71uRKkjRtY18me1RJ5oC5ae9H0ujGPeK/mmQtQHd76J02rKotVbWxqjaOuS9JEzZu8B8ANnf3NwP3T6YcSX1IVR17g+RbwJXAmcCrwF8C9wH3AOcBLwPXV9XCE4CLPdexdyZp2aoqw7YZGvxJMvjS9I0SfEfuSQ0y+FKDDL7UIIMvNcjgSw0y+FKDDL7UIIMvNcjgSw0y+FKDDL7UIIMvNcjgSw0y+FKDDL7UIIMvNcjgSw0y+FKDDL7UIIMvNcjgSw0y+FKDDL7UIIMvNcjgSw0aGvwk5yZ5OMmeJM8mubFbf3qS7Un2dbenTb9cSZMwyrXz1gJrq+qJJCcDjwPXAb8LvFZVf5XkJuC0qvrikOfyElrSlE3kElpVdbCqnuju/wTYA5wDXAts7TbbyuDDQNIKsKQ+fpLzgUuAncDZVXUQBh8OwFmTLk7SdKwedcMk7wPuBT5XVT9OhrYmjj5uDpgbrzxJ0zDSZbKTvAvYBjxYVV/t1u0Frqyqg915gEeq6oNDnsc+vjRlE+njZ3BovwPYczT0nQeAzd39zcD94xQpqX+jnNX/KPCvwPeBN7vVf8qgn38PcB7wMnB9Vb025Lk84ktTNsoRf6Sm/qQYfGn6JtLUl3TiMfhSgwy+1CCDLzXI4EsNMvhSgwy+1KCRx+pLC9WX//3n9/Pnl8+wEi2VR3ypQQZfapBDdjWyhS/eaD/MVt8csitpUQZfapDBlxpk8KUGGXypQQZfapAj93RMjs47MXnElxpk8KUGOXJPIxt35J7dhX45ck/Sogy+1CCDLzXIPr5GNr+vDsD8/vrzv//Wf/vQnUvfwYLn93zAeCZ17byTkjya5Okkzyb5Urf+giQ7k+xLcneSNZMoWtL0jdLU/ylwVVVdDGwANiW5DLgVuK2q1gOvAzdMr0xJk7Skpn6S9wD/Bvwh8I/AB6rqSJLLgVuq6pNDHm9TfwU71td5C7sB4zTTnehjMib2dV6SVUmeAg4B24EfAoer6ki3yX7gnHELldSvkYJfVW9U1QZgHXApcNFimy322CRzSXYl2TV+mZImaUlf51XVYeAR4DLg1CRHf+SzDjjwDo/ZUlUbq2rjcgqVNDlD+/hJ3g/
8rKoOJ3k38BCDE3ubgXur6q4kXweeqaqvDXku+/hjmET/eex9z9/viNsN23aaz6HR+vij/Cx3LbA1ySoGLYR7qmpbkueAu5J8GXgSuGNZ1UrqzdDgV9UzwCWLrH+RQX9f0grjyL0VoO8mcM0bhZcRR+BNpKnvyL2J8Nd5khZl8KUG2dRfgY51pv0d/4PHbEaPvK9j/IBn1G8DPIs/GTb1JS3K4EsNMvhSg+zjr3CjfgV2rP/4kUfkTeA8wbjnAjQ6+/iSFmXwpQbZ1Nex59I7Bpvmxyeb+pIWZfClBhl8qUH28Rs1iaGyDrc9PtnHl7Qogy81aJSpt6RFTXpeffXHI77UIIMvNciz+o2a3zQft1nuWf3jk2f1JS3K4EsNMvhSg+zjN2rciTne8hwTOE+gyZtoH7+7VPaTSbZ1yxck2ZlkX5K7k6xZTrGS+rOUpv6NwJ55y7cCt1XVeuB14IZJFiZpekZq6idZB2wFvgJ8HvhN4EfAB6rqSJLLgVuq6pNDnsem/gpQdd//L+S6d9zOr/COT5Ns6t8OfAF4s1s+AzhcVUe65f3AOUuuUNJMDA1+kmuAQ1X1+PzVi2y66NE8yVySXUl2jVmjpAkb5Uc6VwCfSnI1cBJwCoMWwKlJVndH/XXAgcUeXFVbgC1gU186Xizp67wkVwJ/UlXXJPk2cG9V3ZXk68AzVfW1IY83+CuAQ3FXtmkP2f0i8PkkLzDo89+xjOeS1CMH8OhtPOKvbKMc8Z2IQ287K2vYT3yO1ZcaZPClBtnU1zGvYKsTk0d8qUEGX2qQwZca5Pf40gnGyTYlLcrgSw0y+FKDDL7UIIMvNcjgSw0y+FKDDL7UIIMvNcjgSw0y+FKDDL7UIIMvNcjgSw0y+FKDDL7UIIMvNWikWXaTvAT8BHgDOFJVG5OcDtwNnA+8BPx2Vb0+nTIlTdJSjvgfq6oNVbWxW74J2FFV64Ed3bKkFWA5Tf1rga3d/a3AdcsvR1IfRg1+AQ8leTzJXLfu7Ko6CNDdnjWNAiVN3qhX0rmiqg4kOQvYnuT5UXfQfVDMDd1QUm+WPL12kluA/wH+ALiyqg4mWQs8UlUfHPJYp9eWpmwi02sneW+Sk4/eBz4B7AYeADZ3m20G7h+/VEl9GnrET3Ih8L1ucTXwD1X1lSRnAPcA5wEvA9dX1WtDnssjvjRloxzxvZKOdILxSjqSFmXwpQYZfKlBBl9qkMGXGmTwpQYZfKlBBl9qkMGXGmTwpQYZfKlBBl9qkMGXGmTwpQYZfKlBBl9qkMGXGmTwpQYZfKlBBl9qkMGXGmTwpQYZfKlBBl9qkMGXGjRS8JOcmuQ7SZ5PsifJ5UlOT7I9yb7u9rRpFytpMkY94v818E9V9SHgYmAPcBOwo6rWAzu6ZUkrwCgXzTwFeBq4sOZtnGQvXiZbOu5M6tp5FwI/Av4uyZNJvtFdLvvsqjrY7eggcNayqpXUm1GCvxr4CPC3VXUJ8L8soVmfZC7JriS7xqxR0oSNEvz9wP6q2tktf4fBB8GrXROf7vbQYg+uqi1VtbGqNk6iYEnLNzT4VfVfwCtJjvbffw14DngA2Nyt2wzcP5UKJU3c0JN7AEk2AN8A1gAvAr/H4EPjHuA84GXg+qp6bcjzeHJPmrJRTu6NFPxJMfjS9E3qrL6kE4zBlxpk8KUGGXypQQZfapDBlxpk8KUGre55f/8N/CdwZnd/lo6HGsA6FrKOt1pqHb80yka9DuD5+U6TXbMeu3881GAd1jGrOmzqSw0y+FKDZhX8LTPa73zHQw1gHQtZx1tNpY6Z9PElzZZNfalBvQY/yaYke5O8kKS3WXmT3JnkUJLd89b1Pj14knOTPNxNUf5skhtnUUuSk5I8muTpro4vdesvSLKzq+PuJGumWce8elZ18zlum1UdSV5K8v0kTx2dJm5G75FeprLvLfhJVgF/A/wG8GHgM0k+3NPuvwlsWrBuFtODHwH+uKouAi4DPtv9H/Rdy0+Bq6rqYmA
DsCnJZcCtwG1dHa8DN0y5jqNuZDBl+1GzquNjVbVh3tdns3iP9DOVfVX18gdcDjw4b/lm4OYe938+sHve8l5gbXd/LbC3r1rm1XA/8PFZ1gK8B3gC+BUGA0VWL/Z6TXH/67o381XANiAzquMl4MwF63p9XYBTgP+gO/c2zTr6bOqfA7wyb3l/t25WZjo9eJLzgUuAnbOopWteP8VgktTtwA+Bw1V1pNukr9fnduALwJvd8hkzqqOAh5I8nmSuW9f369LbVPZ9Bn+x6YCa/EohyfuAe4HPVdWPZ1FDVb1RVRsYHHEvBS5abLNp1pDkGuBQVT0+f3XfdXSuqKqPMOiKfjbJr/awz4WWNZX9UvQZ/P3AufOW1wEHetz/QiNNDz5pSd7FIPR/X1XfnWUtAFV1GHiEwTmHU5Mc/f1GH6/PFcCnkrwE3MWguX/7DOqgqg50t4eA7zH4MOz7dVnWVPZL0WfwHwPWd2ds1wCfZjBF96z0Pj14kgB3AHuq6quzqiXJ+5Oc2t1/N/DrDE4iPQz8Vl91VNXNVbWuqs5n8H7456r6nb7rSPLeJCcfvQ98AthNz69L9TmV/bRPmiw4SXE18AMG/ck/63G/3wIOAj9j8Kl6A4O+5A5gX3d7eg91fJRBs/UZ4Knu7+q+awF+GXiyq2M38Bfd+guBR4EXgG8Dv9jja3QlsG0WdXT7e7r7e/boe3NG75ENwK7utbkPOG0adThyT2qQI/ekBhl8qUEGX2qQwZcaZPClBhl8qUEGX2qQwZca9H+3syltvu9O5AAAAABJRU5ErkJggg==\n", 718 | "text/plain": [ 719 | "
" 720 | ] 721 | }, 722 | "metadata": { 723 | "needs_background": "light" 724 | }, 725 | "output_type": "display_data" 726 | } 727 | ], 728 | "source": [ 729 | "X_batch, y_batch = chemceptgenerator.next()\n", 730 | "plt.imshow(X_batch[0,:,:,:3])" 731 | ] 732 | }, 733 | { 734 | "cell_type": "markdown", 735 | "metadata": {}, 736 | "source": [ 737 | "# Morgan Fingerprints as example of another vectorizer" 738 | ] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": 27, 743 | "metadata": {}, 744 | "outputs": [], 745 | "source": [ 746 | "from molvecgen.vectorizers import MorganDictVectorizer" 747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": 28, 752 | "metadata": {}, 753 | "outputs": [], 754 | "source": [ 755 | "mdv = MorganDictVectorizer()" 756 | ] 757 | }, 758 | { 759 | "cell_type": "code", 760 | "execution_count": 29, 761 | "metadata": {}, 762 | "outputs": [ 763 | { 764 | "name": "stdout", 765 | "output_type": "stream", 766 | "text": [ 767 | "[ 98513984 219692797 535847852 864674487 864942730 951226070\n", 768 | " 1074692693 1113276223 1275884092 1412710081 1465074879 1471352294\n", 769 | " 1510328189 1740632203 1775209781 1963848833 2064788354 2119439498\n", 770 | " 2143075994 2154975788 2245384272 2246699815 2246703798 2246728737\n", 771 | " 2246997334 2281069397 2534373880 2763854213 2959890341 2968968094\n", 772 | " 2976033787 2976816164 3075056557 3116051204 3217380708 3218693969\n", 773 | " 3542456614 3586270004 3600182528 3643586416 3696402029 3999906991\n", 774 | " 4172736314 4194366826 4208894168 4212392629]\n" 775 | ] 776 | }, 777 | { 778 | "data": { 779 | "text/plain": [ 780 | "46" 781 | ] 782 | }, 783 | "execution_count": 29, 784 | "metadata": {}, 785 | "output_type": "execute_result" 786 | } 787 | ], 788 | "source": [ 789 | "#Fit analyses the dataset and set the keys mapping\n", 790 | "mdv.fit(mols[0:1])\n", 791 | "print(mdv.keys)\n", 792 | "mdv.dims" 793 | ] 794 | }, 795 | { 796 | "cell_type": "code", 797 | 
"execution_count": 30, 798 | "metadata": {}, 799 | "outputs": [ 800 | { 801 | "data": { 802 | "text/plain": [ 803 | "(array([3., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", 804 | " 1., 1., 1., 2., 1., 1., 2., 1., 1., 1., 1., 1., 3., 1., 1., 1., 2.,\n", 805 | " 1., 5., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1.]), 0)" 806 | ] 807 | }, 808 | "execution_count": 30, 809 | "metadata": {}, 810 | "output_type": "execute_result" 811 | } 812 | ], 813 | "source": [ 814 | "mdv.transform_mol(mols[0],misses=True)" 815 | ] 816 | }, 817 | { 818 | "cell_type": "code", 819 | "execution_count": 31, 820 | "metadata": {}, 821 | "outputs": [ 822 | { 823 | "data": { 824 | "text/plain": [ 825 | "(array([0., 0., 0., 1., 2., 2., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,\n", 826 | " 0., 0., 0., 2., 1., 0., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 827 | " 4., 3., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 30)" 828 | ] 829 | }, 830 | "execution_count": 31, 831 | "metadata": {}, 832 | "output_type": "execute_result" 833 | } 834 | ], 835 | "source": [ 836 | "mdv.transform_mol(mols[1], misses=True)" 837 | ] 838 | }, 839 | { 840 | "cell_type": "code", 841 | "execution_count": 32, 842 | "metadata": {}, 843 | "outputs": [], 844 | "source": [ 845 | "arr, misses = mdv.transform(mols, misses=True)" 846 | ] 847 | }, 848 | { 849 | "cell_type": "code", 850 | "execution_count": 33, 851 | "metadata": {}, 852 | "outputs": [ 853 | { 854 | "data": { 855 | "text/plain": [ 856 | "" 857 | ] 858 | }, 859 | "execution_count": 33, 860 | "metadata": {}, 861 | "output_type": "execute_result" 862 | }, 863 | { 864 | "data": { 865 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAicAAAECCAYAAAAl2XfFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvDW2N/gAAEe5JREFUeJzt3WGMXWlZB/DnYWZ30HUJrLs0ZMoIYxrCftA17WxI8cPailmQCCaaQG1CkzZrUkkw0Rj0C2piox8UPpSYrO2GTaooUZGNIeqmQ4MfDO1URl2ChJViqbvZlQBp3Zbuln390EvaLdP2PTNz7n3PzO+XNDP39sk5z97nnLv/nrnzniylBABAK1416QYAAK4nnAAATRFOAICmCCcAQFOEEwCgKcIJANCUiYaTzHw4M7+SmU9n5ocn2Qu3l5mPZebzmfnUdc/dk5lPZuZXR19fN8keWVlmvjEzP5eZX87ML2Xmh0bPm98AZOarM/NkZv7baH6/N3r+zZn5hdH8/ioz75x0r6wsM6cy84uZ+fejx2Z3CxMLJ5k5FREfj4h3RsT9EfH+zLx/Uv1Q5RMR8fANz304Io6XUrZFxPHRY9pzJSJ+o5Ty1oh4W0T82uh8M79huBwRu0opPxkRD0TEw5n5toj4o4j46Gh+346I/RPskVv7UER8+brHZncLk7xy8mBEPF1K+Vop5cWI+MuIeM8E++E2Simfj4hv3fD0eyLi8dH3j0fEe8faFFVKKc+WUv519P2FuPomORvmNwjlqv8bPbxj9KdExK6I+OvR8+bXqMzcGhE/HxFHRo8zzO6WJhlOZiPiG9c9Pjd6jmHZUkp5NuLq/wAj4vUT7ofbyMw3RcRPRcQXwvwGY/RjgeWIeD4inoyI/4qI75RSroxKvIe262MR8VsR8fLo8Y+G2d3SJMNJrvCctfShR5n5IxHxNxHx66WU85Puh3qllO+VUh6IiK1x9crzW1cqG29X3E5mvjsini+lnL7+6RVKze460xPc97mIeON1j7dGxDMT6oXVey4z31BKeTYz3xBX/1VHgzLzjrgaTP68lPK3o6fNb2BKKd/JzBNx9bNDr83M6dG/wL2HtuntEfELmfmuiHh1RLwmrl5JMbtbmOSVk1MRsW30ieU7I+J9EfHEBPthdZ6IiA+Mvv9ARHxmgr1wE6OfcR+NiC+XUv7kur8yvwHIzPsy87Wj738oIn42rn5u6HMR8UujMvNrUCnlt0spW0spb4qr/59bLKX8SpjdLeUk70o8SpIfi4ipiHislPIHE2uG28rMT0bEQxFxb0Q8FxEfiYi/i4hPRcRcRJyNiF8updz4oVkmLDN/OiL+OSL+I6793Pt34urnTsyvcZn5E3H1Q5NTcfUflZ8qpfx+Zs7H1V8muCcivhgRe0splyfXKbeSmQ9FxG+WUt5tdrc20XACAHAjK8QCAE0RTgCApggnAEBThBMAoCnCCQDQlCbCSWY+MukeWB2zGzbzGy6zGzbzu7UmwklEGNJwmd2wmd9wmd2wmd8ttBJOAAAiYsyLsE3dfVeZvu+1P/D8yxdeiFfdfdcrnrvzmyvdF6ltV+6q73n6hY2x+N1LL70Qd9xx1+0LmaibHZtXLr4Q0z/8g/Pr6/jcjOfIzbx4b/1/30rvh5M49/qa31ve/M3q2q+cube6tmU3m99GPke++91vx0svvlD1HzjWG/9N3/fa2HroYFXt3JGpnrtZf88tzFTXbjlllWLGp8uxGdHf8ekcuebsge9V17byftjX/I4fO1pdu3vv/uraIdrI58jSycPVtWv6sU5mPpyZX8nMpzPzw2vZFgBAxBrCSWZORcTHI+KdEXF/RLw/M+9fr8YAgM1pLVdOHoyIp0spXyulvBhX7674nvVpCwDYrNYSTmYj4hvXPT43eg4AYNXWEk5W+sTtD3x0ODMfycylzFx6+cILa9gdALAZrCWcnIuIN173eGtEPHNjUSnl0VLKjlL
Kjht/XRgA4EZrCSenImJbZr45M++MiPdFxBPr0xYAsFmtep2TUsqVzPxgRPxjRExFxGOllC+tW2cAwKa0pkXYSimfjYjP1tbPnLkU83uW17LLdXFl1/bq2q6LV9WaXjzdy3aHqMs8vG6rtLCzt013mkmHPjb6rOcXJ91Bd7Mdel5Yrl9kbtuJfdW1c/UtDPIY6vIaD02Wi9W17q0DADRFOAEAmiKcAABNEU4AgKYIJwBAU4QTAKApwgkA0BThBABoinACADRFOAEAmpKllLHtbGZ+tmw9dLCqdu7IVM/drL8uS91vOXW5x07glbrehqGv49M5cs3ZA/XLu7fyftjX/I4fO1pdu3vv/uraIdrI58jSycNx4fy5rKl15QQAaIpwAgA0RTgBAJoinAAATRFOAICmCCcAQFOEEwCgKcIJANAU4QQAaIpwAgA0RTgBAJoyPc6dzZy5FPN7lse5yxVd2bW9urbrPUlqTS+e7mW7Q9RlHl63VVrY2dumO82kQx8bfdbzi5PuoLvZDj0vLNffO2jbiX3VtXP1LQzyGOryGg9NlovVta6cAABNEU4AgKYIJwBAU4QTAKApwgkA0BThBABoinACADRFOAEAmiKcAABNEU4AgKZkKWVsO5uZny1bDx2sqp07MtVzN+uvy1L3W05d7rETeKWut2Ho6/h0jlxz9kD98u6tvB/2Nb/jx45W1+7eu7+6dog28jmydPJwXDh/LmtqXTkBAJoinAAATRFOAICmCCcAQFOEEwCgKcIJANAU4QQAaIpwAgA0RTgBAJoinAAATZke585mzlyK+T3L49zliq7s2l5d23XZ71rTi6d72e4QdZmH122VFnb2tulOM+nQx0af9fzipDvobrZDzwvL9cvzbzuxr7p2rr6FQR5DXV7joclysbrWlRMAoCnCCQDQlDX9WCczvx4RFyLiexFxpZSyYz2aAgA2r/X4zMnPlFK+uQ7bAQDwYx0AoC1rDSclIv4pM09n5iMrFWTmI5m5lJlLL8XlNe4OANjo1vpjnbeXUp7JzNdHxJOZ+Z+llM9fX1BKeTQiHo2IeE3eU9a4PwBgg1vTlZNSyjOjr89HxKcj4sH1aAoA2LxWHU4y867MvPv730fEz0XEU+vVGACwOa3lxzpbIuLTmfn97fxFKeUf1qUrAGDTylLG9zGQmfnZsvXQwarauSNTPXez/rosdb/llA8HMz5db8PQ1/HpHLnm7IH65d1beT/sa37Hjx2trt29d3917RBt5HNk6eThuHD+XNbU+lViAKApwgkA0BThBABoinACADRFOAEAmiKcAABNEU4AgKYIJwBAU4QTAKApwgkA0JS13Funs5kzl2J+z/I4d7miK7u2V9d2Xfa71vTi6V62O0Rd5uF1W6WFnb1tutNMOvSx0Wc9vzjpDrqb7dDzwnL98vzbTuyrrp2rb2GQx1CX13hoslysrnXlBABoinACADRFOAEAmiKcAABNEU4AgKYIJwBAU4QTAKApwgkA0BThBABoinACADRFOAEAmpKllLHtbGZ+tmw9dLCqdu7IVM/drL8u9+HZcupyj53AK3W9R1Rfx6dz5JqzB+rvPdPK+2Ff8zt+7Gh17e69+6trh2gjnyNLJw/HhfPnsqbWlRMAoCnCCQDQFOEEAGiKcAIANEU4AQCaIpwAAE0RTgCApggnAEBThBMAoCnCCQDQlOlx7mzmzKWY37M8zl2u6Mqu7dW1XZf9rjW9eLqX7Q5Rl3l43VZpYWdvm+40kw59bPRZzy9OuoPuZjv0vLBcvzz/thP7qmvn6lsY5DHU5TUemiwXq2tdOQEAmiKcAABNEU4AgKYIJwBAU4QTAKApwgkA0BThBABoinACADRFOAEAmiKcAABNyVLK2HY2Mz9bth46WFU7d2Sq527WX5el7recutxjJ/BKXW/D0Nfx6Ry55uyB+uXdW3k/7Gt+x48dra7dvXd/de0QbeRzZOnk4bhw/lzW1LpyAgA05bbhJDMfy8z
nM/Op6567JzOfzMyvjr6+rt82AYDNoubKySci4uEbnvtwRBwvpWyLiOOjxwAAa3bbcFJK+XxEfOuGp98TEY+Pvn88It67zn0BAJvUaj9zsqWU8mxExOjr69evJQBgM+v9A7GZ+UhmLmXm0ssXXuh7dwDAwK02nDyXmW+IiBh9ff5mhaWUR0spO0opO151912r3B0AsFmsNpw8EREfGH3/gYj4zPq0AwBsdjW/SvzJiPiXiHhLZp7LzP0R8YcR8Y7M/GpEvGP0GABgzaZvV1BKef9N/mr3OvcCAHD7cLKeZs5civk9y+Pc5Yqu7NpeXdt12e9a04une9nuEHWZh9dtlRZ29rbpTjPp0MdGn/X84qQ76G62Q88Ly/XL8287sa+6dq6+hUEeQ11e46HJcrG61vL1AEBThBMAoCnCCQDQFOEEAGiKcAIANEU4AQCaIpwAAE0RTgCApggnAEBThBMAoClZShnbzmbmZ8vWQweraueOTPXczfrrstT9llOXe+wEXqnrbRj6Oj6dI9ecPVC/vHsr74d9ze/4saPVtbv37q+uHaKNfI4snTwcF86fy5paV04AgKYIJwBAU4QTAKApwgkA0BThBABoinACADRFOAEAmiKcAABNEU4AgKYIJwBAU4QTAKAp0+Pc2cyZSzG/Z3mcu1zRlV3bq2u73pOk1vTi6V62O0Rd5uF1W6WFnb1tutNMOvSx0Wc9vzjpDrqb7dDzwnL9vYO2ndhXXTtX38Igj6Eur/HQZLlYXevKCQDQFOEEAGiKcAIANEU4AQCaIpwAAE0RTgCApggnAEBThBMAoCnCCQDQFOEEAGhKllLGtrOZ+dmy9dDBqtq5I1M9d7P+uix1v+XU5R47gVfqehuGvo5P58g1Zw/UL+/eyvthX/M7fuxode3uvfura4doI58jSycPx4Xz57Km1pUTAKApwgkA0BThBABoinACADRFOAEAmiKcAABNEU4AgKYIJwBAU4QTAKApwgkA0JTpce5s5sylmN+zPM5drujKru3VtV2X/a41vXi6l+0OUZd5eN1WaWFnb5vuNJMOfWz0Wc8vTrqD7mY79LywXL88/7YT+6pr5+pbGOQx1OU1HposF6trXTkBAJpy23CSmY9l5vOZ+dR1z/1uZv5PZi6P/ryr3zYBgM2i5srJJyLi4RWe/2gp5YHRn8+ub1sAwGZ123BSSvl8RHxrDL0AAKzpMycfzMx/H/3Y53Xr1hEAsKmtNpz8aUT8eEQ8EBHPRsQf36wwMx/JzKXMXHopLq9ydwDAZrGqcFJKea6U8r1SyssR8WcR8eAtah8tpewopey4I/r5tVwAYONYVTjJzDdc9/AXI+Kpm9UCAHRx20XYMvOTEfFQRNybmeci4iMR8VBmPhARJSK+HhG/2mOPAMAmcttwUkp5/wpPH+2hFwCAyFLK2HY2Mz9bth46WFU7d2Sq527WX5el7rec8uFgxqfrbRj6Oj6dI9ecPVC/vHsr74d9ze/4sfp/7+7eu7+6dog28jmydPJwXDh/LmtqLV8PADRFOAEAmiKcAABNEU4AgKYIJwBAU4QTAKApwgkA0BThBABoinACADRFOAEAmnLbe+usp5kzl2J+z/I4d7miK7u2V9d2Xfa71vTi6V62O0Rd5uF1W6WFnb1tutNMOvSx0Wc9vzjpDrqb7dDzwnL98vzbTuyrrp2rb2GQx1CX13hoslysrnXlBABoinACADRFOAEAmiKcAABNEU4AgKYIJwBAU4QTAKApwgkA0BThBABoinACADRFOAEAmpKllLHtbGZ+tmw9dLCqdu7IVM/drL8u9+HZcupyj53AK3W9R1Rfx6dz5JqzB+rvPdPK+2Ff8zt+7Gh17e69+6trh2gjnyNLJw/HhfPnsqbWlRMAoCnCCQDQFOEEAGiKcAIANEU4AQCaIpwAAE0RTgCApggnAEBThBMAoCnCCQDQlOlx7mzmzKWY37M8zl2u6Mqu7dW1XZf9rjW9eLqX7Q5Rl3l
43VZpYWdvm+40kw59bPRZzy9OuoPuZjv0vLBcvzz/thP7qmvn6lsY5DHU5TUemiwXq2tdOQEAmiKcAABNEU4AgKYIJwBAU4QTAKApwgkA0BThBABoinACADRFOAEAmiKcAABNyVLK+HaW+b8R8d8r/NW9EfHNsTXCejK7YTO/4TK7YduM8/uxUsp9NYVjDSc3bSJzqZSyY9J90J3ZDZv5DZfZDZv53Zof6wAATRFOAICmtBJOHp10A6ya2Q2b+Q2X2Q2b+d1CE585AQD4vlaunAAARIRwAgA0RjgBAJoinAAATRFOAICm/D8fhHaj1xK2vAAAAABJRU5ErkJggg==\n", 866 | "text/plain": [ 867 | "
" 868 | ] 869 | }, 870 | "metadata": { 871 | "needs_background": "light" 872 | }, 873 | "output_type": "display_data" 874 | } 875 | ], 876 | "source": [ 877 | "plt.matshow(arr)" 878 | ] 879 | }, 880 | { 881 | "cell_type": "code", 882 | "execution_count": 34, 883 | "metadata": {}, 884 | "outputs": [], 885 | "source": [ 886 | "arr2 = mdv.transform(mols, misses=False)" 887 | ] 888 | }, 889 | { 890 | "cell_type": "code", 891 | "execution_count": 35, 892 | "metadata": {}, 893 | "outputs": [ 894 | { 895 | "data": { 896 | "text/plain": [ 897 | "" 898 | ] 899 | }, 900 | "execution_count": 35, 901 | "metadata": {}, 902 | "output_type": "execute_result" 903 | }, 904 | { 905 | "data": { 906 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAicAAAECCAYAAAAl2XfFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvDW2N/gAAEe5JREFUeJzt3WGMXWlZB/DnYWZ30HUJrLs0ZMoIYxrCftA17WxI8cPailmQCCaaQG1CkzZrUkkw0Rj0C2piox8UPpSYrO2GTaooUZGNIeqmQ4MfDO1URl2ChJViqbvZlQBp3Zbuln390EvaLdP2PTNz7n3PzO+XNDP39sk5z97nnLv/nrnzniylBABAK1416QYAAK4nnAAATRFOAICmCCcAQFOEEwCgKcIJANCUiYaTzHw4M7+SmU9n5ocn2Qu3l5mPZebzmfnUdc/dk5lPZuZXR19fN8keWVlmvjEzP5eZX87ML2Xmh0bPm98AZOarM/NkZv7baH6/N3r+zZn5hdH8/ioz75x0r6wsM6cy84uZ+fejx2Z3CxMLJ5k5FREfj4h3RsT9EfH+zLx/Uv1Q5RMR8fANz304Io6XUrZFxPHRY9pzJSJ+o5Ty1oh4W0T82uh8M79huBwRu0opPxkRD0TEw5n5toj4o4j46Gh+346I/RPskVv7UER8+brHZncLk7xy8mBEPF1K+Vop5cWI+MuIeM8E++E2Simfj4hv3fD0eyLi8dH3j0fEe8faFFVKKc+WUv519P2FuPomORvmNwjlqv8bPbxj9KdExK6I+OvR8+bXqMzcGhE/HxFHRo8zzO6WJhlOZiPiG9c9Pjd6jmHZUkp5NuLq/wAj4vUT7ofbyMw3RcRPRcQXwvwGY/RjgeWIeD4inoyI/4qI75RSroxKvIe262MR8VsR8fLo8Y+G2d3SJMNJrvCctfShR5n5IxHxNxHx66WU85Puh3qllO+VUh6IiK1x9crzW1cqG29X3E5mvjsini+lnL7+6RVKze460xPc97mIeON1j7dGxDMT6oXVey4z31BKeTYz3xBX/1VHgzLzjrgaTP68lPK3o6fNb2BKKd/JzBNx9bNDr83M6dG/wL2HtuntEfELmfmuiHh1RLwmrl5JMbtbmOSVk1MRsW30ieU7I+J9EfHEBPthdZ6IiA+Mvv9ARHxmgr1wE6OfcR+NiC+XUv7kur8yvwHIzPsy87Wj738oIn42rn5u6HMR8UujMvNrUCnlt0spW0spb4qr/59bLKX8SpjdLeUk70o8SpIfi4ipiHi
slPIHE2uG28rMT0bEQxFxb0Q8FxEfiYi/i4hPRcRcRJyNiF8updz4oVkmLDN/OiL+OSL+I6793Pt34urnTsyvcZn5E3H1Q5NTcfUflZ8qpfx+Zs7H1V8muCcivhgRe0splyfXKbeSmQ9FxG+WUt5tdrc20XACAHAjK8QCAE0RTgCApggnAEBThBMAoCnCCQDQlCbCSWY+MukeWB2zGzbzGy6zGzbzu7UmwklEGNJwmd2wmd9wmd2wmd8ttBJOAAAiYsyLsE3dfVeZvu+1P/D8yxdeiFfdfdcrnrvzmyvdF6ltV+6q73n6hY2x+N1LL70Qd9xx1+0LmaibHZtXLr4Q0z/8g/Pr6/jcjOfIzbx4b/1/30rvh5M49/qa31ve/M3q2q+cube6tmU3m99GPke++91vx0svvlD1HzjWG/9N3/fa2HroYFXt3JGpnrtZf88tzFTXbjlllWLGp8uxGdHf8ekcuebsge9V17byftjX/I4fO1pdu3vv/uraIdrI58jSycPVtWv6sU5mPpyZX8nMpzPzw2vZFgBAxBrCSWZORcTHI+KdEXF/RLw/M+9fr8YAgM1pLVdOHoyIp0spXyulvBhX7674nvVpCwDYrNYSTmYj4hvXPT43eg4AYNXWEk5W+sTtD3x0ODMfycylzFx6+cILa9gdALAZrCWcnIuIN173eGtEPHNjUSnl0VLKjlLKjht/XRgA4EZrCSenImJbZr45M++MiPdFxBPr0xYAsFmtep2TUsqVzPxgRPxjRExFxGOllC+tW2cAwKa0pkXYSimfjYjP1tbPnLkU83uW17LLdXFl1/bq2q6LV9WaXjzdy3aHqMs8vG6rtLCzt013mkmHPjb6rOcXJ91Bd7Mdel5Yrl9kbtuJfdW1c/UtDPIY6vIaD02Wi9W17q0DADRFOAEAmiKcAABNEU4AgKYIJwBAU4QTAKApwgkA0BThBABoinACADRFOAEAmpKllLHtbGZ+tmw9dLCqdu7IVM/drL8uS91vOXW5x07glbrehqGv49M5cs3ZA/XLu7fyftjX/I4fO1pdu3vv/uraIdrI58jSycNx4fy5rKl15QQAaIpwAgA0RTgBAJoinAAATRFOAICmCCcAQFOEEwCgKcIJANAU4QQAaIpwAgA0RTgBAJoyPc6dzZy5FPN7lse5yxVd2bW9urbrPUlqTS+e7mW7Q9RlHl63VVrY2dumO82kQx8bfdbzi5PuoLvZDj0vLNffO2jbiX3VtXP1LQzyGOryGg9NlovVta6cAABNEU4AgKYIJwBAU4QTAKApwgkA0BThBABoinACADRFOAEAmiKcAABNEU4AgKZkKWVsO5uZny1bDx2sqp07MtVzN+uvy1L3W05d7rETeKWut2Ho6/h0jlxz9kD98u6tvB/2Nb/jx45W1+7eu7+6dog28jmydPJwXDh/LmtqXTkBAJoinAAATRFOAICmCCcAQFOEEwCgKcIJANAU4QQAaIpwAgA0RTgBAJoinAAATZke585mzlyK+T3L49zliq7s2l5d23XZ71rTi6d72e4QdZmH122VFnb2tulOM+nQx0af9fzipDvobrZDzwvL9cvzbzuxr7p2rr6FQR5DXV7joclysbrWlRMAoCnCCQDQlDX9WCczvx4RFyLiexFxpZSyYz2aAgA2r/X4zMnPlFK+uQ7bAQDwYx0AoC1rDSclIv4pM09n5iMrFWTmI5m5lJlLL8XlNe4OANjo1vpjnbeXUp7JzNdHxJOZ+Z+llM9fX1BKeTQiHo2IeE3eU9a4PwBgg1vTlZNSyjOjr89HxKcj4sH1aAoA2LxWHU4y867MvPv730fEz0XEU+vVGACwOa3lxzpbIuLTmfn97fxFKeUf1qUrAGDTylLG9zGQmfnZsvXQwarauSNTPXez/rosdb/llA8HMz5db8PQ1/HpHLnm7IH65d1beT/sa37Hjx2trt29d3917RBt5HNk6eThuHD+XNbU+lViAKApwgkA0BThBABoinA
CADRFOAEAmiKcAABNEU4AgKYIJwBAU4QTAKApwgkA0JS13Funs5kzl2J+z/I4d7miK7u2V9d2Xfa71vTi6V62O0Rd5uF1W6WFnb1tutNMOvSx0Wc9vzjpDrqb7dDzwnL98vzbTuyrrp2rb2GQx1CX13hoslysrnXlBABoinACADRFOAEAmiKcAABNEU4AgKYIJwBAU4QTAKApwgkA0BThBABoinACADRFOAEAmpKllLHtbGZ+tmw9dLCqdu7IVM/drL8u9+HZcupyj53AK3W9R1Rfx6dz5JqzB+rvPdPK+2Ff8zt+7Gh17e69+6trh2gjnyNLJw/HhfPnsqbWlRMAoCnCCQDQFOEEAGiKcAIANEU4AQCaIpwAAE0RTgCApggnAEBThBMAoCnCCQDQlOlx7mzmzKWY37M8zl2u6Mqu7dW1XZf9rjW9eLqX7Q5Rl3l43VZpYWdvm+40kw59bPRZzy9OuoPuZjv0vLBcvzz/thP7qmvn6lsY5DHU5TUemiwXq2tdOQEAmiKcAABNEU4AgKYIJwBAU4QTAKApwgkA0BThBABoinACADRFOAEAmiKcAABNyVLK2HY2Mz9bth46WFU7d2Sq527WX5el7recutxjJ/BKXW/D0Nfx6Ry55uyB+uXdW3k/7Gt+x48dra7dvXd/de0QbeRzZOnk4bhw/lzW1LpyAgA05bbhJDMfy8znM/Op6567JzOfzMyvjr6+rt82AYDNoubKySci4uEbnvtwRBwvpWyLiOOjxwAAa3bbcFJK+XxEfOuGp98TEY+Pvn88It67zn0BAJvUaj9zsqWU8mxExOjr69evJQBgM+v9A7GZ+UhmLmXm0ssXXuh7dwDAwK02nDyXmW+IiBh9ff5mhaWUR0spO0opO151912r3B0AsFmsNpw8EREfGH3/gYj4zPq0AwBsdjW/SvzJiPiXiHhLZp7LzP0R8YcR8Y7M/GpEvGP0GABgzaZvV1BKef9N/mr3OvcCAHD7cLKeZs5civk9y+Pc5Yqu7NpeXdt12e9a04une9nuEHWZh9dtlRZ29rbpTjPp0MdGn/X84qQ76G62Q88Ly/XL8287sa+6dq6+hUEeQ11e46HJcrG61vL1AEBThBMAoCnCCQDQFOEEAGiKcAIANEU4AQCaIpwAAE0RTgCApggnAEBThBMAoClZShnbzmbmZ8vWQweraueOTPXczfrrstT9llOXe+wEXqnrbRj6Oj6dI9ecPVC/vHsr74d9ze/4saPVtbv37q+uHaKNfI4snTwcF86fy5paV04AgKYIJwBAU4QTAKApwgkA0BThBABoinACADRFOAEAmiKcAABNEU4AgKYIJwBAU4QTAKAp0+Pc2cyZSzG/Z3mcu1zRlV3bq2u73pOk1vTi6V62O0Rd5uF1W6WFnb1tutNMOvSx0Wc9vzjpDrqb7dDzwnL9vYO2ndhXXTtX38Igj6Eur/HQZLlYXevKCQDQFOEEAGiKcAIANEU4AQCaIpwAAE0RTgCApggnAEBThBMAoCnCCQDQFOEEAGhKllLGtrOZ+dmy9dDBqtq5I1M9d7P+uix1v+XU5R47gVfqehuGvo5P58g1Zw/UL+/eyvthX/M7fuxode3uvfura4doI58jSycPx4Xz57Km1pUTAKApwgkA0BThBABoinACADRFOAEAmiKcAABNEU4AgKYIJwBAU4QTAKApwgkA0JTpce5s5sylmN+zPM5drujKru3VtV2X/a41vXi6l+0OUZd5eN1WaWFnb5vuNJMOfWz0Wc8vTrqD7mY79LywXL88/7YT+6pr5+pbGOQx1OU1HposF6trXTkBAJpy23CSmY9l5vOZ+dR1z/1uZv5PZi6P/ryr3zYBgM2i5srJJyLi4RWe/2gp5YHRn8+ub1sAwGZ123BSSvl8RHxrDL0AAKzpMycfzMx/H/3Y53Xr1hEAsKmtNpz8aUT8eEQ8EBHPRsQf36wwMx/JzKXMXHopLq9ydwDAZrGqcFJKea6U8r1
SyssR8WcR8eAtah8tpewopey4I/r5tVwAYONYVTjJzDdc9/AXI+Kpm9UCAHRx20XYMvOTEfFQRNybmeci4iMR8VBmPhARJSK+HhG/2mOPAMAmcttwUkp5/wpPH+2hFwCAyFLK2HY2Mz9bth46WFU7d2Sq527WX5el7rec8uFgxqfrbRj6Oj6dI9ecPVC/vHsr74d9ze/4sfp/7+7eu7+6dog28jmydPJwXDh/LmtqLV8PADRFOAEAmiKcAABNEU4AgKYIJwBAU4QTAKApwgkA0BThBABoinACADRFOAEAmnLbe+usp5kzl2J+z/I4d7miK7u2V9d2Xfa71vTi6V62O0Rd5uF1W6WFnb1tutNMOvSx0Wc9vzjpDrqb7dDzwnL98vzbTuyrrp2rb2GQx1CX13hoslysrnXlBABoinACADRFOAEAmiKcAABNEU4AgKYIJwBAU4QTAKApwgkA0BThBABoinACADRFOAEAmpKllLHtbGZ+tmw9dLCqdu7IVM/drL8u9+HZcupyj53AK3W9R1Rfx6dz5JqzB+rvPdPK+2Ff8zt+7Gh17e69+6trh2gjnyNLJw/HhfPnsqbWlRMAoCnCCQDQFOEEAGiKcAIANEU4AQCaIpwAAE0RTgCApggnAEBThBMAoCnCCQDQlOlx7mzmzKWY37M8zl2u6Mqu7dW1XZf9rjW9eLqX7Q5Rl3l43VZpYWdvm+40kw59bPRZzy9OuoPuZjv0vLBcvzz/thP7qmvn6lsY5DHU5TUemiwXq2tdOQEAmiKcAABNEU4AgKYIJwBAU4QTAKApwgkA0BThBABoinACADRFOAEAmiKcAABNyVLK+HaW+b8R8d8r/NW9EfHNsTXCejK7YTO/4TK7YduM8/uxUsp9NYVjDSc3bSJzqZSyY9J90J3ZDZv5DZfZDZv53Zof6wAATRFOAICmtBJOHp10A6ya2Q2b+Q2X2Q2b+d1CE585AQD4vlaunAAARIRwAgA0RjgBAJoinAAATRFOAICm/D8fhHaj1xK2vAAAAABJRU5ErkJggg==\n", 907 | "text/plain": [ 908 | "
" 909 | ] 910 | }, 911 | "metadata": { 912 | "needs_background": "light" 913 | }, 914 | "output_type": "display_data" 915 | } 916 | ], 917 | "source": [ 918 | "plt.matshow(arr2)" 919 | ] 920 | }, 921 | { 922 | "cell_type": "markdown", 923 | "metadata": {}, 924 | "source": [ 925 | "## Hashed Fingerprints" 926 | ] 927 | }, 928 | { 929 | "cell_type": "markdown", 930 | "metadata": {}, 931 | "source": [ 932 | "### Morgan" 933 | ] 934 | }, 935 | { 936 | "cell_type": "code", 937 | "execution_count": 36, 938 | "metadata": {}, 939 | "outputs": [], 940 | "source": [ 941 | "from molvecgen.vectorizers import HashedMorganVectorizer" 942 | ] 943 | }, 944 | { 945 | "cell_type": "code", 946 | "execution_count": 37, 947 | "metadata": {}, 948 | "outputs": [], 949 | "source": [ 950 | "hmv = HashedMorganVectorizer(nBits=200)" 951 | ] 952 | }, 953 | { 954 | "cell_type": "code", 955 | "execution_count": 38, 956 | "metadata": { 957 | "scrolled": true 958 | }, 959 | "outputs": [ 960 | { 961 | "data": { 962 | "text/plain": [ 963 | "" 964 | ] 965 | }, 966 | "execution_count": 38, 967 | "metadata": {}, 968 | "output_type": "execute_result" 969 | }, 970 | { 971 | "data": { 972 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA6IAAAB8CAYAAABzJA1FAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvDW2N/gAAEGJJREFUeJzt3U+obVd9B/Dvr7EaIoYaYkJapYikhRTah4RYEEpE1OgkCi2YQRuK5Tkw86YjHTqoCEUrPGlIHKhIIZhB8E+dZNJSIwQbpdZgUxsT8iqWIgi2iauDe159vZ7rOW/fvdfa597PB8J99+SctdZZZ6199rp7/X67WmsBAACAXn5ldAMAAAA4XyxEAQAA6MpCFAAAgK4sRAEAAOjKQhQAAICuLEQBAADoqvtCtKruqarvVNUzVfVg7/rhPKmqZ6vqn6rqqap6cvPYTVX11ar67ubna0e3Ew5dVT1UVZer6umrHts61+rIX22+B79ZVW8e13I4bCfMvY9U1Q82331PVdV7rvp/f7GZe9+pqneNaTWQdF6IVtV1ST6Z5N1J7khyX1Xd0bMNcA69rbV2obV25+b3B5N8rbV2e5KvbX4HTufhJPcce+ykufbuJLdv/ruY5FOd2ghn0cP5xbmXJB/ffPddaK09niSbc873J/mdzWv+enNuCgzQ+4roXUmeaa19r7X230k+n+Tezm2A8+7eJI9s/v1IkvcObAucCa21J5L86NjDJ821e5N8ph35hyS/VlW39WkpnC0nzL2T3Jvk8621n7bW/jXJMzk6NwUG6L0Q/Y0k/37V789tHgOW0ZJ8paq+UVUXN4/d2lp7IUk2P28Z1jo4206aa74LYXkPbLa+P3RVCIq5ByvSeyFaWx5rndsA58lbW2tvztFWwA9V1R+MbhDguxAW9qkkb0pyIckLST62edzcgxXpvRB9Lskbrvr99Ume79wGODdaa89vfl5O8miOtiC9eGUb4Obn5XEthDPtpLnmuxAW1Fp7sbX2cmvtZ0k+nZ9vvzX3YEV6L0S/nuT2qnpjVb0yRwHjj3VuA5wLVfXqqnrNlX8neWeSp3M05+7fPO3+JF8c00I4806aa48l+ZNN9tzfT/JfV7bwAqd3LOb6fTn67kuO5t77q+pVVfXGHCUM+8fe7QOOvKJnZa21l6rqgSRfTnJdkodaa9/q2QY4R25N8mhVJUdz/bOttS9V1deTfKGqPpDk+0n+aGAb4Uyoqs8luTvJzVX1XJIPJ/lots+1x5O8J0eJUn6S5E+7NxjOiBPm3t1VdSFH226fTfLBJGmtfauqvpDk20leSvKh1trLI9oNJNWarfEAAAD003trLgAAAOechSgAAABdWYgCAADQlYUoAAAAXVmIAgAA0NWQhWhVXRxRL5x35h6MY/7BGOYerNOpFqJVdU9VfaeqnqmqB6/hpQ4IMIa5B+OYfzCGuQcrNHkhWlXXJflkkncnuSPJfVV1x1wNAwAA4Gx6xSlee1eSZ1pr30uSqvp8knuTfPukF7yyXtWuz6tzfW7IjXVT26eS3/rdn+x8zr9884b9WtzJPm3eR8/3dbzN+9S97X3uet2U1yxZznnzy+aePj29XfNodB9POTYt1b4l+6Ln+7yWY+e1fPctpecYnPK9clb0OhZMLWeuz+ZQPuPTzr05zpFGH+vnGl9zlLs2PefRXN9Pvc7bT3rdrnJ+nP/8YWvtdbteV61Nm5dV9YdJ7mmt/dnm9z9O8pbW2gMnvebGuqm9pd5+TfV8+fmndj7nXb9+4ZrKXNo+bd5Hz/d1vM371L3tfe563ZTXLFkOP6dPT2/XPBrdx1OOTUu1b8m+6Pk+pxw7R+o5Bg+tb+bU61gwtZy5Ppvz8hnPcY40+lg/1/iao9y16TmP5vp+6nXeftLrdpXzd+1vv9Fau3PX605zRbS2PPYLq9pNgPjF5OgvUgAAAJxvp1mIPpfkDVf9/vokzx9/UmvtUpJLSWb
bkrT2v77s85fPXa/prddfa+d6n1P/OnRe/no7xdqufPX8a99IPcfk2vtiH1P6q2efOsaQTDsPmKOeqaYeO9d0lW30VeaRlhpfPR3isXNKG5e6Ajm17n36fcnP4jRZc7+e5PaqemNVvTLJ+5M8Nk+zAAAAOKsmXxFtrb1UVQ8k+XKS65I81Fr71mwtAwAA4Ew6zdbctNYeT/L4TG0BAADgHDjN1lwAAAC4Zqe6IjrK2gOapwSNjw6O75VUaHQygbWNlTVZagxOLWOpJABrMzql/y5r69M1JUfZVvba+osxpty+ZY56ttW1j57jf6k5MjoZ4ki9kmMtaW19uo9et29Zsu6RyZMSV0QBAADozEIUAACArixEAQAA6OogY0TXvo98yl790e+pVyzn6BiOtcd5jLRUX8wVk7Bk/MNIPcfk2vtiH2uKSdtWtmMMSb+YvbnGV8+8C0udE43OQTHSIcaEHneIx86l4rGXmiNTz4uX/CxcEQUAAKArC1EAAAC6shAFAACgKwtRAAAAujrIZEVrD2ieEjQ+Oji+V1Kh0ckE1jZW1mSpMTi1jJ43ZR9p5Dzfx9r6dE3JUbaVvbb+Yoxd42Kpm9pvq2sfPcf/UnNkdDLEkXolx1rS2vp0H3MlVRxZ98jkSYkrogAAAHRmIQoAAEBXFqIAAAB0daoY0ap6NsmPk7yc5KXW2p1zNGqXte8jn7JXf/R76hXLOTqGY+1xHiMt1RdzxSQsGf8wUs8xufa+2MeaYtK2le0YQ9IvZm+u8dUz78JS50Sjc1CMdIgxoccd4rFzqXjspebI1PPiJT+LOZIVva219sMZygEAAOAcsDUXAACArk67EG1JvlJV36iqi3M0CAAAgLPttFtz39pae76qbkny1ar659baE1c/YbNAvZgk1+eGU1YHAADAoTvVQrS19vzm5+WqejTJXUmeOPacS0kuJcmNdVM7TX1XrD2geUrQ+Ojg+F5JhUYnE1jbWFmTpcbg1DJ63pR9pJHzfB9r69M1JUfZVvba+osxdo2LpW5qv62uffQc/0vNkdHJEEfqlRxrSWvr033MlVRxZN0jkyclp9iaW1WvrqrXXPl3kncmeXpqeQAAAJwPp7kiemuSR6vqSjmfba19aZZWAQAAcGZNXoi21r6X5PdmbAsAAADnwBz3Ee1u7fvIp+zVH/2eesVyjo7hWHucx0hL9cVcMQlLxj+M1HNMrr0v9rGmmLRtZTvGkPSL2ZtrfPXMu7DUOdHoHBQjHWJM6HGHeOxcKh57qTky9bx4yc/CfUQBAADoykIUAACArixEAQAA6MpCFAAAgK4OMlnR2gOapwSNjw6O75VUaHQygbWNlTVZagxOLaPnTdlHGjnP97G2Pl1TcpRtZa+tvxhj17hY6qb22+raR8/xv9QcGZ0McaReybGWtLY+3cdcSRVH1j0yeVLiiigAAACdWYgCAADQlYUoAAAAXR1kjOja95FP2as/+j31iuUcHcOx9jiPkZbqi7liEpaMfxip55hce1/sY00xadvKdowh6RezN9f46pl3YalzotE5KEY6xJjQ4w7x2LlUPPZSc2TqefGSn4UrogAAAHRlIQoAAEBXFqIAAAB0tXMhWlUPVdXlqnr6qsduqqqvVtV3Nz9fu2wzAQAAOCv2SVb0cJJPJPnMVY89mORrrbWPVtWDm9//fP7mbbf2gOYpQeOjg+N7JRUanUxgbWNlTZYag1PL6HlT9pFGzvN9rK1P15QcZVvZa+svxtg1Lpa6qf22uvbRc/wvNUdGJ0McqVdyrCWtrU/3MVdSxZF1j0yelOxxRbS19kSSHx17+N4kj2z+/UiS906qHQAAgHNnaozora21F5Jk8/OW+ZoEAADAWbb4fUSr6mKSi0lyfW5YujoAAABWbupC9MWquq219kJV3Zbk8klPbK1dSnIpSW6sm9rE+v6fte8jn7JXf/R76hXLOTqGY+1xHiMt1RdzxSQsGf8wUs8xufa+2MeaYtK2le0YQ9I
vZm+u8dUz78JS50Sjc1CMdIgxoccd4rFzqXjspebI1PPiJT+LqVtzH0ty/+bf9yf54jzNAQAA4Kzb5/Ytn0vy90l+u6qeq6oPJPlokndU1XeTvGPzOwAAAOy0c2tua+2+E/7X22duCwAAAOfA1K25AAAAMMniWXOXsPaA5ilB46OD43slFRqdTGBtY2VNlhqDU8voeVP2kUbO832srU/XlBxlW9lr6y/G2DUulrqp/ba69tFz/C81R0YnQxypV3KsJa2tT/cxV1LFkXWPTJ6UuCIKAABAZxaiAAAAdGUhCgAAQFcHGSO69n3kU/bqj35PvWI5R8dwrD3OY6Sl+mKumIQl4x9G6jkm194X+1hTTNq2sh1jSPrF7M01vnrmXVjqnGh0DoqRDjEm9LhDPHYuFY+91ByZel685GfhiigAAABdWYgCAADQlYUoAAAAXVmIAgAA0NVBJitae0DzlKDx0cHxvZIKjU4msLaxsiZLjcGpZfS8KftII+f5PtbWp2tKjrKt7LX1F2PsGhdL3dR+W1376Dn+l5ojo5MhjtQrOdaS1tan+5grqeLIukcmT0pcEQUAAKAzC1EAAAC62rkQraqHqupyVT191WMfqaofVNVTm//es2wzAQAAOCv2iRF9OMknknzm2OMfb6395ewt2sPa95FP2as/+j31iuUcHcOx9jiPkZbqi7liEpaMfxip55hce1/sY00xadvKdowh6RezN9f46pl3YalzotE5KEY6xJjQ4w7x2LlUPPZSc2TqefGSn8XOK6KttSeS/GixFgAAAHCunCZG9IGq+uZm6+5rZ2sRAAAAZ9rUheinkrwpyYUkLyT52ElPrKqLVfVkVT35P/npxOoAAAA4KyYtRFtrL7bWXm6t/SzJp5Pc9Uuee6m1dmdr7c5fzaumthMAAIAzYp9kRb+gqm5rrb2w+fV9SZ7+Zc+f29oDmqcEjY8Oju+VVGh0MoG1jZU1WWoMTi2j503ZRxo5z/extj5dU3KUbWWvrb8YY9e4WOqm9tvq2kfP8b/UHBmdDHGkXsmxlrS2Pt3HXEkVR9Y9MnlSssdCtKo+l+TuJDdX1XNJPpzk7qq6kKQleTbJByfVDgAAwLmzcyHaWrtvy8N/s0BbAAAAOAdOkzUXAAAArtmkGNHR1r6PfMpe/dHvqVcs5+gYjrXHeYy0VF/MFZOwZPzDSD3H5Nr7Yh9riknbVrZjDEm/mL25xlfPvAtLnRONzkEx0iHGhB53iMfOpeKxl5ojU8+Ll/wsXBEFAACgKwtRAAAAurIQBQAAoCsLUQAAALo6yGRFaw9onhI0Pjo4vldSodHJBNY2VtZkqTE4tYyeN2UfaeQ838fa+nRNyVG2lb22/mKMXeNiqZvab6trHz3H/1JzZHQyxJF6Jcda0tr6dB9zJVUcWffI5EmJK6IAAAB0ZiEKAABAVxaiAAAAdHWQMaJr30c+Za/+6PfUK5ZzdAzH2uM8RlqqL+aKSVgy/mGknmNy7X2xjzXFpG0r2zGGpF/M3lzjq2fehaXOiUbnoBjpEGNCjzvEY+dS8dhLzZGp58VLfhauiAIAANCVhSgAAABdWYgCAADQlYUoAAAAXVVrrV9lVf+R5N+S3Jzkh90qBq4w92Ac8w/GMPegr99srb1u15O6LkT/r9KqJ1trd3avGM45cw/GMf9gDHMP1snWXAAAALqyEAUAAKCrUQvRS4PqhfPO3INxzD8Yw9yDFRoSIwoAAMD5ZWsuAAAAXVmIAgAA0JWFKAAAAF1ZiAIAANCVhSgAAABd/S+RRrJ3aFIfKwAAAABJRU5ErkJggg==\n", 973 | "text/plain": [ 974 | "
" 975 | ] 976 | }, 977 | "metadata": { 978 | "needs_background": "light" 979 | }, 980 | "output_type": "display_data" 981 | } 982 | ], 983 | "source": [ 984 | "plt.matshow(hmv.transform(mols))" 985 | ] 986 | }, 987 | { 988 | "cell_type": "markdown", 989 | "metadata": {}, 990 | "source": [ 991 | "### Hashed AtomPairFingerprint" 992 | ] 993 | }, 994 | { 995 | "cell_type": "code", 996 | "execution_count": 39, 997 | "metadata": { 998 | "scrolled": true 999 | }, 1000 | "outputs": [], 1001 | "source": [ 1002 | "from molvecgen.vectorizers import HashedAPVectorizer\n" 1003 | ] 1004 | }, 1005 | { 1006 | "cell_type": "code", 1007 | "execution_count": 40, 1008 | "metadata": {}, 1009 | "outputs": [], 1010 | "source": [ 1011 | "hmv = HashedAPVectorizer(nBits=100, augment=True, minLength=4, maxLength=8)" 1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "code", 1016 | "execution_count": 41, 1017 | "metadata": {}, 1018 | "outputs": [ 1019 | { 1020 | "data": { 1021 | "text/plain": [ 1022 | "{'minLength': 4, 'maxLength': 8}" 1023 | ] 1024 | }, 1025 | "execution_count": 41, 1026 | "metadata": {}, 1027 | "output_type": "execute_result" 1028 | } 1029 | ], 1030 | "source": [ 1031 | "hmv.kwargs" 1032 | ] 1033 | }, 1034 | { 1035 | "cell_type": "code", 1036 | "execution_count": 42, 1037 | "metadata": { 1038 | "scrolled": true 1039 | }, 1040 | "outputs": [ 1041 | { 1042 | "data": { 1043 | "text/plain": [ 1044 | "" 1045 | ] 1046 | }, 1047 | "execution_count": 42, 1048 | "metadata": {}, 1049 | "output_type": "execute_result" 1050 | }, 1051 | { 1052 | "data": { 1053 | "image/png": "\n", 1054 | "text/plain": [ 1055 | "
" 1056 | ] 1057 | }, 1058 | "metadata": { 1059 | "needs_background": "light" 1060 | }, 1061 | "output_type": "display_data" 1062 | } 1063 | ], 1064 | "source": [ 1065 | "plt.matshow(hmv.transform(mols))" 1066 | ] 1067 | }, 1068 | { 1069 | "cell_type": "markdown", 1070 | "metadata": {}, 1071 | "source": [ 1072 | "### Hashed topological torsion fingerprint" 1073 | ] 1074 | }, 1075 | { 1076 | "cell_type": "code", 1077 | "execution_count": 43, 1078 | "metadata": {}, 1079 | "outputs": [], 1080 | "source": [ 1081 | "from molvecgen.vectorizers import HashedTorsionVectorizer" 1082 | ] 1083 | }, 1084 | { 1085 | "cell_type": "code", 1086 | "execution_count": 44, 1087 | "metadata": {}, 1088 | "outputs": [], 1089 | "source": [ 1090 | "hmv = HashedTorsionVectorizer(nBits=200)" 1091 | ] 1092 | }, 1093 | { 1094 | "cell_type": "code", 1095 | "execution_count": 45, 1096 | "metadata": { 1097 | "scrolled": true 1098 | }, 1099 | "outputs": [ 1100 | { 1101 | "data": { 1102 | "text/plain": [ 1103 | "" 1104 | ] 1105 | }, 1106 | "execution_count": 45, 1107 | "metadata": {}, 1108 | "output_type": "execute_result" 1109 | }, 1110 | { 1111 | "data": { 1112 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA6IAAAB8CAYAAABzJA1FAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvDW2N/gAAD+lJREFUeJzt3V+IXVcVx/HfMqEOUyw29g9VRy1SC1WcUC5VESQi2j8vUVBoH7SIMj6Yd9Mn+2YfFEHUQsXQ9sEWGSgWKY6dguRFmXagjKnYaahVY0JjrYhSUBqXD3Oi0+PM3Jtz9581+34/EGbuzZl9Vva/Oyvn7H3M3QUAAAAAQClvqh0AAAAAAGC2kIgCAAAAAIoiEQUAAAAAFEUiCgAAAAAoikQUAAAAAFAUiSgAAAAAoKjiiaiZ3WZmz5vZaTM7Xvr8wCwxs5fM7Ndm9qyZPdO9d8jMnjSzF7qvV9aOE9jvzOyEmZ03s1Pb3ttxrNmW73SfgxtmdnO9yIH9bZexd6+Z/an77HvWzO7Y9nf3dGPveTO7tU7UAKTCiaiZHZD0PUm3S7pJ0l1mdlPJGIAZ9HF3P+zuo+71cUlPufsNkp7qXgOYzoOSbuu9t9tYu13SDd2fJUn3F4oRaNGD+v+xJ0nf7j77Drv7E5LU/c55p6T3dz/z/e53UwAVlL4ieouk0+7+orv/S9Kjko4WjgGYdUclPdR9/5CkT1eMBWiCu5+U9Grv7d3G2lFJD/uWX0l6q5ldVyZSoC27jL3dHJX0qLv/091/J+m0tn43BVBB6UT0HZL+uO31me49AHm4pJ+b2bqZLXXvXevu5ySp+3pNteiAtu021vgsBPI71t36fmLbEhTGHhBI6UTUdnjPC8cAzJKPuvvN2roV8Ktm9rHaAQHgsxDI7H5J75V0WNI5Sd/q3mfsAYGUTkTPSFrY9vqdks4WjgGYGe5+tvt6XtJj2roF6eWLtwF2X8/XixBo2m5jjc9CICN3f9ndL7j7vyX9QP+7/ZaxBwRSOhF9WtINZna9mV2mrQXjjxeOAZgJZna5mb3l4veSPiXplLbG3N3dYXdL+kmdCIHm7TbWHpf0hW733A9L+tvFW3gBTK+35voz2vrsk7bG3p1m9mYzu15bG4atlY4PwJaDJU/m7q+b2TFJK5IOSDrh7s+VjAGYIddKeszMpK2x/iN3/5mZPS3px2b2JUl/kPS5ijECTTCzRyQdkXSVmZ2R9HVJ92nnsfaEpDu0tVHKa5K+WDxgoBG7jL0jZnZYW7fdviTpK5Lk7s+Z2Y8l/UbS65K+6u4XasQNQDJ3bo0HAAAAAJRT+tZcAAAAAMCMIxEFAAAAABRFIgoAAAAAKIpEFAAAAABQFIkoAAAAAKCoKomomS3VOC8w6xh7QD2MP6AOxh4Q01SJqJndZmbPm9lpMzt+CT/KhADUwdgD6mH8AXUw9oCABieiZnZA0vck3S7pJkl3mdlNqQIDAAAAALTp4BQ/e4uk0+7+oiSZ2aOSjkr6zW4/cJm92ed0ueY0ryvskE9x7lDe98HX3vB6c2N+6jJql4P8crb5buXuNfZqxHMp5Qztx7nK2cm4snPVzdBysLdUfeeiaT77oo3PVpUao7M8zmv8u0r83pl6vphGtL4T7fM8lUjxRPv9/+/66yvufvW448x92Lg0s89Kus3dv9y9/rykD7n7sd1+5go75B+yTww6X2QrZ599w+tb33546jJql4P8crb5kHKjxzO0H+cqZyfjys7ZVkPKwd5S9Z0Uoo3PVpUao7M8zmfl3xVtvugrGV+0z/NUIsUT7ff/VV9ed/fRuOOmuSJqO7z3f1ltt0B8Sdr6HykAAAAAwGyb5oroRyTd6+63dq/vkSR3/8ZuPzNanPO1lYU9y639vxtIq9X/lUshWt2kEu2qX/Q2349t3NdqX44k2p0PqUQanynlulOqr5X6Qlol76DYCf1yctHnZGlYPJNeEZ1m19ynJd1
gZteb2WWS7pT0+BTlAQAAAABmwOBbc939dTM7JmlF0gFJJ9z9uWSRAQAAAACaNM0aUbn7E5KeSBQLAAAAAGAGTHNrLgAAAAAAl2yqK6KXanNjvskFzDy+ZXfR2jtaPH2RHpcyNJ5caseSa+OQIfNHtE1MardNLtE30kkVT81/Z7Q6jTTOWxVt/kolehsPiS/aHBOtjlPEk6uOh5Zduk65IgoAAAAAKIpEFAAAAABQFIkoAAAAAKAoc/diJxstzvnaysKex9S+3xtpRVsLEml9QbS6SSXnmtUU5URr8/3Yxn2t9uVIWl0LHml8ppRr74i+VuoLaeXcd2QS9MvJRZ+TpWHxrPryuruPxh3HFVEAAAAAQFEkogAAAACAokhEAQAAAABFkYgCAAAAAIo6WPJkmxvzTS5gzrUpQc1yUonW3tHi6au5oUW0vtNXO5ZID7qPtolJ7bbJJfpGOtEeNj9EtDqNNM5bFW3+SiV6Gw+JL9ocE62OU8STq46Hll26TrkiCgAAAAAoikQUAAAAAFAUiSgAAAAAoChz9+E/bPaSpL9LuiDp9XEPLh0tzvnaysKeZda+3xtpRVsLEml9QbS6SSXnmtUU5URr8/3Yxn2t9uVIWl0LHml8ppRr74i+VuoLaeXcd2QS9MvJRZ+TpWHxrPry+ri8UEqzWdHH3f2VBOUAAAAAAGYAt+YCAAAAAIqaNhF1ST83s3UzW0oREAAAAACgbdPemvtRdz9rZtdIetLMfuvuJ7cf0CWoS5L0rncUfWwpAAAAACCgqTJDdz/bfT1vZo9JukXSyd4xD0h6QJKusEPe4gLmXJsS1CwnlWjtHS2evpobWkTrO321Y4n0oPtom5jUbptcom+kE+1h80NEq9NI47xV0eavVKK38ZD4os0x0eo4RTy56nho2aXrdPCtuWZ2uZm95eL3kj4l6VSqwAAAAAAAbZrmiui1kh4zs4vl/Mjdf5YkKgAAAABAswYnou7+oqTFhLEAAAAAAGaAuXuxk40W53xtZWHPY2rf7420oq0FibS+IFrdpJJzzWqKcqK1+X5s475W+3Ikra4FjzQ+U8q1d0RfK/WFtHLuOzIJ+uXkos/J0rB4Vn153d1H447jOaIAAAAAgKJIRAEAAAAARZGIAgAAAACKIhEFAAAAABQ1zeNbLtnmxnyTC5hzbUpQs5xUorV3tHj6am5oEa3v9NWOJdKD7qNtYlK7bXKJvpFOtIfNDxGtTiON81ZFm79Sid7GQ+KLNsdEq+MU8eSq46Fll65TrogCAAAAAIoiEQUAAAAAFEUiCgAAAAAoyty92MlGi3O+trKw5zG17/dGWtHWgkRaXxCtblLJuWY1RTnR2nw/tnFfq305klbXgkcanynl2juir5X6Qlo59x2ZBP1yctHnZGlYPKu+vO7uo3HHcUUUAAAAAFAUiSgAAAAAoCgSUQAAAABAUWMTUTM7YWbnzezUtvcOmdmTZvZC9/XKvGECAAAAAFpxcIJjHpT0XUkPb3vvuKSn3P0+Mzvevf7auII2N+abXMCca1OCmuWkEq29o8XTV3NDi2h9p692LJEedB9tE5PabZNL9I10oj1sfohodRppnLcq2vyVSvQ2HhJftDkmWh2niCdXHQ8tu3Sdjr0i6u4nJb3ae/uopIe67x+S9OnEcQEAAAAAGjV0jei17n5Okrqv16QLCQAAAADQskluzZ2KmS1JWpKkOc3nPh0AAAAAIDhz9/EHmb1H0k/d/QPd6+clHXH3c2Z2naRfuPuN48oZLc752srCnsfUvt8baUVbCxJpfUG0ukkl55rVFOVEa/P92MZ9rfblSFpdCx5pfKaUa++IvlbqC2nl3HdkEvTLyUWfk6Vh8az68rq7j8YdN/TW3Mcl3d19f7eknwwsBwAAAAAwYyZ5fMsjkn4p6UYzO2NmX5J0n6RPmtkLkj7ZvQYAAAAAYKyxa0Td/a5d/uoTiWMBAAAAAMyAobfmAgAAAAAwSPZdc7fb3JhvcgF
zrk0JapaTSrT2jhZPX80NLaL1nb7asUR60H20TUxqt00u0TfSifaw+SGi1Wmkcd6qaPNXKtHbeEh80eaYaHWcIp5cdTy07NJ1yhVRAAAAAEBRJKIAAAAAgKJIRAEAAAAARZm7FzvZaHHO11YW9jym9v3eSCvaWpBI6wui1U0qOdespignWpvvxzbua7UvR9LqWvBI4zOlXHtH9LVSX0gr574jk6BfTi76nCwNi2fVl9fdfTTuOK6IAgAAAACKIhEFAAAAABRFIgoAAAAAKIpEFAAAAABQ1MGSJ9vcmG9yAXOuTQlqlpNKtPaOFk9fzQ0tovWdvtqxRHrQfbRNTGq3TS7RN9KJ9rD5IaLVaaRx3qpo81cq0dt4SHzR5phodZwinlx1PLTs0nXKFVEAAAAAQFEkogAAAACAosYmomZ2wszOm9mpbe/da2Z/MrNnuz935A0TAAAAANAKc/e9DzD7mKR/SHrY3T/QvXevpH+4+zcv5WSjxTlfW1nY85ja93sjrWhrQSKtL4hWN6nkXLOaopxobb4f27iv1b4cSatrwSONz5Ry7R3R10p9Ia2c+45Mgn45uehzsjQsnlVfXnf30bjjxl4RdfeTkl695AgAAAAAANjBNGtEj5nZRnfr7pXJIgIAAAAANG1oInq/pPdKOizpnKRv7XagmS2Z2TNm9syf/3Jh4OkAAAAAAK0YlIi6+8vufsHd/y3pB5Ju2ePYB9x95O6jq992YGicAAAAAIBGHBzyQ2Z2nbuf615+RtKpvY6/aHNjvskFzLk2JahZTirR2jtaPH01N7SI1nf6ascS6UH30TYxqd02uUTfSCfaw+aHiFankcZ5q6LNX6lEb+Mh8UWbY6LVcYp4ctXx0LJL1+nYRNTMHpF0RNJVZnZG0tclHTGzw5Jc0kuSvpIxRgAAAABAQ8Ymou5+1w5v/zBDLAAAAACAGTDNrrkAAAAAAFwyc/diJxstzvnaysKex9S+3xtpRVsLEml9QbS6SSXnmtUU5URr8/3Yxn2t9uVIWl0LHml8ppRr74i+VuoLaeXcd2QS9MvJRZ+TpWHxrPryuruPxh3HFVEAAAAAQFEkogAAAACAokhEAQAAAABFkYgCAAAAAIoa+/iWlDY35ptcwJxrU4Ka5aQSrb2jxdNXc0OLaH2nr3YskR50H20Tk9ptk0v0jXSiPWx+iGh1Gmmctyra/JVK9DYeEl+0OSZaHaeIJ1cdDy27dJ1yRRQAAAAAUBSJKAAAAACgKBJRAAAAAEBR5u7FTjZanPO1lYU9j6l9vzfSirYWJNL6gmh1k0rONaspyonW5vuxjfta7cuRtLoWPNL4TCnX3hF9rdQX0sq578gk6JeTiz4nS8PiWfXldXcfjTuOK6IAAAAAgKJIRAEAAAAARZGIAgAAAACKIhEFAAAAABRVdLMiM/uzpN9LukrSK8VODOAixh5QD+MPqIOxB5T1bne/etxBRRPR/57U7JlJdlICkBZjD6iH8QfUwdgDYuLWXAAAAABAUSSiAAAAAICiaiWiD1Q6LzDrGHtAPYw/oA7GHhBQlTWiAAAAAIDZxa25AAAAAICiSEQBAAAAAEWRiAIAAAAAiiIRBQAAAAAURSIKAAAAACjqPzBOXCTWjwJ7AAAAAElFTkSuQmCC\n", 1113 | "text/plain": [ 1114 | "
" 1115 | ] 1116 | }, 1117 | "metadata": { 1118 | "needs_background": "light" 1119 | }, 1120 | "output_type": "display_data" 1121 | } 1122 | ], 1123 | "source": [ 1124 | "plt.matshow(hmv.transform(mols))" 1125 | ] 1126 | }, 1127 | { 1128 | "cell_type": "markdown", 1129 | "metadata": {}, 1130 | "source": [ 1131 | "### RDkit Fingerprint" 1132 | ] 1133 | }, 1134 | { 1135 | "cell_type": "code", 1136 | "execution_count": 46, 1137 | "metadata": { 1138 | "scrolled": true 1139 | }, 1140 | "outputs": [ 1141 | { 1142 | "data": { 1143 | "text/plain": [ 1144 | "" 1145 | ] 1146 | }, 1147 | "execution_count": 46, 1148 | "metadata": {}, 1149 | "output_type": "execute_result" 1150 | }, 1151 | { 1152 | "data": { 1153 | "image/png": "\n", 1154 | "text/plain": [ 1155 | "
" 1156 | ] 1157 | }, 1158 | "metadata": { 1159 | "needs_background": "light" 1160 | }, 1161 | "output_type": "display_data" 1162 | } 1163 | ], 1164 | "source": [ 1165 | "from molvecgen.vectorizers import HashedRDKVectorizer\n", 1166 | "hmv = HashedRDKVectorizer(nBits=1024)\n", 1167 | "\n", 1168 | "plt.matshow(hmv.transform(mols))" 1169 | ] 1170 | }, 1171 | { 1172 | "cell_type": "code", 1173 | "execution_count": null, 1174 | "metadata": {}, 1175 | "outputs": [], 1176 | "source": [] 1177 | } 1178 | ], 1179 | "metadata": { 1180 | "kernelspec": { 1181 | "display_name": "tf2", 1182 | "language": "python", 1183 | "name": "tf2" 1184 | }, 1185 | "language_info": { 1186 | "codemirror_mode": { 1187 | "name": "ipython", 1188 | "version": 3 1189 | }, 1190 | "file_extension": ".py", 1191 | "mimetype": "text/x-python", 1192 | "name": "python", 1193 | "nbconvert_exporter": "python", 1194 | "pygments_lexer": "ipython3", 1195 | "version": "3.6.10" 1196 | } 1197 | }, 1198 | "nbformat": 4, 1199 | "nbformat_minor": 4 1200 | } 1201 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Esben Jannik Bjerrum 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # molvecgen 2 | Molecular vectorization and batch generation. A further development of the SMILES enumeration package: https://github.com/EBjerrum/SMILES-enumeration 3 | 4 | # Installation 5 | If you downloaded/cloned the code: 6 | 7 | ```bash 8 | python setup.py install 9 | ``` 10 | 11 | or directly from the repository 12 | 13 | ```bash 14 | python -m pip install git+https://github.com/EBjerrum/molvecgen 15 | ``` 16 | 17 | # Usage 18 | See some basic examples in the Examples.ipynb 19 | 20 | 21 | ## Bibliography 22 | 23 | If you use SMILES augmentation please cite: [SMILES Enumeration as Data Augmentation for Neural Network Modeling of Molecules](https://arxiv.org/abs/1703.07076) 24 | 25 | ```bibtex 26 | @article{DBLP:journals/corr/Bjerrum17, 27 | author = {Esben Jannik Bjerrum}, 28 | title = {{SMILES} Enumeration as Data Augmentation for Neural Network Modeling 29 | of Molecules}, 30 | journal = {CoRR}, 31 | volume = {abs/1703.07076}, 32 | year = {2017}, 33 | url = {http://arxiv.org/abs/1703.07076}, 34 | timestamp = {Wed, 07 Jun 2017 14:40:38 +0200}, 35 | biburl = {http://dblp.uni-trier.de/rec/bib/journals/corr/Bjerrum17}, 36 | bibsource = {dblp computer science bibliography, http://dblp.org} 37 | } 38 | ``` 39 | 40 | 41 | 
import math as m
import threading

import numpy as np


class Sequence(object):
    """Base object for fitting to a sequence of data, such as a dataset.

    The abstract method `__getitem__` should be overridden and should return
    a complete batch, contained in a tuple. The mini-batches are chosen
    sequentially from an index list that is reshuffled on each epoch.

    Notes:
        `Sequence` is a safer way to do multiprocessing. This structure
        guarantees that the network will only train once on each sample per
        epoch, which is not the case with generators.

    :parameter x: Array-like with the input samples.
    :parameter y: Array-like with the targets, or None when there are none.
    :parameter vectorizer: Object used by subclasses to transform a batch of x.
    :parameter batch_size: Integer, intended number of samples per batch.
    :raises ValueError: If y is given and its length differs from x.
    """

    def __init__(self, x, y, vectorizer=None, batch_size=32):
        if y is not None and len(x) != len(y):
            raise ValueError('X (features) and y (labels) '
                             'should have the same length. '
                             'Found: X.shape = %s, y.shape = %s' %
                             (np.asarray(x).shape, np.asarray(y).shape))

        self.x = np.asarray(x)
        self.y = np.asarray(y) if y is not None else None

        self.vectorizer = vectorizer
        self.batch_size = batch_size
        self._shuffle_index()

    def __getitem__(self, index):
        """Gets batch at position `index`.

        Arguments:
            index: position of the batch in the Sequence.
        Returns:
            A batch
        """
        raise NotImplementedError

    def __len__(self):
        """Number of batches in the Sequence.

        Prints a notice when the trailing partial batch is smaller than a
        quarter of `batch_size`.

        Returns:
            The number of batches in the Sequence (0 for an empty dataset).
        """
        n_samples = len(self.x)
        n_batches = m.ceil(n_samples / self.batch_size)
        # The trailing batch holds the remainder after filling whole batches.
        # It must be taken modulo batch_size; taking it modulo the number of
        # batches gives an unrelated number and divides by zero when the
        # dataset is empty.
        l_last = n_samples % self.batch_size
        if 0 < l_last < self.batch_size / 4:
            print("Last batch will be %i samples which is significantly lower than intended %i. This can lead to unstable training."%(l_last, self.batch_size))
        return n_batches

    def __iter__(self):
        """Create a generator that iterates over the batches of the Sequence."""
        for i in range(len(self)):
            yield self[i]

    def _shuffle_index(self):
        """Create/reshuffle the sample index used to draw the mini-batches
        in a random fashion."""
        self.index = np.random.permutation(len(self.x))

    def get_samples_idx(self, idx):
        """Get the indices of the samples, from a given minibatch index."""
        start = idx * self.batch_size
        return self.index[start:start + self.batch_size]

    def on_epoch_end(self):
        """Method called at the end of every epoch. Reshuffles the index to
        create randomly selected mini-batches.
        """
        self._shuffle_index()


class Iterator(object):
    """Abstract base class for data iterators.

    Subclasses must provide a `next()` method; `__next__` delegates to it.

    :parameter n: Integer, total number of samples in the dataset to loop over.
    :parameter batch_size: Integer, size of a batch.
    :parameter shuffle: Boolean, whether to shuffle the data between epochs.
    :parameter seed: Random seeding for data shuffling.
    :raises ValueError: If n is smaller than batch_size.
    """

    def __init__(self, n, batch_size, shuffle, seed):
        # Validate before any state (lock, generator) is created.
        if n < batch_size:
            raise ValueError('Input data length is shorter than batch_size\nAdjust batch_size')
        self.n = n
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.batch_index = 0
        self.total_batches_seen = 0
        self.lock = threading.Lock()
        self.index_generator = self._flow_index(n, batch_size, shuffle, seed)

    def reset(self):
        """Restart batching from the beginning of the index array."""
        self.batch_index = 0

    def _flow_index(self, n, batch_size=32, shuffle=False, seed=None):
        """Endless generator of `(index_array_slice, start, size)` tuples.

        A new index array is built (and shuffled, if requested) whenever a
        pass over the data starts; the last batch of a pass may be shorter
        than `batch_size`.
        """
        # Ensure self.batch_index is 0.
        self.reset()
        while True:
            if seed is not None:
                # Re-seed per batch so a fixed seed reproduces the same stream.
                np.random.seed(seed + self.total_batches_seen)
            if self.batch_index == 0:
                index_array = np.arange(n)
                if shuffle:
                    index_array = np.random.permutation(n)

            current_index = (self.batch_index * batch_size) % n
            if n > current_index + batch_size:
                current_batch_size = batch_size
                self.batch_index += 1
            else:
                # Final (possibly short) batch of this pass; start over next.
                current_batch_size = n - current_index
                self.batch_index = 0
            self.total_batches_seen += 1
            yield (index_array[current_index: current_index + current_batch_size],
                   current_index, current_batch_size)

    def __iter__(self):
        # Needed if we want to do something like:
        # for x, y in data_gen.flow(...):
        return self

    def __next__(self, *args, **kwargs):
        return self.next(*args, **kwargs)


class SmilesSequence(Sequence):
    """Sequence yielding vectorized SMILES.

    Initialize with X, y and a vectorizer implementing a batch-wise
    `transform` method.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __getitem__(self, idx):
        """Returns the vectorized samples of mini-batch `idx`.

        :returns: `(batch_x, batch_y)`, or only `batch_x` when the sequence
            was created with `y=None` (same convention as
            `SmilesGenerator.next`).
        """
        samples = self.get_samples_idx(idx)
        batch_x = self.vectorizer.transform(self.x[samples])
        if self.y is None:
            # The base class accepts y=None; indexing None would raise.
            return batch_x
        return (batch_x, self.y[samples])


class SmilesGenerator(Iterator):
    """Iterator yielding data from a SMILES array.

    :parameter x: Numpy array of SMILES input data.
    :parameter y: Numpy array of targets data.
    :parameter vectorizer: Instance of molecular vectorizer
    :parameter batch_size: Integer, size of a batch.
    :parameter shuffle: Boolean, whether to shuffle the data between epochs.
    :parameter seed: Random seed for data shuffling.
    :parameter dtype: dtype to use for returned batch. Set to keras.backend.floatx if using Keras
    :raises ValueError: If y is given and its length differs from x, or if
        there are fewer samples than `batch_size`.
    """

    def __init__(self, x, y, vectorizer,
                 batch_size=32, shuffle=False, seed=None,
                 dtype=np.float32
                 ):
        if y is not None and len(x) != len(y):
            raise ValueError('X (images tensor) and y (labels) '
                             'should have the same length. '
                             'Found: X.shape = %s, y.shape = %s' %
                             (np.asarray(x).shape, np.asarray(y).shape))

        self.x = np.asarray(x)
        self.y = np.asarray(y) if y is not None else None
        self.vectorizer = vectorizer
        self.dtype = dtype
        super().__init__(len(x), batch_size, shuffle, seed)

    def next(self):
        """Return the next batch.

        The X is directly the vectorized format and y is as supplied.

        :returns: `(batch_x, batch_y)`, or only `batch_x` when y is None.
        """
        # Keeps under lock only the mechanism which advances
        # the indexing of each batch.
        with self.lock:
            index_array, current_index, current_batch_size = next(self.index_generator)
        # The vectorization itself is not under thread lock
        # so it can be done in parallel
        batch_x = np.zeros(tuple([current_batch_size] + list(self.vectorizer.dims)), dtype=self.dtype)
        for i, j in enumerate(index_array):
            # Slice (not index) so the vectorizer receives a length-1 array.
            batch_x[i] = self.vectorizer.transform(self.x[j:j+1])

        if self.y is None:
            return batch_x
        return batch_x, self.y[index_array]


class HetSmilesGenerator(SmilesGenerator):
    """Hetero (maybe) generator class, for use to train the autoencoder.

    smilesvectorizer creates the input for the encoder
        Can be left_padded
    smilesvectorizer_2 creates the teacher input for the decoder + output.
        Must be right_padded. Output for decoder left shifted 1 pos, so no startchar.
    """
    def __init__(self, x, y, smilesvectorizer, smilesvectorizer_2,
                 batch_size=32, shuffle=False, seed=None,
                 dtype=np.float32):
        super().__init__(x, y, smilesvectorizer,
                         batch_size=batch_size, shuffle=shuffle, seed=seed,
                         dtype=dtype)
        self.smilesvectorizer = smilesvectorizer
        self.smilesvectorizer_2 = smilesvectorizer_2

    def next(self):
        """Return the next autoencoder batch.

        :returns: `[encoder_input, decoder_input], decoder_output`, where the
            decoder input drops the last position and the decoder output
            drops the first position of the `smilesvectorizer_2` encoding.
        """
        # Keeps under lock only the mechanism which advances
        # the indexing of each batch.
        with self.lock:
            index_array, current_index, current_batch_size = next(self.index_generator)

        self.enc_dims = list(self.smilesvectorizer.dims)
        # The decoder arrays are filled from smilesvectorizer_2, so their
        # shape must come from that vectorizer; using the encoder's dims only
        # works when both vectorizers happen to share the same dims.
        # Subtract one from the length to prepare for the left shifting of output.
        self.dec_dims = list(self.smilesvectorizer_2.dims)
        self.dec_dims[0] = self.dec_dims[0]-1

        # Prepare output arrays
        batch_1D = np.zeros(tuple([current_batch_size] + self.enc_dims), dtype=self.dtype)
        batch_1D_i = np.zeros(tuple([current_batch_size] + self.dec_dims), dtype=self.dtype)
        batch_1D_o = np.zeros(tuple([current_batch_size] + self.dec_dims), dtype=self.dtype)

        #TODO Maybe vectorize this, transform already has a for loop
        for i, j in enumerate(index_array):
            mol = self.x[j:j+1]

            chem1d_enc = self.smilesvectorizer.transform(mol)
            chem1d_dec = self.smilesvectorizer_2.transform(mol)

            batch_1D[i] = chem1d_enc
            batch_1D_i[i] = chem1d_dec[:,0:-1,:] #Including start_char
            batch_1D_o[i] = chem1d_dec[:,1:,:] #No start_char

        return [batch_1D, batch_1D_i], batch_1D_o
centers 15 | :parameter augment: Enumerate the SMILES during transform 16 | :parameter canonical: use canonical SMILES during transform (overrides enum) 17 | :parameter binary: Use RDKit binary strings instead of molecule objects 18 | """ 19 | def __init__(self, charset = '@C)(=cOn1S2/H[N]\\', pad=10, maxlength=120, leftpad=True, isomericSmiles=True, augment=True, canonical=False, startchar = '^', endchar = '$', unknownchar = '?', binary=False): 20 | #Special Characters 21 | self.startchar = startchar 22 | self.endchar = endchar 23 | self.unknownchar = unknownchar 24 | 25 | 26 | #Vectorization and SMILES options 27 | self.binary = binary 28 | self.leftpad = leftpad 29 | self.isomericSmiles = isomericSmiles 30 | self.augment = augment 31 | self.canonical = canonical 32 | self._pad = pad 33 | self._maxlength = maxlength 34 | 35 | #The characterset 36 | self._charset = None 37 | self.charset = charset 38 | 39 | #Calculate the dimensions 40 | self.setdims() 41 | 42 | @property 43 | def charset(self): 44 | return self._charset 45 | 46 | @charset.setter 47 | def charset(self, charset): 48 | #Ensure start and endchars are in the charset 49 | for char in [self.startchar, self.endchar, self.unknownchar]: 50 | if char not in charset: 51 | charset = charset + char 52 | #Set the hidden properties 53 | self._charset = charset 54 | self._charlen = len(charset) 55 | self._char_to_int = dict((c,i) for i,c in enumerate(charset)) 56 | self._int_to_char = dict((i,c) for i,c in enumerate(charset)) 57 | self.setdims() 58 | 59 | @property 60 | def maxlength(self): 61 | return self._maxlength 62 | 63 | @maxlength.setter 64 | def maxlength(self, maxlength): 65 | self._maxlength = maxlength 66 | self.setdims() 67 | 68 | @property 69 | def pad(self): 70 | return self._pad 71 | 72 | @pad.setter 73 | def pad(self, pad): 74 | self._pad = pad 75 | self.setdims() 76 | 77 | def setdims(self): 78 | """Calculates and sets the output dimensions of the vectorized molecules from the current 
settings""" 79 | self.dims = (self.maxlength + self.pad, self._charlen) 80 | 81 | 82 | def fit(self, mols, extra_chars=[]): 83 | """Performs extraction of the charset and length of a SMILES datasets and sets self.maxlength and self.charset 84 | 85 | :parameter smiles: Numpy array or Pandas series containing smiles as strings 86 | :parameter extra_chars: List of extra chars to add to the charset (e.g. "\\\\" when "/" is present) 87 | """ 88 | smiles = [Chem.MolToSmiles(mol) for mol in mols] 89 | charset = set("".join(list(smiles))) #Is there a smarter way when the list of SMILES is HUGE! 90 | self.charset = "".join(charset.union(set(extra_chars))) 91 | self.maxlength = max([len(smile) for smile in smiles]) 92 | 93 | def randomize_smiles(self, smiles): 94 | """Perform a randomization of a SMILES string 95 | must be RDKit sanitizable""" 96 | mol = Chem.MolFromSmiles(smiles) 97 | nmol = self.randomize_mol(mol) 98 | return Chem.MolToSmiles(nmol, canonical=self.canonical, isomericSmiles=self.isomericSmiles) 99 | 100 | def randomize_mol(self, mol): 101 | """Performs a randomization of the atom order of an RDKit molecule""" 102 | ans = list(range(mol.GetNumAtoms())) 103 | np.random.shuffle(ans) 104 | return Chem.RenumberAtoms(mol,ans) 105 | 106 | def transform(self, mols, augment=None, canonical=None): 107 | """Perform an enumeration (atom order randomization) and vectorization of a Numpy array of RDkit molecules 108 | 109 | :parameter mols: The RDKit molecules to transform in a list or array 110 | :parameter augment: Override the objects .augment setting 111 | :parameter canonical: Override the objects .canonical setting 112 | 113 | :output: Numpy array with the vectorized molecules with shape [batch, maxlength+pad, charset] 114 | """ 115 | #TODO make it possible to use both SMILES, RDKit mols and RDKit binary strings in input 116 | one_hot = np.zeros([len(mols)] + list(self.dims), dtype=np.int8) 117 | 118 | #Possibl override object settings 119 | if augment is None: 120 
| augment = self.augment 121 | if canonical is None: 122 | canonical = self.canonical 123 | 124 | for i,mol in enumerate(mols): 125 | 126 | #Fast convert from RDKit binary 127 | if self.binary: mol = Chem.Mol(mol) 128 | 129 | if augment: 130 | mol = self.randomize_mol(mol) 131 | ss = Chem.MolToSmiles(mol, canonical=canonical, isomericSmiles=self.isomericSmiles) 132 | 133 | #TODO, Improvement make it robust to too long SMILES strings 134 | #TODO, Improvement make a "jitter", with random offset within the possible frame 135 | #TODO, Improvement make it report to many "?"'s 136 | 137 | l = len(ss) 138 | if self.leftpad: 139 | offset = self.dims[0]-l-1 140 | else: 141 | offset = 1 142 | 143 | for j,c in enumerate(ss): 144 | charidx = self._char_to_int.get(c, self._char_to_int[self.unknownchar]) 145 | one_hot[i,j+offset,charidx] = 1 146 | 147 | #Pad the start 148 | one_hot[i,offset-1,self._char_to_int[self.startchar]] = 1 149 | #Pad the end 150 | one_hot[i,offset+l:,self._char_to_int[self.endchar]] = 1 151 | #Pad the space in front of start (Could this lead to funky effects during sampling?) 152 | #one_hot[i,:offset-1,self._char_to_int[self.endchar]] = 1 153 | 154 | return one_hot 155 | 156 | 157 | def reverse_transform(self, vect, strip=True): 158 | """ Performs a conversion of a vectorized SMILES to a SMILES strings 159 | charset must be the same as used for vectorization. 160 | 161 | :parameter vect: Numpy array of vectorized SMILES. 
162 | :parameter strip: Strip start and end tokens from the SMILES string 163 | """ 164 | #TODO make it possible to take a single vectorized molecule, not a list 165 | 166 | smiles = [] 167 | for v in vect: 168 | #mask v 169 | v=v[v.sum(axis=1)==1] 170 | #Find one hot encoded index with argmax, translate to char and join to string 171 | smile = "".join(self._int_to_char[i] for i in v.argmax(axis=1)) 172 | if strip: 173 | smile = smile.strip(self.startchar + self.endchar) 174 | smiles.append(smile) 175 | return np.array(smiles) 176 | 177 | from rdkit import DataStructs 178 | from rdkit.Chem import AllChem 179 | 180 | 181 | class ChemceptionVectorizer(object): 182 | """ 183 | Chemception Vectorizer turns RDKit molecules into 2D chemception "images" with embedded 184 | molecular information in the different layers. 185 | 186 | Data augmentation is possible and controlled via the .augment property. 187 | The RDKit molecules are rotated randomly in the drawing plane before vectorizaton. 188 | If .flip is set to true they are also flipped in 50% of the cases. Should not be used 189 | if theres embedded stereo information. Molecular coordinates are also randomly moved 190 | 0 to 0.5 units in the X and Y direction (jitter). 191 | 192 | The dimension of X and Y are int(self.embed*2/self.res) and the number of channels is 8 193 | 194 | The channels contains 3 layers with z-scale atom embedding, 1 layer bond information, 195 | and 4 layers with Hybridization, Gasteigercharge, Valence and Aromatic flag. 196 | 197 | 198 | """ 199 | def __init__(self, embed=16.0, resolution = 0.5, augment=True, flip=True, jitter=0.5, rotation=180): 200 | """ 201 | :parameter embed: the size of the embedding array. The embedding is specified in coordinate units which is approximate 1 Aangstrom for RDKit 202 | :parameter res: This is the resolution or the size of the "pixels" in coordinate space 203 | :parameter augment: Do rotation, jitter and evt. 
flip on coordinates before embedding 204 | :parameter jitter: maximum random movement of coords in X and Y dimensions 205 | :parameter rotation: Angle in degrees that molecule will be rotated in X and Y plane before 206 | embedding 207 | :parameter flip: If True the molecule will be randomly flipped around X axis 50% of times 208 | (Don't use if Stereo information is embedded). 209 | 210 | """ 211 | self.embed = embed 212 | self.res = resolution 213 | self.augment=augment 214 | self.flip = flip 215 | self.jitter = jitter 216 | self.rotation = rotation 217 | self._xdim = self._ydim = int(self.embed*2/self.res) 218 | self._channels = 8 219 | self.dims = (self._xdim, self._ydim, self._channels) #Tensorflow order, channels are last 220 | self.z_scales = { 1: [0.91380303970722476, 0.73165158255080509, 0.57733314804364055], 221 | 5: [0.30595181177678765, 1.0, 0.12627398479525687], 222 | 6: [0.5188990175807231, 0.78421022815543329, 0.0], 223 | 7: [1.0, 0.69228268136793891, 1.0], 224 | 8: [0.8483958101521436, 0.385020123655062, 0.22145975047392397], 225 | 9: [0.96509304017607156, 0.0, 0.063173404499183961], 226 | 14: [0.0, 0.71343324836845667, 0.44224053276558339], 227 | 15: [0.41652648238720696, 0.62420516558479067, 0.76653406760644893], 228 | 16: [0.33207883139905292, 0.43675896202099707, 0.37174865102406474], 229 | 17: [0.43492872745120104, 0.080933563161888766, 0.25506635487736889], 230 | 34: [0.25220492897252711, 0.41968099329754543, 0.42430137548104002], 231 | 35: [0.35428255473513803, 0.029790668511527674, 0.52407440196510457], 232 | 53: [0.12908295522613494, 0.057208840097424385, 0.77529938134095022]} 233 | 234 | def fit(self, mols, extra_pad = 5): 235 | """To be done, it could be nice to be able to precalculate the approximate embedding from a dataset""" 236 | print("TBD") 237 | 238 | def preprocess_mols(self, mols): 239 | """Calculate GasteigerCharges and 2D coordinates for the molecules 240 | 241 | :parameter mols: RDKit molecules to be processed in a list or 
array 242 | :returns: preprocessed RDKit molecules in a Numpy array 243 | 244 | """ 245 | mols_o = [] 246 | for i,mol in enumerate(mols): 247 | cmol = Chem.Mol(mol.ToBinary()) 248 | cmol.ComputeGasteigerCharges() 249 | AllChem.Compute2DCoords(cmol) 250 | mols_o.append(cmol) 251 | return np.array(mols_o) 252 | 253 | 254 | def _rotate_coords(self, origin, points, angle): 255 | """ 256 | Rotate coordinates counterclockwise with specified angle and origin. 257 | 258 | :parameter origin: coordinate for rotation center 259 | :parameter points: numpy array with 2D coordinates 260 | :parameter angle: The rotation angle in degrees 261 | :return: Rotated coordinates 262 | """ 263 | ox, oy = origin 264 | 265 | coords_o = np.zeros((points.shape[0], 2)) 266 | 267 | cosa = math.cos(math.radians(angle)) 268 | sina = math.sin(math.radians(angle)) 269 | 270 | coords_o[:,0] = ox + cosa * (points[:,0] - ox) - sina * (points[:,1] - oy) 271 | coords_o[:,1] = oy + sina * (points[:,0] - ox) + cosa * (points[:,1] - oy) 272 | return coords_o 273 | 274 | 275 | def vectorize_mol(self, mol, augment=None): 276 | """Vectorizes a single RDKit mol object into a 2D "image" numpy array 277 | 278 | :parameter mol: RDKit mol with precomputed 2D coords and Gasteiger Charges 279 | :parameter augment: Overrule objects .augment, useful for consistency in predictions 280 | 281 | """ 282 | coords = mol.GetConformer(0).GetPositions() 283 | 284 | if augment is None: 285 | augment = self.augment 286 | 287 | if augment: 288 | #Rotate + jitter + flip coords. 
289 | rot = np.random.random()*self.rotation 290 | coords = self._rotate_coords((0,0), coords, rot) 291 | 292 | jitter_x = np.random.random()*2*self.jitter - self.jitter 293 | jitter_y = np.random.random()*2*self.jitter - self.jitter 294 | coords = coords + np.array([[jitter_x, jitter_y]]) 295 | 296 | if self.flip: 297 | flip_choice = np.random.random() 298 | #print(flip_choice) 299 | if flip_choice > 0.5: 300 | #print("Flip") 301 | coords = coords[:,::-1] #Flip around X-axis 302 | 303 | vect = np.zeros(self.dims, dtype='float32') 304 | #Bonds first 305 | for i,bond in enumerate(mol.GetBonds()): 306 | #TODO Future: add stereo-info? 307 | bondorder = bond.GetBondTypeAsDouble() 308 | bidx = bond.GetBeginAtomIdx() 309 | eidx = bond.GetEndAtomIdx() 310 | bcoords = coords[bidx] 311 | ecoords = coords[eidx] 312 | frac = np.linspace(0,1,int(1/self.res*2)) #with a res of 0.5 this should be adequate#TODO implement automatic determination/better line drawing algoritm. 313 | for f in frac: 314 | c = (f*bcoords + (1-f)*ecoords) 315 | idx = int(round((c[0] + self.embed)/self.res)) 316 | idy = int(round((c[1]+ self.embed)/self.res)) 317 | vect[ idx , idy ,0] = bondorder 318 | 319 | #Atoms and properties 320 | for i,atom in enumerate(mol.GetAtoms()): 321 | idx = int(round((coords[i][0] + self.embed)/self.res)) 322 | idy = int(round((coords[i][1]+ self.embed)/self.res)) 323 | if (idx > vect.shape[0]) or (idy > vect.shape[1]): 324 | print("WARNING: atom outside embedding, consider increasing embedding") 325 | continue 326 | else: 327 | scales = self.z_scales[ atom.GetAtomicNum() ] 328 | vect[ idx , idy, 1] = scales[0] 329 | vect[ idx , idy, 2] = scales[1] 330 | vect[ idx , idy, 3] = scales[2] 331 | hyptype = atom.GetHybridization().real 332 | vect[ idx , idy, 4] = hyptype 333 | charge = atom.GetProp("_GasteigerCharge") 334 | vect[ idx , idy, 5] = charge 335 | valence = atom.GetTotalValence() 336 | vect[ idx , idy, 6] = valence 337 | isarom = atom.GetIsAromatic() 338 | vect[ idx , 
idy, 7] = isarom 339 | 340 | #Remove Nans if present 341 | if np.sum(np.isnan(vect)) > 0: 342 | vect[np.isnan(vect)] = 0 343 | 344 | return vect 345 | 346 | def transform(self, mols, augment=None): 347 | """Batch vectorization of molecules 348 | 349 | :parameter mols: RDKit mols with precomputed 2D coords and Gasteiger Charges 350 | :parameter augment: boolean. Overrides objects .augment setting if not None 351 | :returns: Numpy array with the chemception images. Shape [number_mols, xdim, ydim, channels] 352 | 353 | """ 354 | if len(mols.shape) > 1: 355 | mols = mols.reshape(-1) #TODO: What if Pandas? 356 | 357 | mols_array = np.zeros([len(mols)] + list(self.dims)) 358 | 359 | for i,mol in enumerate(mols): 360 | mols_array[i] = self.vectorize_mol(mol, augment = augment) 361 | 362 | return mols_array 363 | 364 | 365 | 366 | class MorganDictVectorizer(object): 367 | def __init__(self, radius=2, augment=None): 368 | self.radius = radius 369 | self.augment = augment #Not used 370 | self.dims = None 371 | 372 | def fit(self, mols): 373 | """Analyses the molecules and creates the key index for the creation of the dense array""" 374 | keys=set() 375 | for mol in mols: 376 | fp = AllChem.GetMorganFingerprint(mol,self.radius) 377 | keys.update(fp.GetNonzeroElements().keys()) 378 | keys = list(keys) 379 | keys.sort() 380 | self.keys= np.array(keys) 381 | self.dims = len(self.keys) 382 | 383 | def transform_mol(self, mol, misses=False, binary=False): 384 | """ transforms the mol into a dense array using the fitted keys as index 385 | 386 | :parameter mol: the RDKit molecule to be transformed 387 | :parameter misses: wheter to return the number of key misses for the molecule 388 | """ 389 | assert type(self.keys) is np.ndarray, "keys are not defined or is not an np.array, has the .fit(mols) function been used?" 
390 | #Get fingerprint as a dictionary 391 | fp = AllChem.GetMorganFingerprint(mol,self.radius) 392 | fp_d = fp.GetNonzeroElements() 393 | 394 | if binary: 395 | return np.isin(self.keys, list(fp_d.keys()), assume_unique=True) 396 | 397 | #Prepare the array, and set the values 398 | #TODO is there a way to vectorize and speed up this? 399 | arr = np.zeros((self.dims,)) 400 | _misses = 0 401 | for key, value in fp_d.items(): 402 | if key in self.keys: 403 | arr[self.keys == key] = value 404 | else: 405 | _misses = _misses + 1 406 | 407 | if misses: 408 | return arr, _misses 409 | else: 410 | return arr 411 | 412 | def transform(self, mols, misses=False, binary=False): 413 | """Transforms a list or array of RDKit molecules into a dense array using the key dictionary (see .fit()) 414 | 415 | :parameter mols: list or array of RDKit molecules 416 | :parameter misses: Wheter to return the number of key misses for each molecule 417 | :parameter binary: only binary bits, ignores misses but is faster 418 | """ 419 | arr = np.zeros((len(mols), self.dims)) 420 | 421 | if binary: 422 | for i, mol in enumerate(mols): 423 | arr[i,:] = self.transform_mol(mol, binary=True) 424 | return arr 425 | 426 | elif misses: 427 | _misses = np.zeros((len(mols),1)) 428 | for i, mol in enumerate(mols): 429 | arr[i,:], _misses[i] = self.transform_mol(mol, misses=misses) 430 | return arr, _misses 431 | else: 432 | for i, mol in enumerate(mols): 433 | arr[i,:] = self.transform_mol(mol, misses=False) 434 | return arr 435 | 436 | 437 | class HashedVectorizer(object): 438 | def __init__(self, nBits=2048, augment=None, **kwargs): 439 | self.nBits = nBits 440 | self.augment = augment #Not used 441 | self.dims = (nBits,) 442 | self.keys = None 443 | self.kwargs=kwargs 444 | 445 | def get_fp(self,mol): 446 | """Abstract method, must be overriden in subclass""" 447 | raise NotImplementedError('Abstract class instantiated, subclass, and override get_fp') 448 | 449 | def transform_mol(self, mol): 450 | """ 
transforms the molecule into a numpy bit array with the morgan bits 451 | 452 | :parameter mol: the RDKit molecule to be transformed 453 | """ 454 | fp = self.get_fp(mol) 455 | arr = np.zeros((self.nBits,)) 456 | DataStructs.ConvertToNumpyArray(fp, arr) 457 | return arr 458 | 459 | def transform(self, mols): 460 | """Transforms a list or array of RDKit molecules into an array with the Morgan bits 461 | 462 | :parameter mols: list or array of RDKit molecules 463 | """ 464 | 465 | arr = np.zeros((len(mols), self.nBits)) 466 | for i, mol in enumerate(mols): 467 | arr[i,:] = self.transform_mol(mol) 468 | return arr 469 | 470 | 471 | class HashedAPVectorizer(HashedVectorizer): 472 | def __init__(self, **kwargs): 473 | super().__init__(**kwargs) 474 | 475 | def get_fp(self, mol): 476 | return AllChem.GetHashedAtomPairFingerprint(mol,nBits=self.nBits, **self.kwargs) 477 | 478 | 479 | class HashedMorganVectorizer(HashedVectorizer): 480 | def __init__(self, radius=2, **kwargs): 481 | self.radius = radius 482 | super().__init__(**kwargs) 483 | 484 | def get_fp(self, mol): 485 | return AllChem.GetMorganFingerprintAsBitVect(mol,self.radius,nBits=self.nBits, **self.kwargs) 486 | 487 | 488 | class HashedTorsionVectorizer(HashedVectorizer): 489 | def __init__(self, **kwargs): 490 | super().__init__(**kwargs) 491 | 492 | def get_fp(self, mol): 493 | return AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(mol, nBits=self.nBits, **self.kwargs) 494 | 495 | 496 | #RDKit Fingerprints 497 | class HashedRDKVectorizer(HashedVectorizer): 498 | def __init__(self, **kwargs): 499 | super().__init__(**kwargs) 500 | 501 | def get_fp(self, mol): 502 | return Chem.rdmolops.RDKFingerprint(mol, fpSize=self.nBits, **self.kwargs) 503 | 504 | 505 | #MACCS (Not a hashed fingerprint, but with fixed length 506 | #from rdkit.Chem import MACCSkeys 507 | #fps = [MACCSkeys.GenMACCSKeys(x) for x in ms] 508 | 509 | #Avalon 510 | 511 | #2D pharmacophore fingerprint 512 | 513 | 514 | 
"""Packaging script for the molvecgen package."""
from setuptools import setup

# Runtime dependencies installed together with the package.
# 'rdkit' is left out on purpose: not available through git but conda.
INSTALL_REQUIRES = ['numpy']

# Keyword arguments handed to setuptools.setup(); no project url is set.
PACKAGE_METADATA = dict(
    name='molvecgen',
    version='0.1',
    description='molecular vectorizer and batch generator',
    author='Esben Jannik Bjerrum, kfxl284',
    author_email='esben.bjerrum@astrazeneca.com',
    license='MIT',
    packages=['molvecgen'],
    install_requires=INSTALL_REQUIRES,
    zip_safe=False,
)

setup(**PACKAGE_METADATA)