├── MANIFEST.in ├── ReadMe.txt ├── setup.py └── standardiser ├── SDF.py ├── __init__.py ├── bin ├── __init__.py ├── ipynb_to_html.sh ├── rules_applied.py └── standardise_mols.py ├── break_bonds.py ├── data ├── rules_base.dat ├── rules_specific.dat ├── rules_tautomerism_and_aromaticity.dat ├── rules_to_be_used_with_caution.dat └── salts.tsv ├── docs ├── 00_Introduction.ipynb ├── 01_break_bonds.ipynb ├── 02_neutralise.ipynb ├── 03_rules.ipynb ├── 04_unsalt.ipynb ├── 05_standardise.ipynb ├── 06_alternative.ipynb ├── 99_issue_01.ipynb ├── Charge-separated_systems.ipynb ├── Conjugated_cations_and_charge_neutralisation.ipynb ├── Hydroxy_pyridine_within_ring.ipynb ├── Keto-enol_tautomerism.ipynb ├── Miscellaeny.ipynb ├── Multiple_possible_tautomers.ipynb ├── Neutralisation_strategy.ipynb ├── Rule_application_strategy.ipynb ├── Tautomerism_and_aromaticity.ipynb ├── notebook_setup.py ├── standardiser.pdf └── standardiser.pptx ├── make_logger.py ├── neutralise.py ├── rules.py ├── rules_demo.py ├── standardise.py ├── templates ├── rules_table.html └── show_rule.html ├── test.html ├── unsalt.py └── utils.py /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include ReadMe.txt 2 | -------------------------------------------------------------------------------- /ReadMe.txt: -------------------------------------------------------------------------------- 1 | This is a tool designed to provide a simple way of standardising molecules as a prelude to e.g. molecular modelling exercises. 2 | 3 | To install, download this project and ensure that Python can find the standardiser/ directory. Note that RDKit must also be installed and accessible to Python. 4 | 5 | Or, install via pip: 6 | ``` 7 | pip install standardiser 8 | ``` 9 | 10 | Please see the IPython Notebooks in docs/ or the HTML versions in docs/html/ for more information. 
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Packaging script for the ``standardiser`` distribution."""

__author__ = 'mnowotka'

import sys

try:
    from setuptools import setup
except ImportError:
    # Bootstrap setuptools on very old environments that lack it.
    from ez_setup import use_setuptools
    use_setuptools()
    from setuptools import setup

# Read the long description up front so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open('ReadMe.txt') as readme:
    long_description = readme.read()

setup(
    name='standardiser',
    version='0.1.7',
    author='Francis Atkinson',
    author_email='francis@ebi.ac.uk',
    description='Provides a simple way of standardising molecules as a prelude to e.g. molecular modelling exercises.',
    url='https://www.ebi.ac.uk/chembldb/index.php/ws',
    license='Apache License, Version 2.0',
    entry_points={
        'console_scripts': [
            # The command-line driver in standardiser/bin/ is
            # standardise_mols.py (it defines main()); there is no
            # standardiser/bin/standardiser.py module, so the previous
            # target could not be imported by the generated script.
            'standardiser=standardiser.bin.standardise_mols:main']},
    packages=['standardiser',
              'standardiser.bin'],
    long_description=long_description,
    package_data={
        'standardiser': ['bin/*', 'data/*', 'docs/*'],
    },
    classifiers=['Development Status :: 2 - Pre-Alpha',
                 'Environment :: Console',
                 'Intended Audience :: Science/Research',
                 'License :: OSI Approved :: Apache Software License',
                 'Operating System :: OS Independent',
                 'Programming Language :: Python',
                 'Topic :: Scientific/Engineering :: Chemistry'],
    zip_safe=False,
)
#################################################################################################### 2 | # 3 | # Copyright [2014] EMBL - European Bioinformatics Institute 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in 7 | # compliance with the License. You may obtain a copy of 8 | # the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software distributed under the 13 | # License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 14 | # either express or implied. See the License for the specific language governing permissions 15 | # and limitations under the License. 16 | # 17 | #################################################################################################### 18 | 19 | """ 20 | This module provides a lightweight utility to read molecule blocks from an SD file. This is so the block can be 21 | written to a file for later inspection if RDKit cannot build a molecule from it. The readers in RDKit return 22 | nothing if they cannot build the molecule, so the molblock cannot easily be saved. 
import re

# Patterns are compiled once and written as raw strings so that regex escapes
# such as \A, \s and \$ are not treated as (invalid) Python string escapes.

# Splits a record into the connection table (up to and including "M  END")
# and the data-field section that precedes the "$$$$" delimiter.
_MOLBLOCK_RE = re.compile(r"\A(.*\nM\s+END\s*\n)(.*)\$\$\$\$", re.DOTALL)

# Matches one "> <field>" data item followed by its value and a blank line.
_DATA_ITEM_RE = re.compile(r"^>\s+<(.*?)>(?:\s*\(\d+\))?\s*\n(.*?)\s*\n\n", re.DOTALL | re.MULTILINE)

# Matches a first (title) line that is either empty or a single \w+ token.
_TITLE_RE = re.compile(r"^(\w+)?\n")


def _records(lines):
    """Yield a Molfile for every run of lines terminated by a '$$$$' line."""

    buffered = []

    for line in lines:

        buffered.append(line)

        # Compare without the newline so that a final record whose '$$$$'
        # delimiter is the last thing in the file (no trailing newline)
        # is not silently dropped.
        if line.rstrip("\n") == "$$$$":

            molblock = "".join(buffered)

            buffered = []

            yield Molfile(molblock)


def readFile(filename):
    """Iterate over the records of an SD file, yielding one Molfile each.

    filename: either a path (str), which is opened here and closed once the
        generator is exhausted or closed, or any iterable of lines (such as
        an already-open file object), which is left for the caller to manage.

    Lines following the last '$$$$' delimiter are discarded.
    """

    if isinstance(filename, str):

        with open(filename) as handle:

            for mol in _records(handle):

                yield mol

    else:

        for mol in _records(filename):

            yield mol


class Molfile(dict):
    """One SD-file record: the raw text, its molblock and its data fields.

    Attributes set by the constructor:
        original -- the complete record text as passed in
        molblock -- the text up to and including the 'M  END' line
        name     -- the title line if it is a single \\w+ token, otherwise the
                    'Name', 'name' or 'molregno' data field, otherwise a
                    generated 'mol_NNNN' using the running instance count

    Every data field is additionally stored as an instance attribute, and
    item access (mol['field']) reads those same instance attributes.
    """

    # Running count of instances created; used for generated names.
    n_mols = 0

    def __init__(self, molblock):
        """Parse a single record.

        Raises AttributeError if the text has no 'M  END' line followed by
        a '$$$$' delimiter (the block pattern does not match).
        """

        self.__class__.n_mols += 1

        self.original = molblock

        self.molblock, data = _MOLBLOCK_RE.search(molblock).groups()

        data = dict(_DATA_ITEM_RE.findall(data))

        # A title containing anything other than word characters (spaces,
        # hyphens, ...) does not match at all; treat that like an empty
        # title and fall back to the data fields instead of crashing.
        title = _TITLE_RE.match(molblock)
        title_name = title.group(1) if title else None

        self.name = title_name or data.get("Name") or data.get("name") or data.get("molregno") or "mol_{n:04d}".format(n=self.__class__.n_mols)

        # NOTE: data fields are merged last, so a field called e.g. 'name'
        # or 'molblock' replaces the attribute of the same name set above.
        self.__dict__.update(data)

    def __getitem__(self, key):
        """Return the attribute or data field called `key` (KeyError if absent)."""

        return self.__dict__[key]

    def write(self, name="junk"):
        """Write the original record text to '<name>.mol'."""

        with open(name + ".mol", "w") as handle:

            handle.write(self.original)
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/flatkinson/standardiser/107b68d8f01c4d24111902c87751f897c442e299/standardiser/bin/__init__.py -------------------------------------------------------------------------------- /standardiser/bin/ipynb_to_html.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | mkdir html 4 | 5 | \ls -1 *.ipynb | while read ipynb; do stem=`basename ${ipynb} \.ipynb`; jupyter nbconvert --to=html --output html/${stem}.html ${ipynb}; done 6 | 7 | cd html/ 8 | perl -i -p -e 's#(>> Starting mol '{name}'...".format(name=original.name)) 95 | 96 | ok = True 97 | 98 | try: 99 | 100 | if config.output_rules_applied: 101 | 102 | rules_applied = [] 103 | 104 | parent = standardise.run(original.molblock, output_rules_applied=rules_applied) 105 | 106 | else: 107 | 108 | parent = standardise.run(original.molblock) 109 | 110 | except standardise.StandardiseException as err: 111 | 112 | logger.warn(">>> {error} for '{name}'".format(error=errors[err.name], name=original.name)) 113 | 114 | counts[err.name] += 1 115 | 116 | errfile.write("{mol}> \n{nread}\n\n> \n{error}\n\n$$$$\n".format(mol=original.molblock, nread=counts["read"], error=errors[err.name])) 117 | 118 | ok = False 119 | 120 | if ok: 121 | 122 | logger.info("Mol '{name}' OK".format(name=original.name)) 123 | 124 | counts["standardised"] += 1 125 | 126 | parent = re.sub(r'^\w*\n', original.name + '\n', parent) 127 | 128 | if config.output_rules_applied: 129 | 130 | rules_applied = ';'.join(rule_names[x-1] for x in rules_applied) if rules_applied else '' 131 | 132 | outfile.write("{mol}> \n{nread}\n\n\n{rules}\n\n$$$$\n".format(mol=parent, nread=counts["read"], rules=rules_applied)) 133 | 134 | else: 135 | 136 | outfile.write("{mol}> \n{nread}\n\n$$$$\n".format(mol=parent, nread=counts["read"])) 137 | 138 | if counts["read"] % 100 == 0: logger.info("...done: {read} read, 
{standardised} OK...".format(**counts)) 139 | 140 | else: # Read/write (tab-seperated) SMILES + name... 141 | 142 | infile = csv.reader(open(config.infile), delimiter="\t") 143 | outfile = csv.writer(open(config.outfile, "w"), delimiter="\t") 144 | errfile_name = outfile_basename + "_errors." + outfile_ext 145 | errfile = csv.writer(open(errfile_name, "w"), delimiter="\t") 146 | 147 | for original in infile: 148 | 149 | counts["read"] += 1 150 | 151 | smiles, name = original 152 | 153 | logger.info(">>> Starting mol '{name}'...".format(name=name)) 154 | 155 | ok = True 156 | 157 | try: 158 | 159 | if config.output_rules_applied: 160 | 161 | rules_applied = [] 162 | 163 | parent = standardise.run(smiles, output_rules_applied=rules_applied) 164 | 165 | else: 166 | 167 | parent = standardise.run(smiles) 168 | 169 | except standardise.StandardiseException as err: 170 | 171 | logger.warn(">>> {error} for mol '{name}'".format(error=errors[err.name], name=name)) 172 | 173 | counts[err.name] += 1 174 | 175 | errfile.writerow(original + [err.name]) 176 | 177 | ok = False 178 | 179 | if ok: 180 | 181 | logger.info("Mol '{name}' OK".format(name=name)) 182 | 183 | counts["standardised"] += 1 184 | 185 | if config.output_rules_applied: 186 | 187 | rules_applied = ';'.join(rule_names[x-1] for x in rules_applied) if rules_applied else '' 188 | 189 | outfile.writerow([parent, name, smiles, rules_applied]) 190 | 191 | else: 192 | 193 | outfile.writerow([parent, name]) 194 | 195 | if counts["read"] % 100 == 0: logger.info("...done: {read} read, {standardised} OK...".format(**counts)) 196 | 197 | logger.info("Finished: {read} read, {standardised} OK in total.".format(**counts)) 198 | 199 | logger.info("Counts: " + json.dumps(counts, indent=4)) 200 | 201 | # main 202 | 203 | #################################################################################################### 204 | 205 | if __name__ == '__main__': 206 | main() 207 | 208 | 
#################################################################################################### 209 | # End 210 | #################################################################################################### 211 | -------------------------------------------------------------------------------- /standardiser/break_bonds.py: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # 3 | # Copyright [2014] EMBL - European Bioinformatics Institute 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in 7 | # compliance with the License. You may obtain a copy of 8 | # the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software distributed under the 13 | # License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 14 | # either express or implied. See the License for the specific language governing permissions 15 | # and limitations under the License. 16 | # 17 | #################################################################################################### 18 | 19 | """ 20 | Break bonds to Group I and II metals 21 | """ 22 | 23 | #################################################################################################### 24 | 25 | from . import make_logger 26 | logger = make_logger.run(__name__) 27 | 28 | from collections import defaultdict 29 | 30 | from rdkit import Chem 31 | 32 | from .utils import StandardiseException, sanity_check 33 | 34 | #################################################################################################### 35 | 36 | # Module configuration... 
# Module configuration...

bonds_to_break = [("[Li,Na,K,Mg,Ca]-[#7,#8,#16]", 1), ("[Mg,Ca]=N", 2)] # SMARTS pattern, charge increment

####################################################################################################

# Module initialization...

# Replace each SMARTS string with its compiled query molecule, once, at import time.
bonds_to_break = [(Chem.MolFromSmarts(smarts), charge_incr) for smarts, charge_incr in bonds_to_break]

####################################################################################################

def run(mol):
    """Break covalent bonds between metals and heteroatoms, leaving ions.

    For each configured (pattern, charge increment) pair, every matched bond
    is removed; the first matched atom (the metal in the configured SMARTS)
    has its formal charge raised by the increment and the second matched
    atom has it lowered by the same amount. Patterns are applied in order,
    each against the result of the previous one.

    Returns a new molecule; the input molecule is not edited in place.

    Raises StandardiseException (re-raised unchanged) if `sanity_check`
    rejects the resulting molecule.
    """

    n_broken = 0

    for pattern, charge_incr in bonds_to_break:

        idx_pairs = mol.GetSubstructMatches(pattern)

        # Bonds can only be removed via an editable copy of the molecule.
        ed_mol = Chem.EditableMol(mol)

        for i, j in idx_pairs:

            ed_mol.RemoveBond(i, j)

            n_broken += 1

        mol = ed_mol.GetMol()

        # Only bonds were removed, so the atom indices from the match are
        # reused here to adjust the charges on the rebuilt molecule.
        for i, j in idx_pairs:

            atom = mol.GetAtomWithIdx(i)
            atom.SetFormalCharge(atom.GetFormalCharge() + charge_incr)

            atom = mol.GetAtomWithIdx(j)
            atom.SetFormalCharge(atom.GetFormalCharge() - charge_incr)

    try:

        sanity_check(mol)

    except StandardiseException:

        logger.debug("Molecule failed sanity check")

        raise

    logger.debug("Broke {n} bonds to Group I and II metals".format(n=n_broken))

    return mol

# run
# Copyright [2014] EMBL - European Bioinformatics Institute 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in 7 | # compliance with the License. You may obtain a copy of 8 | # the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software distributed under the 13 | # License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 14 | # either express or implied. See the License for the specific language governing permissions 15 | # and limitations under the License. 16 | # 17 | #################################################################################################### 18 | 19 | [OX2H1:1]-[C:2]=[NX2:3]>>[OH0:1]=[*:2]-[NH1:3] hydroxy imine -> carboxamide 20 | 21 | [OX2H1:1]-[c:2]:[nX2:3]>>[OH0:1]=[*:2]:[nH1:3] 2-hydroxy pyridine -> 2-pyridone 22 | 23 | [OX2H1:1]-[c:2]1:[c:3]:[a:4]:[nX2:5]:[a:6]:[a:7]:1>>[OH0:1]=[*:2]1:[*:3]:[*:4]:[nH1:5]:[*:6]:[*:7]:1 4-hydroxy pyridine -> 4-pyridone (within-ring) 24 | ### [OX2H1:1]-[c:2]:[c:3]:[a:4]:[nX2:5]>>[OH0:1]=[*:2]:[*:3]:[*:4]:[nH1:5] 4-hydroxy pyridine -> 4-pyridone (any) 25 | 26 | [nH!$(*c=O):1]:[a:2]:[nX2:3]:[c:4]=[O:5]>>[nH0:1]:[*:2]:[nH:3]:[*:4]=[O:5] 4-pyrimidone -> 2-pyrimidone (any) 27 | 28 | [NH1+0:1]=[c:2]:[nH!$(*c=O):3]>>[NH2:1]-[*:2]:[nH0:3] hydropyridin-2-imine -> 2-amino-pyridine 29 | [NH0+0:1]=[c:2]:[nH!$(*c=O):3]>>[NH1:1]-[*:2]:[nH0:3] hydropyridin-2-imine -> 2-amino-pyridine (N-subst.) 30 | 31 | [NH1+0:1]=[c:2]:[a:3]:,-[a:4]:[nH!$(*c=O):5]>>[NH2:1]-[*:2]:[*:3]:[*:4]:[nH0:5] hydropyridin-4-imine -> 4-amino-pyridine 32 | [NH0+0:1]=[c:2]:[a:3]:,-[a:4]:[nH!$(*c=O):5]>>[NH1:1]-[*:2]:[*:3]:[*:4]:[nH0:5] hydropyridin-4-imine -> 4-amino-pyridine (N-subst.) 
33 | 34 | [nH1r6!$(*1c(=O)aaaa1)!$(*1aac(=O)aa1):1]:[c:2]:[nX2r5:3]>>[nH0:1]:[*:2]:[nH1:3] Fix heterocyclic tautomer 1 35 | [nH1r6!$(*1c(=O)aaaa1)!$(*1aac(=O)aa1):1]:[a:2]:[c:3]:[nX2r5:4]>>[nH0:1]:[*:2]:[*:3]:[nH1:4] Fix heterocyclic tautomer 2 36 | [nH1r6!$(*1c(=O)aaaa1)!$(*1aac(=O)aa1):1]:[a:2]:[c:3]-[c:4]:[nX2r5:5]>>[nH0:1]:[*:2]:[*:3]:[*:4]:[nH1:5] Fix heterocyclic tautomer 3 37 | [nH1r6!$(*1c(=O)aaaa1)!$(*1aac(=O)aa1):1]:[c:2]-[c:3]:[nX2r5:4]>>[nH0:1]:[*:2][*:3]:[nH1:4] Fix heterocyclic tautomer 4 38 | 39 | [OH:1][C:2]=[CH0:3]>>[OH0:1]=[C:2][CH1:3] Enol -> Ketone 1 40 | [OH:1][C:2]=[CH1:3]>>[OH0:1]=[C:2][CH2:3] Enol -> Ketone 2 41 | 42 | [N,O,P,S;-1:1]-[A+0:2]=[N,O,P,S;+1:3]>>[*-0:1]=[*:2]-[*+0:3] Fix 1,3 charge-seperated systems (non-aromatic) 43 | [n,o,p,s;-1:1]:[a:2]=[N,O,P,S;+1:3]>>[*-0:1]:[*:2]-[*+0:3] Fix 1,3 charge-seperated systems (aromatic 1) 44 | [N,O,P,S;-1:1]-[a:2]:[n,o,p,s;+1:3]>>[*-0:1]=[*:2]:[*+0:3] Fix 1,3 charge-seperated systems (aromatic 2) 45 | 46 | [N,O,P,S;-1:1]-[A+0:2]=[A:3]-[A:4]=[N,O,P,S;+1:5]>>[*-0:1]=[*:2]-[*:3]=[*:4]-[*+0:5] Fix 1,5 charge-seperated systems (non-aromatic) 47 | [n,o,p,s;-1:1]:[a:2]:[a:3]:[c:4]=[N,O,P,S;+1:5]>>[*-0:1]:[*:2]:[*:3]:[c:4]-[*+0:5] Fix 1,5 charge-seperated systems (aromatic 1) 48 | [N,O,P,S;-1:1]-[c:2]:[a:3]:[a:4]:[n,o,p,s;+1:5]>>[*-0:1]=[c:2]:[*:3]:[*:4]:[*+0:5] Fix 1,5 charge-seperated systems (aromatic 2) 49 | 50 | [N,O;+0!H0:1]-[A:2]=[N!$(*[O-]),O;+1H0:3]>>[*+1:1]=[*:2]-[*+0:3] Fix 1,3 conjugated cation (non-aromatic) 51 | [n;+0!H0:1]:[c:2]=[N!$(*[O-]),O;+1H0:3]>>[*+1:1]:[*:2]-[*+0:3] Fix 1,3 conjugated cation (aromatic 1) 52 | ### [N,O;+0!H0:1]-[c:2]:[n!$(*[O-]),o;+1H0:3]>>[*+1:1]=[*:2]:[*+0:3] Fix 1,3 conjugated cation (aromatic 2) 53 | 54 | [N,O;+0!H0:1]-[A:2]=[A:3]-[A:4]=[N!$(*[O-]),O;+1H0:5]>>[*+1:1]=[*:2]-[*:3]=[*:4]-[*+0:5] Fix 1,5 conjugated cation (non-aromatic) 55 | [n;+0!H0:1]:[a:2]:[a:3]:[c:4]=[N!$(*[O-]),O;+1H0:5]>>[n+1:1]:[*:2]:[*:3]:[*:4]-[*+0:5] Fix 1,5 conjugated cation (aromatic 1) 
56 | ### [N,O;+0!H0:1]-[c:2]:[a:3]:[a:4]:[n!$(*[O-]),o;+1H0:5]>>[*+1:1]=[c:2]:[*:3]:[*:4]:[*+0:5] Fix 1,5 conjugated cation (aromatic 2) 57 | ### [n;+0!H0:1]:[a:2]:[a:3]:[a:4]:[n!$(*[O-]);+1H0:5]>>[n+1:1]:[*:2]:[*:3]:[*:4]:[n+0:5] Fix 1,5 conjugated cation (aromatic 3) 58 | 59 | [Sv4:1](=[O:2])[*:3]>>[S+:1](-[O-:2])[*:3] Charge-seperate sulphoxides 60 | [S+2:1]([O-:2])([O-:3])>>[S+0:1](=[O-0:2])(=[O-0:3]) Un-charge-seperate sulphones 61 | 62 | [NX3H$(*[CX4]):1]-[NX3H:2][CX4:5][NX2+:3]#[NX1:4]>>[N:1]=[NH+:2][C:5][N+0:3]=[NH:4] Fix hydrazine-diazonium system 63 | 64 | #################################################################################################### 65 | # End 66 | #################################################################################################### 67 | -------------------------------------------------------------------------------- /standardiser/data/rules_specific.dat: -------------------------------------------------------------------------------- 1 | [Nv3:1]1=[C:2][C:3](=[C:4]([OH:5])[OX2:6])[CX4:7][C:8]=[C:9]1>>[*:1]1[*:2]=[*:3](-[*:4](=[*:5])[*:6])[*:7][*:8]=[*:9]1 dihydro_pyridine 2 | -------------------------------------------------------------------------------- /standardiser/data/rules_tautomerism_and_aromaticity.dat: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # 3 | # Copyright [2014] EMBL - European Bioinformatics Institute 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in 7 | # compliance with the License. 
You may obtain a copy of 8 | # the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software distributed under the 13 | # License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 14 | # either express or implied. See the License for the specific language governing permissions 15 | # and limitations under the License. 16 | # 17 | #################################################################################################### 18 | 19 | # See notebook Tautomerism_and_aromaticity.ipynb 20 | 21 | [OH1:1][cr5:2][o,sv2,nv3X3:3][cH:4]>>[OH0:1]=[C:2][*:3][CH2:4] Non-aromatic 1 22 | [OH1:1][cr5:2][cH:4][o,sv2,nv3X3:3]>>[OH0:1]=[C:2][CH2:4][*:3] Non-aromatic 2 23 | [OH1:1][cr5:2]([o,sv2,nv3X3:3])[cH:4]>>[OH0:1]=[C:2]([*:3])[CH2:4] Non-aromatic 3 24 | [Nv3X3:1][cr5:2]([nH:3])[cH:4]>>[*:1][C:2]([CH2:3])=[NH0:4] Non-aromatic 4 25 | # 26 | #################################################################################################### 27 | # End 28 | #################################################################################################### 29 | -------------------------------------------------------------------------------- /standardiser/data/rules_to_be_used_with_caution.dat: -------------------------------------------------------------------------------- 1 | [OX2H1:1]-[c:2]:[c:3]:[a:4]:[nX2:5]>>[OH0:1]=[*:2]:[*:3]:[*:4]:[nH1:5] 4-hydroxy pyridine -> 4-pyridone (any) 2 | 3 | [N,O;+0!H0:1]-[c:2]:[n!$(*[O-]),o;+1H0:3]>>[*+1:1]=[*:2]:[*+0:3] Fix 1,3 conjugated cation (aromatic 2) 4 | 5 | [N,O;+0!H0:1]-[c:2]:[a:3]:[a:4]:[n!$(*[O-]),o;+1H0:5]>>[*+1:1]=[c:2]:[*:3]:[*:4]:[*+0:5] Fix 1,5 conjugated cation (aromatic 2) 6 | [n;+0!H0:1]:[a:2]:[a:3]:[a:4]:[n!$(*[O-]);+1H0:5]>>[n+1:1]:[*:2]:[*:3]:[*:4]:[n+0:5] Fix 1,5 conjugated cation (aromatic 3) 7 | -------------------------------------------------------------------------------- 
/standardiser/data/salts.tsv: -------------------------------------------------------------------------------- 1 | SMILES name 2 | CNCC(O)C(O)C(O)C(O)CO (di)meglumine 3 | CC(=O)O acetate uncharged 4 | CC(=O)C acetone 5 | CC(=O)NCC(=O)O aceturate uncharged 6 | CCCCCCCCCCCCCCCCCC(=O)O acistrate or stearate 7 | OC(=O)CCCCC(=O)O Adipate 8 | [Al] Aluminium 9 | N ammonia or ammonium 10 | OCC(O)C1OC(=O)C(=C1O)O ascorbate 11 | NC(CC(=O)O)C(=O)O Aspartate 12 | [Ba] Barium 13 | C(Cc1ccccc1)NCc2ccccc2 benethamine 14 | C(CNCc1ccccc1)NCc2ccccc2 benzathine 15 | OC(=O)c1ccccc1 benzoate 16 | OS(=O)(=O)c1ccccc1 besilate or besylate 17 | [Bi] bismuth 18 | Br Bromide 19 | CCCC=O butyraldehyde 20 | CCCC(=O)OCC butyrate 21 | [Ca] Calcium 22 | CC1(C)C2CCC1(CS(=O)(=O)O)C(=O)C2 camsilate or camsylate uncharged 23 | OC(=O)O carbonate 24 | Cl Chloride 25 | OCC[N+](C)(C)C choline 26 | OC(=O)CC(O)(CC(=O)O)C(=O)O citrate 27 | OS(=O)(=O)c1ccc(Cl)cc1 closylate 28 | OS(=O)(=O)NC1CCCCC1 cyclamate 29 | OC(=O)C(Cl)Cl dichloroacetate 30 | CCNCC diethylamine 31 | CC(C)(N)CO dimethylethanolamine 32 | CS(=O)C dimethylsulphoxide 33 | OCCNCCO diolamine 34 | NCCN edamine 35 | OS(=O)(=O)CCS(=O)(=O)O edisylate 36 | OCCN1CCCC1 Epolamine 37 | CC(C)(C)N erbumine 38 | CCCCCCCCCCCCOS(=O)(=O)O estolate or laurylsulfate 39 | CCS(=O)(=O)O esylate uncharged 40 | CCO ethanol 41 | CCOS(=O)(=O)O ethylsulfate no charge 42 | F Fluoride 43 | C(=O)O formic acid 44 | OC(=O)C=CC(=O)O fumarate 45 | OCC(O)C(O)C(O)C(O)C(O)C(=O)O gluceptate 46 | OCC(O)C(O)C(O)C(O)C(=O)O gluconate 47 | OC1OC(C(O)C(O)C1O)C(=O)O glucuronate 48 | NC(CCC(=O)O)C(=O)O glutamate 49 | OCC(O)CO glycerate 50 | OCC(O)COP(=O)(O)O glycerophosphate 51 | OC(=O)CNC(=O)c1ccccc1 hippurate charged 52 | OP=O hypophosphite uncharged 53 | OC(C=O)C(O)C(O)C(O)C(=O)O incorrect glucuronate 54 | OC(=O)c5c(O)cc6ccccc6c5Cc7c(C(=O)O)c(O)cc8ccccc78 incorrect pamoate 55 | OC1OC(C(O)C(O)C1O)C(O)=O incorrect ring gluconate 56 | OC(C(O)C(=O)O)C(=O)O incorrect tartrate uncharged no 
stereo 57 | I Iodide 58 | OCCS(=O)(=O)O isethionate 59 | [K] Potassium 60 | CC(O)C(=O)O lactate uncharged 61 | OCC(O)C(OC1OC(CO)C(O)C(O)C1O)C(O)C(O)C(=O)O lactobionate 62 | [Li] Lithium 63 | NCCCCC(N)C(=O)O lysine 64 | OC(CC(=O)O)C(=O)O malate 65 | OC(=O)C=CC(=O)O maleate 66 | CS(=O)(=O)O mesylate 67 | OP(=O)=O metaphosphate uncharged 68 | COS(=O)(=O)O methosulfate 69 | O[Mg+] Mg monohydroxide (looks odd but OK in drug) 70 | [Mg] Magnesium 71 | OP(F)(O)=O monofluorophosphate 72 | [Na] Sodium 73 | OS(=O)(=O)c1cccc2c(cccc12)S(=O)(=O)O napadisilate 74 | OS(=O)(=O)c1ccc2ccccc2c1 napsylate 75 | O[N+](=O)O nitrate uncharged 76 | NCCO olamine or ethanolamide 77 | OC(=O)C(=O)O oxalate 78 | CCCCCCCCCCCCCCCC(=O)O palmitate 79 | OC(=O)c1cc2ccccc2c(Cc3c(O)c(cc4ccccc34)C(=O)O)c1O pamoate 80 | OCl(=O)(=O)=O perchlorate uncharged 81 | Nc1ccc(cc1)P(=O)(O)O phosphanilate 82 | OP(=O)(O)O phosphate 83 | Oc1c(cc(cc1[N+](=O)[O-])[N+](=O)[O-])[N+](=O)[O-] picrate 84 | C1CNCCN1 piperazine 85 | CC(O)CO propylene glycol 86 | OC(C(O)C(O)C(=O)O)C(O)C(=O)O Saccharate 87 | O=C1NS(=O)(=O)c2ccccc12 saccharin 88 | OC(=O)c1ccccc1O salicylate uncharged 89 | [Ag] Silver 90 | [Sr] Strontium 91 | OC(=O)CCC(=O)O succinate 92 | OS(=O)(=O)O sulfate 93 | c1ccc(C(=O)O)c(O)c1S(=O)(=O)O sulfosalicylate 94 | S sulphide 95 | OC(=O)c1ccc(cc1)C(=O)O terephthalate 96 | Cc1ccc(cc1)S(=O)(=O)O tosilate or tosylate uncharged 97 | Oc1cc(Cl)c(Cl)cc1Cl triclofenate 98 | CCN(CC)CC triethylamine 99 | OC(=O)C(c1ccccc1)(c2ccccc2)c3ccccc3 trifenatate 100 | OC(=O)C(F)(F)F triflutate 101 | NC(CO)(CO)CO tromethamine 102 | CCCCC1CCC(CC1)C(=O)O U_buciclate 103 | CCCC(=O)O U_butyrate 104 | CCCCCC(=O)O U_caproate 105 | CC12C=CC(C(=O)O)(CC2)CC1 U_cyclotate 106 | C1CCCC1CCC(=O)O U_cypionate 107 | CN(C)CCC(=O)O U_daproate 108 | OC(=O)CN(CC(=O)O)CCN(CC(=O)O)CC(=O)O U_EDTA 109 | OC(=O)CCCCCCCC=CCCCCCCCC U_elaidate 110 | CCCCCCC(=O)O U_enanthate 111 | CCOC(=O)O U_etabonate 112 | COCCO U_ethanediol 113 | c1ccccc1C(=O)NCC(=O)O U_etiprate 
114 | OCOC(=O)C(CC)CC U_etzadroxil 115 | CCCCCCCCCCCCCCOP(=O)(O)O U_fostedate 116 | OC(=O)c1ccco1 U_furoate 117 | c1cc(O)ccc1C(=O)c2ccccc2C(=O)O U_hybenzate 118 | CCCCCCCCCCCC(=O)O U_laurate 119 | CC=C(C)C(=O)O U_mebutate 120 | CC(O)(C)CCCC(O)(C(=O)O)CC(=O)OC U_mepesuccinate 121 | OC(=O)c1cccc(c1)S(=O)(=O)O U_metazoate 122 | CSCCC(N)C(=O)C U_methionil 123 | c1ccncc1C(=O)O U_nicotinate 124 | CCCCCCCCC=CCCCCCCCC(=O)O U_oleate 125 | OO U_peroxide 126 | c1ccccc1CCC(=O)O U_phenpropionate 127 | c1ccccc1CC(=O)O U_phenylacetate 128 | CC(C)(C)C(=O)O U_pivalate 129 | CCC(=O)O U_propionate 130 | CC(C)(C)CC(=O)O U_tebutate 131 | OCCN(CCO)CCO U_trolamine 132 | CCCCCCCCCCC(=O)O U_undecylate 133 | OC(=O)CCCCCCCCC=C undecylenate 134 | CCCCC(=O)O valerate uncharged 135 | O water or hydroxide 136 | OC(=O)c1ccc2ccccc2c1O xinafoate 137 | [Zn] Zinc 138 | [N+](=O)([O-])O nitrate 139 | OS(=O)(=O)C(F)(F)F trifluorosulphonate 140 | O[Cl+3](O)(O)O perchlorate_rdkit 141 | c1nnn[nH]1 tetrazole 142 | 143 | CN methyl-amine 144 | C1CCCCC1N cyclohexyl-amine 145 | -------------------------------------------------------------------------------- /standardiser/docs/00_Introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# `standardiser`\n", 8 | "\n", 9 | "This is the first publically-released version of a tool designed to provide a simple way of standardising molecules as a prelude to _e.g._ molecular modelling exercises. 
A Python module is provided that performs the complete standardisation procedure; in addition, the modules that implement the individual steps may be used seperately if required, perhaps as part of a custom standardisation pipeline.\n", 10 | "\n", 11 | "The tool is open-source and is available from [GitHub](https://github.com/flatkinson/standardiser).\n", 12 | "\n", 13 | "A slide-set describing some of the background to the project is shown below...\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/html": [ 26 | "" 27 | ], 28 | "text/plain": [ 29 | "" 30 | ] 31 | }, 32 | "execution_count": 1, 33 | "metadata": {}, 34 | "output_type": "execute_result" 35 | } 36 | ], 37 | "source": [ 38 | "import IPython; IPython.display.HTML('')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "In summary, the general procedure for standardising a molecule (with the documentation for the appropriate module linked) is...\n", 46 | "\n", 47 | "* Break bonds to Group I or II metals [[**`break_bonds`**](01_break_bonds.ipynb)]\n", 48 | "\n", 49 | "* Neutralize charges by adding/removing protons [[**`neutralise`**](02_neutralise.ipynb)]\n", 50 | "\n", 51 | "* Apply standardization rules [[**`rules`**](03_rules.ipynb)]\n", 52 | "\n", 53 | "* Re-run neutralisation (in case any charges are exposed by rules)\n", 54 | "\n", 55 | "* Discard any salt/solvate components [[**`unsalt`**](04_unsalt.ipynb)]\n", 56 | "\n", 57 | "* Return standardized parent\n", 58 | "\n", 59 | "The complete procedure is implemented by the [**`standardise`**](05_standardise.ipynb) module; a bare-bones alternative workflow using the individual modules is shown [here](06_alternative.ipynb).\n", 60 | "\n", 61 | "The documentantion is contained in the project **`docs/`** directory, and consists of a set of [IPython 
Notebooks](http://ipython.org/notebook.html), which can be viewed (and run and edited) by starting a notebook server in that directory. Alternatively, the notebooks have been exported as HTML pages, which can be viewed by pointing a browser at the **`docs/html/`** directory.\n", 62 | "\n", 63 | "A simple command-line driver program **`standardiser.py`** is available in the project **`bin/`** directory. It take SD or SMILES as input, and writes out a file containing those structures that have been successfuly standardised and one containing structures for which the procedure has failed.\n", 64 | "\n", 65 | "In the project **`test/`** directory are examples of running **`standardiser.py`** on structures from taken from the PubChem and EPA ACToR databases." 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "### Further work\n", 73 | "\n", 74 | "* Tidy up the code\n", 75 | "\n", 76 | "\n", 77 | "* Proper installer\n", 78 | " \n", 79 | " \n", 80 | "* Proper documentation\n", 81 | " - Hopefully the notebooks serve to show how things work, but more Pythonic documentation is still needed\n", 82 | "\n", 83 | "\n", 84 | "* Improve the rule set, neutralisation algorithm and salt dictionary\n", 85 | "\n", 86 | "\n", 87 | "* Better (optional) logging of what each module has done to a molecule...\n", 88 | " - Verbose logging may be turned on, but this output can't (easily) be used programmatically\n", 89 | " - The **`rules`** module can return a list of what rules have been applied\n", 90 | " - Other modules cannot do anything equivalent as yet." 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "### Acknowledgements\n", 98 | "\n", 99 | "* This work was funded by the IMI eTOX project.\n", 100 | "\n", 101 | "\n", 102 | "* The salt dictionary used is based on that used in the ChEMBL database; this was compiled by L.J. Bellis, A. 
Hersey and others and was in turn was based on that used in [USAN](http://www.ama-assn.org//ama/pub/physician-resources/medical-science/united-states-adopted-names-council/naming-guidelines/organic-radicals-counterions-solvent-molecules-used.page) nomenclature.\n", 103 | "\n", 104 | "\n", 105 | "* Some of the standardisation rules were inspired by those used in the [InChI](http://www.inchi-trust.org/home) software.\n", 106 | "\n", 107 | "\n", 108 | "* This project is built using the [RDKit](http://www.rdkit.org/) chemistry toolkit. " 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "

\n", 116 | "### Licensing\n", 117 | "\n", 118 | "This code is released under the [Apache 2.0](http://opensource.org/licenses/apache2.0.php) license. Copyright [2014] is retained by the [EMBL-EBI](http://www.ebi.ac.uk)." 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "### Contact details\n", 126 | "\n", 127 | "Please send bug reports and suggestions for improvements to Francis Atkinson." 128 | ] 129 | } 130 | ], 131 | "metadata": { 132 | "kernelspec": { 133 | "display_name": "Python 3", 134 | "language": "python", 135 | "name": "python3" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.5.1" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 0 152 | } 153 | -------------------------------------------------------------------------------- /standardiser/docs/01_break_bonds.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%run notebook_setup.py\n", 12 | "\n", 13 | "sys.path.append('../..')" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": false, 21 | "slideshow": { 22 | "slide_type": "-" 23 | } 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "from standardiser import break_bonds" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "break_bonds.logger.setLevel('DEBUG')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "slideshow": { 45 | "slide_type": 
"slide" 46 | } 47 | }, 48 | "source": [ 49 | "# `break_bonds`: break bonds to Group I and II metal atoms" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "### Introduction\n", 57 | "\n", 58 | "The **`break_bonds`** module beaks covalent bonds between oxygen or nitrogen atoms and Group I and II metal atoms as a prelude to neutralization and application of the standardization rules." 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "### Examples" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": { 72 | "collapsed": false, 73 | "slideshow": { 74 | "slide_type": "slide" 75 | } 76 | }, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAFtUlEQVR4nO3d0XKbSBRFUZia//9l\n5kFjRRYSQRwQfZu1Kg9OVVLGtnrrIlp4nKZpAGCrf84+AIDaZBQgIqMAERkFiMgoQERGASIyChCR\nUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCI\njAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJE\nZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQg\n8u/ZBwB9Gsfx/vE0TSceCUeTUYg85vLRYzrHcVTSjvnpwiprcrn83621XplG4dnLYoog73iG5LrC\nAXPDp7PcuuTnyrWce+VHSbvkpJ7LOTFk0zQpaX/sG+VCJIwjyCjs792rrsPPQPrNg+FoMspVfHMU\nXW6lknZGRuEQSnodMgoQkVEu4d0Z/aEjoYH0ImSUYRiGYRz//GE/SnoF9n8wDOM4PD4Mnv5a38Io\n+p3H//Insg2rOtPo5c2jOU1m0n2ZSfvmXUwXc1+ulxl/Th9F6Z6M9u5pzBGOkyy/DdSbREuT0X78\nuunGn4+szFYoaa9ktAnz9bNmRT29oGYFzrV2Rq+kXZLRVmxYP/ust9sFpa6v1DdFK/sjo61YWF33\nqfOotfd0ab6XFd7aKLqGyFYko617XFQHLjDr9ouc2nfGvtGGvNw/aDl1yU7SnshoW16un/HHKYdU\nVPtn9FrZDRlt3W3Z35x4EKd96qsS2UJktDktrp9qbw9tZ+Rc5tS+DzLaoscE3NbSjXUVajCvStqB\n5h5VtKvIftKFVjaY0ZuKx8ydaZTVqp3aP2m5RwtTp4G0fTLKJ4qXtCglbZyM0pWWR85lWlmXjPKh\nmgNpiby+K2mJg78yGeVzrZa0g9xUP/5rklH6VzqvpQ/+ImSUTbyQBz9klI1auyTS5dTW5RfVHxml\nc0rE0WSU7doZSLWSE8kokXZK+lLpvJY++EuRUVKnl1RuOJdfIkJVa9pdN6+eGwqRUXZw6K8PWrhn\nx+O/ER3OIqPsIy/pmlwe99lhMxllH/e7Sq9p2ctiiuCdp4Ra
ZJQDJQPmpwyknMXDjh089usxnd9/\ndHVQ0g6+hKsxjbKzc6/8mEn5PvtGSbWWrdP3sSZa+2ayhoxyFEXgImSUDpUeSClHRom8GzlPH0Ur\nlvT0bxrbyCjdqlhSKpJRelaopEbRumSU7Zo9o4dvklE6V2ggpSgZZaNCI2f7JS30zWRORtno3aJv\nswjtl5S6ZJRNxnFor5XLmi1pm088rOc99exJEV4693YtHE1G+dz7UbTxQhx945INNwb0xNMBGWU/\nFc70dynpN++jSvtklA9VaOWy8C79g1zym4yyk1J5XSjpN1/HdEbfBxnlE6VauY2u8SkbnthDwbye\nvv/JKNoN0yirFWzlslN+40ibe1dJyCixynk9rqQuT12HjLJO5VYuy0v6sphyeR0ySqbfvL6kmMzJ\nKOt0XYr5QOqUnPVcKyTQ1yjqne9sYxqF/0kn29g3SkB3QEYBQjIKEJFRgIiMAkRkFCBiwxN/87gR\n3aV5mJFRFj1tsO9rvz3swkk9782jOU2D+7zBbzIKEJFRgIiMAkRkFCAio7w3v6DkSj3M2PDEoqeS\naijMyCh/I52wyEk9QERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQg\nIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMA\nERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkF\niMgoQERGASIyChCRUYDIfx2hxnunQNIvAAAAAElFTkSuQmCC\n", 81 | "text/plain": [ 82 | "" 83 | ] 84 | }, 85 | "execution_count": 4, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "mol = Chem.MolFromSmiles(\"[Na]OC(=O)c1ccccc1\")\n", 92 | "mol" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 5, 98 | "metadata": { 99 | "collapsed": false, 100 | "slideshow": { 101 | "slide_type": "fragment" 102 | } 103 | }, 104 | "outputs": [ 105 | { 106 | "name": "stderr", 107 | "output_type": "stream", 108 | "text": [ 109 | "[18/05/16 13:18:24 standardiser.break_bonds DEBUG] Broke 1 bonds to Group I and II metals\n" 110 | ] 111 | }, 112 | { 113 | "data": { 114 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAFPklEQVR4nO3dwVbiShiFUXKX7//K\nuYNoiwEiegz1V9Xeo9ZJxyz4ODEuWNZ1vQDwW/+1PgCAvskoQERGASIyChCRUYCIjAJEZBQgIqMA\nERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkF\niMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgo\nQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIySnvLsnz7HShLRilBN+mXjFLCuq6PSrp8\nePEhwZNklNKWZVk/7EoqrBQho1Rxd5Cu69rkYOB5b60PAD5tJd2l8+AG1PYPqaUtGaW066r+q+f2\nndvgQhMu6qnl4F4T1CSjlHO9MbeqbnaFNUUpwmURQMQaBYjIKEBERgEiMgoQkVGAiIwCRGQUICKj\nABEZBYjIKEBERgEiMgoQkVGAiIwCRGQUIOJDROBk12/m7+19RySjcKZl+ZLO3ZcMQUbZa/JRSGN+\nCsNtNNdVSccjo3zR6uM2fcwn/XKLCSAio3xqOAl9rjL9klGAiIzyrvlvJwccpNsNpWvuL43ILSY4\n066kGjoit0e5XApM0X/qHAk8yUOWo3KdepV98J/2+rB0zT4lF/Uc6TVn8EJuMc2u4PTr9V6TKTor\nGQWIyOjUCk7RTX+D9GCK9vWD8HMyCmdypT8BGZ1X2Sm66WmQauXcZHRSxRu66amkd8nrHGQUMlo5\nPRmdURdTdNPxIJXXacgoBLQSGZ1QR1N00+UgldeZyOhcumvopmhJH7VSQycjo/Abi1byQUYn0ukU\n3RQdpLfkdT4yCj/W9QsSf05GZzHAM7/IID06k6bolGQU/obflk5LRqcwwBTdNB+kw5xJ/pCMjm+w\nZ37zkt412EnmR2QUnqWV3CWjgxvymd9kkB5/8N94J5nnyShAREZHNvBKevEgNUU5IKPwPaHkgBfS\nYT1aST2up4Of5fUHc30kPZ5M/txb6wPgLNtl7+2T/NH3y6r2etDX2eMFXNTDL+kpGxkd2aP7MDX/\ngv2ualP0cnX2NJSNjAJEZHRwXQ/SglN008XZ42VklKKatxKeJKPj63qQ3iqS107PHmeQ0Sl0V9Ky\nl/PXyp49XkxGKadUK+FbMjqL7gbprYJ57ejscR4ZpZaCrYRjMjqRrgdp2bx2cfY4lYxSSNlWHlvX\n9aKkE5PRuVQepN7Tk07J6HQql/SuPhpqkE5MRimhj1YeU9JZyeiMOhqkI+SV0cko7Y3TSoN0SjI6\nqTqD1J0leiej86pT0rt6bahBOh8ZpaVeW3lMSScjo1NrO0hdzjMGGYUTGKQzkdHZtRqk40/RAX4E\nniOjABEZpcEgHX+KMhMPWd5V6FeFY4CfskYBIjLKu+Z/dW+K0ikZBYjIKJ8aDlJTlH69tT4AmMD1\ni5NXi+GYAOw1GaQjPw6X5Us6d1/SPxmFM92NppKOxe9GASIyChBxiwna2f0a2pV+n2QU2tHNIbio\nhzPdvvGo+0vDsUbhZLuSauhwZBTOJ51Dc1EPEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFR\ngIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiM\nAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRk\nFCAiowARGQWI/A/7VTilpzMmzgAAAABJRU5ErkJggg==\n", 115 | "text/plain": [ 116 | "" 117 | ] 118 | }, 119 | "execution_count": 5, 
120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "break_bonds.apply(mol)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 6, 131 | "metadata": { 132 | "collapsed": false, 133 | "slideshow": { 134 | "slide_type": "slide" 135 | } 136 | }, 137 | "outputs": [ 138 | { 139 | "data": { 140 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAH/UlEQVR4nO3d3XajxhKAUXFW3v+V\nORdObFmy+Cvormr2XrmYWZNksNR8dAsE0zzPDwCO+l/vDQCoTUYBQmQUIERGAUJkFCBERgFCZBQg\nREYBQmQUIERGAUJkFCBERgFCZBQgREYBQmQUIERGAUJkFCBERgFCZBQgREYBQmQUIERGAUJkFCBE\nRgFCZBQgREYBQmQUIERGAUL+6b0BcA/T9PPree63HZxPRuF60/QrnS+/pTiLerjYezTn+dfklOLM\nRuFU330037wNGYUwn3vem4zCbtM0PR6Pn15K573JKKybfn+UOesmT2SUe3kO4moNv//lUDe/Tig5\nUz+uyXGV+5imXwP+5bdX/90/v7bTjUVGuYvlaJ4z8eSWLOrhV2GbTlEZgowypl2zy/7d9GlpZTLK\nIIIn06e+Xyt6Pw1FHTJKWdP0eDy+4xeZUb4s6qMbdoySliWjlPJ2vnt7deZ57nmmfgslrSnZMOK2\nlq8HOu+L6p+uG33+LLVzXpW0GhklAVenv/AKlOJGefTmPnIUJ6OQjwNJKTIKKSlpHTIKWSlpES54\ngsRaXjawcDmE+6oscqaeBJypX3RVSV+mup/+Cu/OGhklB/OdReeUdGM3X/6TPyen3qMnFvXkYLe8\ngIedtCGjUMDG71Z52EkXMgo1fCrprseicAUZJQGftW3zXFK368/DKSYSkNE9rr0E6v2yJ2fq15iN\n0pvdsq/V0/cv3wLwZr2RUSgmeiu/A5c9Secii3q6MhU9antJXfZ0NbPRGpxPYDuXPTUmoxm9Pw7I\n43959v40lOc/NUIak9EUtu8G/R9xcSIr+hhrlCRktIOFyeYWQ5WUAGMgCRlt4fQ1l5Lembc+Gxm9\nhM+q1lnRMwoZPUFwkX6MCSkkYT88rtkH/Au5rFpSU9Gjqr7jQ/MsppB5nptNPPf+EdCGjB7UeFIw\nWEmLbS4sktEyhimpZelhXrqcZHQQtUoKI5HRI3pNCpZbqaTQhYyeqUHFqpfUsvQwL11aMlpP9ZLC\nYGR0t0+TgpaTBa2EPGS0qmFO3LOFFX1mMlpYuZJqAUOS0X0yrOiflSvpo8mJOGhJRkeWs6Q5tyoz\ns/jkZLS8Kifun1uQZ6sgTkZ3SDspSF7SaZpEk4El7UJO2T4Y3bUZzTZy+91Xk7xuyXmV8nPb5nEs\n38j5uts8L9/qf3WarBFUJ6NbldjhG4Rp763+e8UdmjGCt0q+on927t3yT3muVJIPHMrxypRgNjqg\nhSneltnfFc/jM+tkYDK6yUgJeClas+fxBeMOaRm7mxRa0X9bXUd//aLx9i9tlefc/ZZ5dPHMdaPD\nWr1ctM3z+HaY54fLSylIRtfVnRR8KmnHn2gl7kpKQTJ6XIm8JtxCJd2ixOjii4zSgZIyEhldMd6k\nIMlPNC+3UkmpQ0YPShKj2rTyA6OrFhmlq4WS3jKy7oZVkcvvl4w3KSj2E32VtNAG77Tw3Ydi79S9\nySi9LbdyrJJu/6Ktb3YVIqNHGN8nG7ekkRsUKGkVMvrReCM49U+0WtIKmt2
ggFRklDQKzjqvuBvW\ny/8w9cGPx+MhowcY1rfVZbKppPnJ6N/GG7g1fqKUE9Jed8P6pqTJyehuRvO1Upa0wZu+HGslzUxG\n/2bU9pTpZb9uGOz9XNWYTEtGPxpp1A7zg5QWPx810pgciYwuMWqJuPo8PknIKHy09yDaoJsO7QnJ\n6AqjNpfnTiV4U7rMN43JbGR0XfVRW3rjf3k5g9/vhL5LoHjmRnmbrD4ejsu9R/PiO+ktdGr+z3V/\n+ypjMg8Z3cqoJRtjMgkZ3aHiqLX0G1vFMTkeGd3HqL2JQocfY7I7GQUIkdHdCh38C02p1r2fUMr3\n1fteCo3JIcnoEUZtH18l/f7nsoZWPPwYkx3J6EFGbR/z/PMPvxmTvcjocRlG7fSfP/+o3JSKoAxj\n8oZ8iymk/ZdJ3O3iatUPP77g1J6MRl09anWTvZS0MRk9wbmj9pRu2ougGRntz3wzj2EOPyakLcno\nOXaNWt2kASVtRkZPszpqm91dzc7DFyVtQ0bPtDxqWz5dkgNKF6f0xlcnoydrefx/j6YdiWfa2oaM\nnu+6kvpQlT/JZV8yeomzSqqbzSgRh8loIhbpnMiBoRkZvcqWCanJJnFy2Z2MXuilpCabaSkRETJ6\nrec77thRacaBoSUZvZzRzHXkMgP3G+XulIggGYXRODA0JqPwhxJfq5XLJGSUW/tUIk/jYDsZhb8V\nLakpansyyq0ttzJzSeUyDxnl7jK3khJkFJZKmjOyn6aipqhdyCisyFlS8pBReDwqf0hKdzIK/6pS\nUiv6bGQUflQpKanIKPyipOwlo/Aqc0mt6BOSUfiDWSfbySjsJrI8k1H4W8KlvRV9TjIKHyUsKQnJ\nKCxRUlbJKKxIUlIr+rRkFNYlKSk5yShs0r2kppxpyShslXDWaUWfgYzCORJGljZkFHbovrQnIRmF\nffKU1Io+CRmF3VZLevUGTNNk2puHoxkc1Gwy+F5Mu20qMgrHXVTSl27aSZOTUQiJl9Rks7p/em8A\n3I7J5mDMRiFqdUKqm2OTUTjBc0kt0u9GRuEc3/W0T92NjAKEuPweIERGAUJkFCBERgFCZBQgREYB\nQmQUIERGAUJkFCBERgFCZBQgREYBQmQUIERGAUJkFCBERgFCZBQgREYBQmQUIERGAUJkFCBERgFC\nZBQgREYBQmQUIERGAUJkFCBERgFCZBQgREYBQv4PDJOqN7zNr20AAAAASUVORK5CYII=\n", 141 | "text/plain": [ 142 | "" 143 | ] 144 | }, 145 | "execution_count": 6, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "mol = Chem.MolFromSmiles(\"c1ccccc1C(=O)O[Ca]OC(=O)c1ccccc1\")\n", 152 | "mol" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 7, 158 | "metadata": { 159 | "collapsed": false, 160 | "slideshow": { 161 | "slide_type": "fragment" 162 | } 163 | }, 164 | "outputs": [ 165 | { 166 | "name": "stderr", 167 | "output_type": "stream", 168 | "text": [ 169 | "[18/05/16 13:18:24 standardiser.break_bonds DEBUG] Broke 2 bonds to Group I and II metals\n" 170 | ] 171 | }, 172 | { 173 | "data": { 174 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAHkklEQVR4nO3d0XLiRhCGUZTa939l\ncqHKBiNGFgzSdPecU7nIpiq2UMznX4jYy/1+vwHwqX9GHwBAbjIK0EVGAbrIKEAXGQXoIqMAXWQU\noIuMAnSRUYAuMgrQRUYBusgoQBcZBegiowBdZBSgi4wCdJFRgC4yCtDlz+gDgMyW5f+/9/t4ZiWj\nE1ken/NXqfzLvpblRzqf/sg0ZHQWy7IMKdqoz3u6bTTvdyWdk9dGAbrI6BQGTsL7/T7kxQS4jIwC\ndJHR+oa/OmmQUpuMwkfWG0qP3F+aVdG7qPxn+BT9K86RfJP3jeINT9M69Sq7YC5b5nmktFUcCPwn\n2gCMdjzvcc1Og9dGywrYrMT3mjSUNhkF6CKjNQWcoquUg7Q1RdM9EM4ho7Brp6Ehv1FxPRktKOwU\nXaUcpNAmo9BminKAjFYTfIquDFIqkVHGSFBSU5RjZLSUFFM0B63kMBmtI11DEwzSLXllQ0Zhw+U8\n75DRItJN0VXAQbpoJW+SUThGXmmQ0QqSTtFVqEGa+kwyiozCAaYobTKaXoEBFWSQts6kV0vZJ6O5\nFWjoKkhJ4QMyCrfb7hSt8Y2K88hoYsWe4QMHabEzycVkFJrklSNkNKuSz/Ahg7TkmeRKMgqvySsH\nyWhKhZ/hFw/SwmeSy8hoHRnfMDT8mFsNlVeO+zP6AHhb6xm+7rhET/79B3L98dzaVYUdMppPuly+\na+CjWz917dPL17moLyXR/wsUOVVhD4yYZDSlRLl819i8Fj6xnEdGq0kRgshTFN4lo1nt5DJFSV+K\nkNe8Z49RZJSrRWglfJGMJlZskMbJa8azx0Aymlu6J7yfR0c9MlpWwMImamXAs0dYMppegUv7RHmF\nLRnlIulameWbEMPJaAWpB2nkvN7v91vss0cEMsoVIrcSOsloEUkHaYK8GqT8RkbrCJtLb3KiNhmd\nQtjC5mCQsktGSwl4aW+KUp6McqI6rTRIaZPRagIO0q2UeVVSGmS0oCC5dDnPJGR0LkEKm5VByisy\nWtPwS3tTlHnIKLzDIGVDRssaOEiLT9ECD4GvklGALjJa2ZBBWnyKwoav7Poi9CvCMcBJrFGALjJa\n3/D3ipqi1CajAF3MhFmMmoTFp+jjzC/8MNn1Z/QBQFrL8iOdT39kGqWXAj8NeYW07BfYy2gq6ZSs\n0YmULRoM5RYTQBdrFE7w9PqJ64DSZBROoJszcVEPH9n+xDz3l2ZljcKnnkqqobOSUeggnbioB+gk\nowBdZBSgi4wCdJFRgC4yCtBFRgG6yChAFxkF6CKjAF1kFKCLjAJ0kVGALjIK0EVGAbr4eaPQwY9t\nRkbhc0+/NcQvEZmVjE5kefrdQZe4Vy3LNprr7xSp+nhpk9FZLMsypGijPi9cxi0mgC7W6BQGTsL7\n/T7jIH16/WS2hz8ZGYUT6OZMXNTXN3wMroN04AGc4umX1N/cqZ+XNQqfeiqphs7KGi1uZ4qetBBf\nftiag/R2u93v///FrGS0sv2GnnSl3ypm2ZIyPRkF6CKjZQ2Zoqvig7TAQ+CrZBTe4XY8GzJa08Ap\nuio+SOGBjM5l+HtIczNFeUVGCwrSSoOUSchoNcMv549IWVJTlAYZ5UQpcwlvktFSAk7RIpf2piht\nMjqFOJfzKWkou2S0jrCtLDJIoUFGiwh4OX9EgpKaovxGRrlCglzCp2S0gqRTdBW6sKYoB8goFwmd\ny1cWDeUYGU0v9RRdpSssPJLRsgI2NFEuA549wpLR3NI92735iXpkNLECl/OP4pQ049ljIBnlanFy\nCV8ho1kVm6KrCIXNe/YYRUarSVGBCLmEb5HRlFK08jNjC1v4xHKeP6MPgG9KVIE1lzGP9qQDe/z2\n8NnH//sRYp63acloPmHr8y0DB+l6Ys9I/NMH/ODjP/4r5b8Gc
pHRfFpP8nRPrTIP5FfbR/SU1O0/\n3Cp2Tirx2mgdGZ9mw4+5NXsvW8RrYVdPn7F1APW+zWQnoykVvtN9cSOGn8meS3uCcFEPr112E2zb\n8b//ZP0bL4kGZ41mNXxGnWFIJiK8xWr1eEh/b3ZpaHwyCk3fKuzx1z3JyPe33CotlLGPpfXZv3hU\nrfeNPt6p3/9/fB//WOa/ewF1noRzktELDmD4gRGci/rcyrxCKlXkJaPpFShpkIbu/EjpW/IzzKlk\nFA5QUtpktILUgzTIFF2lPpOMIqNwjEFKg4wWkXRGhZqiK6+E8i4ZhY1WSRWWV2S0jnSDNOAU/Z2S\nsiGjjBG9oXLJYTJaSrpBGppLe46R0WpSlDT6FIV3yCi0GaQcIKMFBR+kpijFyCjsMkj5jYzWFHaQ\nppyiOyUFGQXoJKNlBRykKafoyiU8bWm/rDlm5ye6n/dJ/Qx5puIXLE9K0eBbXNQXF+TS3hSlMBkF\n6CKj9Q0fpKYotckoQBczYRajJmHxKfo48ws/THa5Uw+fWpYf6Xz6I9MovRT4acgrpGW/wF5GU0mn\nZI1OpGzRYCi3mAC6yChAFxkF6CKj8JHtz3xyf2lWbjHBp55KqqGzklHoIJ24qAfoJKMAXWQUoIuM\nAnSRUYAuMgrQRUYBusgoQBcZBegiowBdZBSgi4wCdJFRgC7/AlBcbYfvIz2hAAAAAElFTkSuQmCC\n", 175 | "text/plain": [ 176 | "" 177 | ] 178 | }, 179 | "execution_count": 7, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "break_bonds.apply(mol)" 186 | ] 187 | } 188 | ], 189 | "metadata": { 190 | "kernelspec": { 191 | "display_name": "Python 3", 192 | "language": "python", 193 | "name": "python3" 194 | }, 195 | "language_info": { 196 | "codemirror_mode": { 197 | "name": "ipython", 198 | "version": 3 199 | }, 200 | "file_extension": ".py", 201 | "mimetype": "text/x-python", 202 | "name": "python", 203 | "nbconvert_exporter": "python", 204 | "pygments_lexer": "ipython3", 205 | "version": "3.5.1" 206 | } 207 | }, 208 | "nbformat": 4, 209 | "nbformat_minor": 0 210 | } 211 | -------------------------------------------------------------------------------- /standardiser/docs/05_standardise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%run notebook_setup.py\n", 12 | "\n", 13 | "sys.path.append('../..')" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | 
"metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "from standardiser import standardise" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "standardise.logger.setLevel('DEBUG')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "slideshow": { 42 | "slide_type": "slide" 43 | } 44 | }, 45 | "source": [ 46 | "# `standardise`: get standardised parent" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "### Introduction\n", 54 | "\n", 55 | "This document provides some examples of the use of the **`standardise`** module, which uses the component modules described in the previous documents." 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "### Examples" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 4, 68 | "metadata": { 69 | "collapsed": false, 70 | "slideshow": { 71 | "slide_type": "slide" 72 | } 73 | }, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAHZElEQVR4nO3d3XabOhCAUTir7//K\nnAu3XoQYGzNIGom9r5K0TUhjfxnEj+dlWSYAzvqv9QYA9E1GAUJkFCBERgFCZBQgREYBQmQUIERG\nAUJkFCBERgFCZBQgREYBQmQUIERGAUJkFCBERgFCZBQgREYBQmQUIERGAUJkFCDkT+sNgGmapmme\np/WL1D7fnee/H9n86ZOXtqU1GSWxdVvXYX0ZXGjETj2J6SM9MI2SxnpXHfoho6Sxt/r5eNtkSlYy\nSm6WPknP2iiJaSg9MI2S2+9zm5bFCU+kMi8ehQABduoBQmQUIERGAUJkFCBERgFCZBQgREYBQmQU\nIERGAUJkFCBERgFCZJT+zPOL+5FCKzIKECKjdGlztzxoSEYBQmSUPmzWQycDKWm4+z15uck9XZBR\n0nnW82M6DaRkIKNkcbyekIrXYqIlu+0MwDRKG4+ASicDcKSegt5cbrQsRRpqqZT6ZJShOOhEfTJK\nWbrG8GSU0Qg3lckoxdXvmpJSk4zSvZfFVFKqkVFqKBo1xaQtGWVY8kodrmJiEPP8+kTUvY/DVUyj\n1FBhKtybPZdlmg2llCSjjMNePE3IKG3U7N2yLAZSypFRhrK/a6+klCKjjMZiKJXJKHdhIKUQGWVA\ne8VUUkqQUcakmFQjowzrZUnllcvJKPcyz67c42IyysjMnlQgowxuPXsaRSlBRrkLDaUQGeUWNJRy\nZBQgREZpo+ZoaBSlKBmlhoYR01BKk1FGNs+ThlKajAKEyCjD8ipM1CGjjElDqUZGAUJklILm+cdL\nelS7ut0oSk0yymg0lMpklLK86DHDk1Ha2OzvX/hpjaJU9qf1BjC+x0C6qdvj3WdJr2qfhlKfjNLS\ns3qX9xSqkVFq+LhCuumpmNIRGSWXQjv7UI6b35CanpKfaZTU7OyTnxOe6MOy/Ehqk4uj4CUZBQiR\nUbrk4ijykFGAEIeY6NXvi6M286njUdQho4xDN2nCTj0ds0JKBjIKEOIqJoAQ0yhAiIwChMgoQIiM\nAoTIKECIjAKEyChAiIwChMgoQIiMAoTIKECIG+UxTdPPW3W6zQJ8Q0aZXtz9WEnhMDv13No8z7Nb\nlhJjGuWOnul83Chynt0xkvNklBvZ1PNpWRYl5TQZZXx79VxTUk6TUYZ1pJ5rSso5HjRM0zTaCU+P\ngJ57bEf+Lfcko4zj2/Hz/afy1OAgj5V7G+IU0dP1fN9KJeUga6P81E9Y47Pn+8VQS6UcJKN05sI9\n90lJuYKM3lg/g+dDoYM/j1bufWYl5SMXg7KSPqyFcrYsyzOmL/+0yQWjLlTthYzCX0lKOv/zPu7k\nYaeePtTZs36zC196796Fqv0yjdKrQmNa5Zl0M3u+Kfi1X5cLmUZh681Bp6tmw3MXqh7/+9Qko3eV\n/mhSW29uoBcs2ul/655+afmR3NXvjCYO6+92VKvJmy/01Ta4UHVg1kavtDlDxXrWAIJLpUeWPi/c\nJJqwU39T8zSZZw46cfj+2kutdr5u2p2H25HRi/1+XpV+Rl1FWN84eNCp5s96WTIvw9yLjBb3fEbl\nWdLKsyVHJNnacgedApukpClYG73e76Wr/Ff1JUnVQQ239v1SaeWNmf6VlLZktLjn4YXWG8I1sh3h\neZQ00xbdjowWke2ZxrVanX21Z1mMpS3JaHGPpD6eadpKOUraikNMV1qPJC/ftmv/reaDXl8cdGrC\nNNpGw7G09zD1vv2lmUnrk9E2Uu3gC9Ng/DArk9FmnmumrTeEEL+EsDba0t+nX93VrPX1Nsmf/wpF\nF0yjCVRczXr5AhVSNarN+aTPtzcPt/XHnYJ6gmk0h8JHWId5gYq+trYv6wegw/1fkdE0CpT0yJ0y\nzKRjO/6w2vs7kvqRjGZyVUn/5fP4C1RM+Qa9bNvzUhcb+dLen
vvj431+T83IaDKnH7/rp8WpZ3Z3\nO/gc9PK38/rdn4+dvx9Zr9hr63sy2rPfD/8YJWWtxYkkXZLRPhUbDzKXNO2G5XfkZBC5PE1Gc9ub\nN0s+3jOUtPkG3NA6tbUea4OQ0cQ240HFaSFDSfPL/1+0V8O9ddLc301eTr/ntYbXqqa6vTx8ZBpl\nV+UToTZnueaf9eBBRvmg9A7+MFdYcVsyymcliuYKK4YhoxxyVUm/fSX3tDv4CTeJVmQ0sc3Jfq2f\ntMGSRuZKO/hkJqO5JQvHiZx9O35e+KWvYmGB92SU7xxcr7yqnr+/dLWclfgWGJKM8rU365Wl01Oh\npOrJt6w3cd6zaJXTU6KkX30LFmpZM41yXqsTkq79upY+CfJLlY7FzxyYvg+o7LIho/St2pkD1kzZ\nI6N070hJ1ZNyZJQRvCnpuX1w9eQ4GWUQTa5VhUlGGUmra1W5ORllKN/W0PhJnIwyoI9jqXpyIRll\nTE2uVeWeZJRhtbpWlbuRUUbmwBEVyChAiBdYBgiRUYAQGQUIkVGAEBkFCJFRgBAZBQiRUYAQGQUI\nkVGAEBkFCJFRgBAZBQiRUYAQGQUIkVGAEBkFCJFRgBAZBQiRUYAQGQUIkVGAkP8BpcFlOHREwJMA\nAAAASUVORK5CYII=\n", 78 | "text/plain": [ 79 | "" 80 | ] 81 | }, 82 | "execution_count": 4, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "mol = Chem.MolFromSmiles(\"[Na]OC(=O)Cc1ccc(C[NH3+])cc1.c1nnn[n-]1.O\")\n", 89 | "\n", 90 | "mol" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 5, 96 | "metadata": { 97 | "collapsed": false, 98 | "slideshow": { 99 | "slide_type": "fragment" 100 | } 101 | }, 102 | "outputs": [ 103 | { 104 | "name": "stderr", 105 | "output_type": "stream", 106 | "text": [ 107 | "[2016/May/13 08:16:41 DEBUG ] Starting fragment 1 '[Na+]'...\n", 108 | "[2016/May/13 08:16:41 DEBUG ] 1) Check for non-organic elements...\n", 109 | "[2016/May/13 08:16:41 DEBUG ] Starting fragment 2 '[NH3+]Cc1ccc(CC(=O)[O-])cc1'...\n", 110 | "[2016/May/13 08:16:41 DEBUG ] 1) Check for non-organic elements...\n", 111 | "[2016/May/13 08:16:41 DEBUG ] 2) Attempting to neutralise (first pass)...\n", 112 | "[2016/May/13 08:16:41 DEBUG ] 3) Applying rules...\n", 113 | "[2016/May/13 08:16:41 DEBUG ] 4) Attempting to neutralise (second pass)...\n", 114 | "[2016/May/13 08:16:41 DEBUG ] 5) Checking if frag is a salt/solvate...\n", 115 | "[2016/May/13 08:16:41 DEBUG ] ...fragment kept.\n", 116 | "[2016/May/13 08:16:41 DEBUG ] Starting fragment 3 'c1nnn[n-]1'...\n", 117 
| "[2016/May/13 08:16:41 DEBUG ] 1) Check for non-organic elements...\n", 118 | "[2016/May/13 08:16:41 DEBUG ] 2) Attempting to neutralise (first pass)...\n", 119 | "[2016/May/13 08:16:41 DEBUG ] 3) Applying rules...\n", 120 | "[2016/May/13 08:16:41 DEBUG ] 4) Attempting to neutralise (second pass)...\n", 121 | "[2016/May/13 08:16:41 DEBUG ] 5) Checking if frag is a salt/solvate...\n", 122 | "[2016/May/13 08:16:41 DEBUG ] Starting fragment 4 'O'...\n", 123 | "[2016/May/13 08:16:41 DEBUG ] 1) Check for non-organic elements...\n", 124 | "[2016/May/13 08:16:41 DEBUG ] 2) Attempting to neutralise (first pass)...\n", 125 | "[2016/May/13 08:16:41 DEBUG ] 3) Applying rules...\n", 126 | "[2016/May/13 08:16:41 DEBUG ] 4) Attempting to neutralise (second pass)...\n", 127 | "[2016/May/13 08:16:41 DEBUG ] 5) Checking if frag is a salt/solvate...\n" 128 | ] 129 | }, 130 | { 131 | "data": { 132 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAGa0lEQVR4nO3d23baOhiFUejY7//K\n3heMUsLBMV62rF+e86pN0lSA+CIfc52m6QLAWn+OHgBAbTIKEJFRgIiMAkRkFCAiowARGQWIyChA\nREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYB\nIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIK\nEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIv8dPQAy\n1+u/P0/TceOA85LRyq7XH+l8+ivQhI36sl6jOU0/FqdAEzIKEJFRgIiMAkRkFCAio2W9HlBypH6B\n6/V6dSCOTTnhqbKnkmroB4/dnKbp9pHJ08VGTCbG9JrO1y8w+dmEmcRQ7vVcMrGVlE2YRgXZB/rT\nrwvP+X/rLUDIvlFKStL5aJomJSVkAlXzaSl6giXqVul8+529EVjNapTefbW7cx1rUhIySo/2WHjO\nh1JJWU1GhzDEFv1+2+z376mk7EFGSxkil69uAW3QLyVlDy4GpQvNynULZfIF8ERGOR0lZVsyWse4\nW/TtN6JvoZxp5a9fAHcyWt+ged3bNE3zq85fvwBuZJRTK7eBb43cIRmlRy1L0X9Jrw+skTvkhKci\nBt1y7+Tsoj5PhJq5fMuJWV2R0eIGzWt7/ZR04cWvStoPGa3hermc5+1yVB0OLOm6K7iUtBMyCv80\nLml+15X7flIxPZCMFjDzvi29Su1zJdWgpNves8pvlzqcjBbmnbOTPUp6+I1X2I+M0pdOWrBVSRvc\nLPXbIbE5GYX3ft3tOJOtlvVcOCT24xnv3cwbtfRrV+hxzQ/p/tm9N9uXcxZcY1aj8Iv5JV6Hx8qn\nSUmbcjEoHelwKXqz5ILRZoNZ4lZS2pBRWKTclexK2oyMdq3QDsQzeFvSnl8LJW1DRjlAz+mZV27Y
\nStqAjMLglHRvMtq1+9rn8Wa9dZdy80Z9XD1Q0l054alrT2dxC01vCr0izoLaj4z26NM1MGNcozLA\nQyhKSXciox1ZcgXhGCXlKEq6Bxk93rfXXw9Z0vEeUbc8zZuT0eP8zeeKfAxZUihKRtt6PFyaVbBo\nSSuO+ZORHgsJGW3iXs9N33VFS0o/XveT3j8y86nLXjO6Khndzc+F507/yRglHeAh1LXiiNNTT710\nMvrO/A/iy2+JvH221eQao6QcZcWxe3PtiYx+7zWpT9Oq+SyrUtISg2Q5S9EbGf3S22h2MJuqlHQY\nIz3bn6bw/PWjHcz6XsjoOIqWtOKYx/O2pK97th7/7EW7k9EPat7IoWhJqUVDn8joBzM/iPvW4a8G\nutH3zrkL1GoyOqBat4MqMcgnVZ7bby0vaZPT+cqQ0S+97kPqdQunqw38Wr/F6DxeZ8f9I1996uRk\n9HtPP7I7nlOHl9T9UjkD03p8jeP1uPB8/X/HKOkYj4KtWI2Or82adOHt/g5fIMPmTOiz2Cle394s\nddfBtFF68OzBavQsNlwGzm+2Nx4MHE5GTySM17qF5/xgtvpucCAZPZcVJd22nk+DudhGpj4ZPZ2F\nJd2vnuvGA90yfU/qbbnynZ7bjudAn3Y49DZOemA1elKPa8BmC8+F4zlQD08F5cjoefV2kOeokh64\nBmcMMnpqvVWjZUktPNnK8ZtR8GTXkib17GG3Ax2yGqU7m69JbbazKxmlR5uUdNvNdvf64xMbKfRr\nXUk3rKdlLEvIKF1bWNJte+foE1+RUXo3X9INz9lST9aRUQrY7/xWm+3kZJQyNjx8b+HJhmSUSsKS\nqid7kFGKWX2jv4t6sg8ZpZ6ubvQHMkpJn0qqnrQno1T1eqO/i3pyBBmlsK5u9MdpyShA5M/RAwCo\nTUYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYB\nIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIK\nEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFR\ngIiMAkRkFCAiowARGQWIyChAREYBIjIKEPkfeTAX9iW4a5QAAAAASUVORK5CYII=\n", 133 | "text/plain": [ 134 | "" 135 | ] 136 | }, 137 | "execution_count": 5, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | } 141 | ], 142 | "source": [ 143 | "parent = None\n", 144 | "\n", 145 | "try:\n", 146 | " \n", 147 | " parent = standardise.apply(mol)\n", 148 | " \n", 149 | "except standardise.StandardiseException as e:\n", 150 | " \n", 151 | " logging.warn(e.message)\n", 152 | " \n", 153 | "parent" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "### Multiple non-salt/solvate components" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 6, 166 | "metadata": { 167 | "collapsed": false, 168 | "slideshow": { 169 | "slide_type": "slide" 170 | } 
171 | }, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAHaElEQVR4nO3d3ZabuBKAUThr3v+V\nORd0vGhsaEwhVBJ7X2U6M2kysb+UxI/HaZoGAM76X+0DAGibjAKEyChAiIwChMgoQIiMAoTIKECI\njAKEyChAiIwChMgoQIiMAoTIKECIjAKEyChAiIwChMgoQIiMAoTIKECIjAKEyChAyH+1DwCGYRiG\ncRyWH1L7+sdx/PnK6mdffLQttckoiS3bugzrx+BCJRb1JKaPtMA0ShrLpTq0Q0ZJY2v3c/6xyZSs\nZJTcbH2Snr1REtNQWmAaJbf3a5umyQVPpDJOXoUAARb1ACEyChAiowAhMgoQIqMAITIKECKjACEy\nChAiowAhMgoQIqMAITJKUuP44aGjkJCMAoTIKHmtHokHOckoQIiMksVqM3RmICU/T7+nslclPUCc\nRskodRyvp4GU5GSU+/gIJbrks5gozrKdvplGKWgOqHrSN2fqCdm/12iaijTUVimpyCjtcdKJVGSU\nKFHj4WSUJmk3ecgoF6gSNSUlCRmlDR+LqaRkIKNco3TRFJO0ZJS2ySvVuYuJlozj5wtRt74ONzCN\nco17RsKt2XOahtFQSiUySmOs4slGRinl5thN02QgpQoZpT3bS3slpQIZpUk2Q8lDRumKgZT7ySit\n2iqmknIzGaVhikkGnn5P2+aSru4iuSGv7lvhRUbp0HtYoRyLeppnaU9dMkoPlrOnUZSbyShd0VDu\nJ6P0Q0OpQkYBQmSUUm6eC42i1CKjXKNuwTSUimSU5o2ji+GpSUYBQmSUtvkUJqqTURqmoWQgowAh\nMkrIOP76PI87b203ipKEjNIkDSUPGSXKJx7zcDJKQasl/4W/rFGUPDy2mQvMA+l72uavvEp6Vfs0\nlFRklOJe1bu8p5CBjHKNIzukq56KKX2QUSootNiHKjwXh/r0lKaZRqnPYp+mueCJRKbpV1Jr3R8F\nX5FRgBAZJS/3R9EEGQUIcYqJ1N7vj1rNp85HUZ2M0hjdJBuLerKzQ0pyMgoQ4i4mgBDTKECIjAKE\nyChAiIwChMgoQIiMAoTIKECIjAKEyChAiIwChMgoQIgH5TEMw++neHrMAnxDRhk+PBhZSeEwi3oe\nbRzH0dNMiTGN8kSvdM4PihxHT4zkPBnlQVb1fJmmSUk5TUbp31Y9l5SU02SUbh2p55KSco4XDcMw\n9HbB0xzQc6/tyH/LM8ko/fh2/Nz/pbw1OMhr5dm6uET0dD33W6mkHGRvlN/aCWt89tzfDLVVykEy\nSmMuXLkPSsoVZPTB2hk8Z4VO/syt3PqVlZQ/uRmUhfRhLZSzaZpeMf34s1VuGHWjaitkFH4kKen4\nz37cycOinjbcs7LeWcKXXt27UbVdplFaVWhMu3kmXc2eOwW/9vtyIdMorO2cdLpqNjx3o+rxf587\nyehTpT+bVNfOA/SCRTv933qmX1r+SJ7qPaOJw/rejttqsvONvjoGN6p2zN7olVZXqNjP6kBwq/TI\n1ueFh0QVFvUPNQ6DeeagE6fvr73VauP7pl08PI6MXuz9fVX6HXUVYd1x8KTTnX/W05R5G+ZZZLS4\n1zsqz5ZWniM5IsnRljvpFDgkJU3B3uj13reu8t/VlyRVB1U82v2t0psPZvhXUuqS0eJepxdqHwjX\nyHaGZy5ppiN6HBktIts7jWvVuvpqyzQZS2uS0eLmpM7vNG2lHCWtxSmmKy1Hko8/trT/VvVBry1O\nOlVhGq2j4ljaephaP/7SzKT3k9E6Ui3whakz/jBvJqPVvPZMax8IIf4Swt5oTT9vv3t3s5b32yR/\n/ysUTTCNJnDjbtbHD6iQql6trid9/Xj1clt+3SWoJ5hGcyh8hrWbD6ho
62jbsnwBOt3/FRlNo0BJ\njzwpw0zat+Mvq61/R1L/JKOZXFXSf/k8/gEVQ75BL9vxfNTEQX60tXKfv97m76kaGU3m9Ot3+bY4\n9c5uboHPQR//dl7+4+/Xzs9Xljv22rpPRlv2/vKPUVKWalxI0iQZbVOx8SBzSdMeWH5HLgaRy9Nk\nNLetebPk6z1DSasfwAMtU3vXa60TMprYajy4cVrIUNL88v8v2qrh1j5p7t9NXi6/57OK96qmerw8\n/Mk0yqabL4RaXeWaf9aDmYzyh9IL/G7usOKxZJS/lSiaO6zohoxyyFUl/faT3NMu8BMeErXIaGKr\ni/1qv2mDJY3MlRb4ZCajuSULx4mcfTt+Xvitr2JjgX0yyncO7ldeVc/3b31bzkr8FuiSjPK1nf3K\n0um5oaTqybfsN3Heq2g3p6dESb/6LdioZck0ynm1Lki69vva+iTIX6o0LH7lwPB9QGWXFRmlbbdd\nOWDPlC0ySvOOlFQ9KUdG6cFOSc+twdWT42SUTlS5VxUGGaUnte5V5eFklK58W0PjJ3EySof+HEvV\nkwvJKH2qcq8qzySjdKvWvao8jYzSMyeOuIGMAoT4gGWAEBkFCJFRgBAZBQiRUYAQGQUIkVGAEBkF\nCJFRgBAZBQiRUYAQGQUIkVGAEBkFCJFRgBAZBQiRUYAQGQUIkVGAEBkFCJFRgBAZBQj5P9VSazYi\neae0AAAAAElFTkSuQmCC\n", 176 | "text/plain": [ 177 | "" 178 | ] 179 | }, 180 | "execution_count": 6, 181 | "metadata": {}, 182 | "output_type": "execute_result" 183 | } 184 | ], 185 | "source": [ 186 | "mol = Chem.MolFromSmiles(\"[Na]OC(=O)Cc1ccc(C[NH3+])cc1.Cc1nnn[n-]1.O\")\n", 187 | "\n", 188 | "mol" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 7, 194 | "metadata": { 195 | "collapsed": false, 196 | "slideshow": { 197 | "slide_type": "skip" 198 | } 199 | }, 200 | "outputs": [ 201 | { 202 | "name": "stderr", 203 | "output_type": "stream", 204 | "text": [ 205 | "[2016/May/13 08:16:41 DEBUG ] Starting fragment 1 '[Na+]'...\n", 206 | "[2016/May/13 08:16:41 DEBUG ] 1) Check for non-organic elements...\n", 207 | "[2016/May/13 08:16:41 DEBUG ] Starting fragment 2 '[NH3+]Cc1ccc(CC(=O)[O-])cc1'...\n", 208 | "[2016/May/13 08:16:41 DEBUG ] 1) Check for non-organic elements...\n", 209 | "[2016/May/13 08:16:41 DEBUG ] 2) Attempting to neutralise (first pass)...\n", 210 | "[2016/May/13 08:16:41 DEBUG ] 3) Applying rules...\n", 211 | "[2016/May/13 08:16:41 DEBUG ] 4) Attempting to neutralise (second pass)...\n", 212 | "[2016/May/13 08:16:41 DEBUG ] 5) Checking if frag is a salt/solvate...\n", 213 | "[2016/May/13 08:16:41 DEBUG ] 
...fragment kept.\n", 214 | "[2016/May/13 08:16:41 DEBUG ] Starting fragment 3 'Cc1nnn[n-]1'...\n", 215 | "[2016/May/13 08:16:41 DEBUG ] 1) Check for non-organic elements...\n", 216 | "[2016/May/13 08:16:41 DEBUG ] 2) Attempting to neutralise (first pass)...\n", 217 | "[2016/May/13 08:16:41 DEBUG ] 3) Applying rules...\n", 218 | "[2016/May/13 08:16:41 DEBUG ] 4) Attempting to neutralise (second pass)...\n", 219 | "[2016/May/13 08:16:41 DEBUG ] 5) Checking if frag is a salt/solvate...\n", 220 | "[2016/May/13 08:16:41 DEBUG ] ...fragment kept.\n", 221 | "[2016/May/13 08:16:41 DEBUG ] Starting fragment 4 'O'...\n", 222 | "[2016/May/13 08:16:41 DEBUG ] 1) Check for non-organic elements...\n", 223 | "[2016/May/13 08:16:41 DEBUG ] 2) Attempting to neutralise (first pass)...\n", 224 | "[2016/May/13 08:16:41 DEBUG ] 3) Applying rules...\n", 225 | "[2016/May/13 08:16:41 DEBUG ] 4) Attempting to neutralise (second pass)...\n", 226 | "[2016/May/13 08:16:41 DEBUG ] 5) Checking if frag is a salt/solvate...\n", 227 | "[2016/May/13 08:16:41 WARNING ] Multiple non-salt/solvate components\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "parent = None\n", 233 | "\n", 234 | "try:\n", 235 | " \n", 236 | " parent = standardise.apply(mol)\n", 237 | " \n", 238 | "except standardise.StandardiseException as e:\n", 239 | " \n", 240 | " logging.warning(e.message)\n", 241 | " \n", 242 | "parent" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "### No non-salt/solvate components" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 8, 255 | "metadata": { 256 | "collapsed": false 257 | }, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAEeUlEQVR4nO3dy26jQBRF0dDq///l\n6oFblgXmEY4Nt2AtZZA4dsJo6xbPobX2A8Bef87eAIC+yShAREYBIjIKEJFRgIiMAkRkFCAiowAR\nGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWI\nyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChA\nREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYB\nIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREY53zCsvwJlySgl6Cb9klFKaE1J6ZWMAkRklBMMw5vZ\n00BKp/6evQHcyLOSrb1/w6Okc7+FmmSUr1utJ3RNRvmWffW0tKc7MsqH5bOnktKXoVlo8QlW7tyW\naZSIeoKMstMjoOoJzhvlvdXr3Fs7rqF2lVKZjDKrTrwcdKIyGWVWqXiV2hh4JaMAERllSakZsNTG\nwJOMsqJUvEptDDzIKBUttFJJqUZGWXd8uZb/o5JSioyyyfGn2WslvXBNPaUt3H7UnUkpwjRKr4yr\nFCGjrDg3VRt2kkopJ5NRqjN1UpyMstORaVsoaWvNQMq5ZJQ+KCllySjdsCeUmmSUKzCQciIZpSdy\nSUEySmfmSqqwnEVGuY5fl3T5SSnD8P9r+p65X3FLHmlHfx65/O51zK+Xms59P/2RWzKN0qWvL+3f\nxnEaTdcGIKNczOenVPMmayzq6dVnlvbLs6SGsoGM0rFRQ/dUdfT+0SEmDWUDi3ou4sPLeQ1lMxmF\nzaYHlNQWi3qu4SvnP70W8/nHRyXVUGSUC9jf0OmnXnO5/VPcm0U9OxWJydfPw4c1MgoQkVFWVB71\njKJUIKP0ahjG543CKWSULjnRiDpkFCAio/THKEopMkpnNJRqZJSeaCgFySjvLT9f4xQaSk0yyqzT\nuwldkFFmlXpAhlGUsmSUDmgolckoSxYG0iMfMKyhVOZGeax4lHTulnLPkiodtyWjRF6f3z56BW5C\nRlm35ViTnnJbMsom24/aT3v6I6lcmts1cgQjKhdmGuUIlvxcmIxyqFFPxZQLcN4o52jt/V7UuVeg\nLBmlBN2kXzJKCaWu34dfkVGAiIxShYGUTjlSTyHT6/dHYXVkn4JklNJ0k/os6qnF0p7uyCjlmEDp\ni2vqASKmUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIy\nChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCR\nUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCI\njAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiPwDlUUUtHxV6BwAAAAASUVO\nRK5CYII=\n", 262 | "text/plain": [ 263 | "" 264 | ] 265 | }, 266 | "execution_count": 8, 267 | "metadata": {}, 268 | "output_type": "execute_result" 269 | } 270 | ], 271 | "source": [ 272 | "mol = Chem.MolFromSmiles(\"c1nnn[n-]1.O\")\n", 273 | "\n", 274 | "mol" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 9, 280 | "metadata": 
{ 281 | "collapsed": false 282 | }, 283 | "outputs": [ 284 | { 285 | "name": "stderr", 286 | "output_type": "stream", 287 | "text": [ 288 | "[2016/May/13 08:16:41 DEBUG ] Starting fragment 1 'c1nnn[n-]1'...\n", 289 | "[2016/May/13 08:16:41 DEBUG ] 1) Check for non-organic elements...\n", 290 | "[2016/May/13 08:16:41 DEBUG ] 2) Attempting to neutralise (first pass)...\n", 291 | "[2016/May/13 08:16:41 DEBUG ] 3) Applying rules...\n", 292 | "[2016/May/13 08:16:41 DEBUG ] 4) Attempting to neutralise (second pass)...\n", 293 | "[2016/May/13 08:16:41 DEBUG ] 5) Checking if frag is a salt/solvate...\n", 294 | "[2016/May/13 08:16:41 DEBUG ] Starting fragment 2 'O'...\n", 295 | "[2016/May/13 08:16:41 DEBUG ] 1) Check for non-organic elements...\n", 296 | "[2016/May/13 08:16:41 DEBUG ] 2) Attempting to neutralise (first pass)...\n", 297 | "[2016/May/13 08:16:41 DEBUG ] 3) Applying rules...\n", 298 | "[2016/May/13 08:16:41 DEBUG ] 4) Attempting to neutralise (second pass)...\n", 299 | "[2016/May/13 08:16:41 DEBUG ] 5) Checking if frag is a salt/solvate...\n", 300 | "[2016/May/13 08:16:41 WARNING ] No non-salt/solvate components\n" 301 | ] 302 | } 303 | ], 304 | "source": [ 305 | "parent = None\n", 306 | "\n", 307 | "try:\n", 308 | " \n", 309 | " parent = standardise.apply(mol)\n", 310 | " \n", 311 | "except standardise.StandardiseException as e:\n", 312 | " \n", 313 | " logging.warning(e.message)\n", 314 | " \n", 315 | "parent" 316 | ] 317 | } 318 | ], 319 | "metadata": { 320 | "kernelspec": { 321 | "display_name": "Python 3", 322 | "language": "python", 323 | "name": "python3" 324 | }, 325 | "language_info": { 326 | "codemirror_mode": { 327 | "name": "ipython", 328 | "version": 3 329 | }, 330 | "file_extension": ".py", 331 | "mimetype": "text/x-python", 332 | "name": "python", 333 | "nbconvert_exporter": "python", 334 | "pygments_lexer": "ipython3", 335 | "version": "3.5.1" 336 | } 337 | }, 338 | "nbformat": 4, 339 | "nbformat_minor": 0 340 | } 341 | 
-------------------------------------------------------------------------------- /standardiser/docs/06_alternative.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%run notebook_setup.py" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": false 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "sys.path.append('../..')" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": { 29 | "collapsed": false 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "from standardiser import break_bonds, unsalt, neutralise, rules" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "for module in [break_bonds, unsalt, neutralise, rules]: module.logger.setLevel('DEBUG')" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "slideshow": { 51 | "slide_type": "slide" 52 | } 53 | }, 54 | "source": [ 55 | "# `standardise`: get standardised parent" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "### Introduction\n", 63 | "\n", 64 | "This notebook provides simple examples of the use of the individual modules in the **`standardiser`** package in a 'bare bones' alternative workflow.\n", 65 | "\n", 66 | "TODO: add further examples and some commentary" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "### Examples" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 5, 79 | "metadata": { 80 | "collapsed": false, 81 | "slideshow": { 82 | "slide_type": "slide" 83 | } 84 | }, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAGZ0lEQVR4nO3d0bKiRhSGUU3l/V+Z\nXFBz5kQUlR/o3c1aNRfJnEwKR/ncDQr3aZpuAGz1T+sNAOibjAJEZBQgIqMAERkFiMgoQERGASIy\nChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCR\nUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCI\njEJJ9/vfXw+/v/wvaerf1hsALNzvt2l6+a8UYxqFYpbRnCYjZ2UyChCxqIfemEyLkVHozcOSX1Vb\ns6gHiMgoFLM8oeRMfW0W9VDPQ0k1tLb75BkCCFjUA0RkFCAio1CPzzB1RUYBIjIKxfh4U29kFCAi\nowARGQWIyChU4sBoh2QUICKjABEZhTKs6PskowARGQWIyChAREahBgdGuyWjABEZhRJcGq9fMgoQ\ncUs79uMubFvd7+6K1jEZZScPZ0icMOEyLOrZwzKay5utw6BkFCAio9CYA6O9k1GAiIwCRGSUPSxP\nKDlTz2X4wBM7eSiphn7GgdEByCj7kQMuyaIemrn7aO0QTKMEHAD93u90zst56/reySgcbpnO36Zp\nUtKuySgHMKXebrdf9XybSCXtmoyylVY+sz54rlDSfskoe7teXjen84GSdkpGYYu90skAZBS+8Pnh\nzvX/yas/biDtkSeMTV6t3Edc0R8xeK63Ukn7Yho91nJ/sId04eg1+/rUaSbti4wezv7QlzmgJzxl\nH5R0vMl+TL4Merh5b3n6o/sfJ28S605721t5bdzch6UfMtrMPInMOivpuAdGz186KOkAZPQMT3cV\nK31mSto7GT3J013Fop6ZknZNRpv5vahvvS3feXWgt/cV/SvnvM/N55Re/1RJ65LR8/R3DPSZgT94\n0PyhrbdSScuS0VP93kvnqs4GKGzfW//ayW1V0h753Oixnl5c8u0/d6f5HDeSuZWv/jrXf0oTplG4\n3Yq9E5hJ+yKjfKFUa8ampB2R0XJ6P0g6krZvG29LShEyWk53p5sGGFHLPgRTZxdktKKaJS3bmrEp\naX0yCtV58ypORouqOZAuDTyiDvzQ2JeM1jVZzp1igFwuXyZeOGeS0drKlHSA1oytxsvkomS0vLmk\n9hJWlXnDvSIZ7cE0Nd9L5lH04bJ+A4yorx7CAA+N0/hOfT8afZt6eXM3ianJ1+1bkdGunLiLrNyQ\n/edebGJajZI2IaP89fldhce4A7AVPbuQ0cu7329/rhb6VTvGKOl4Wh9FvyIZvZ6HnWyabrfbthYq\naU1KejIZvYBn3dyLklaw/Ov3hJzJDjCun3oe/xT3WFIHRtmLabRnv8fMpgOJmbQ5J+gbktFuPew3\nrXcjJeWyfIupT8toFjit0MtVqW7Drdxbv4denYyyp45K+tRgeeUcMsrOei8pfEtG2V/xkho52ZeM\ncojiJR2JA6PNyWiflieU6u1M3ZXUlMo2PvDUrYeSltz/fQqKK/AS53ClSlpqY3L1FiFXZFHP4Yqs\n7h8u3Q97sajnDK1W90+voDrYQEpzMspJTivp24tPO2LLvryYONVB/fr8uv1Hb8mZHBgtwjTKqXac\nBDek86Ataee+9Yrb7ElGOVvSrzCdO24J/PAaoo3P+7VvOpMtKaXTzR6SaZQ21ifBo9P5+ZbAWzJK\nMw/9OjOd61sCX/HSobGfejZ/KXZU0o429QpMozRWJwdmUrbxZVD4q8j3Vn/4AmsXTKPwP81n0obH\niNnGEgaeOLmkX6XTkYdqTKPwxAkzqalzGDIKzx1U0jqfTGAvVgewZpeS7jh4WtEXZBqFNZtn0iPW\n7M7a1+SdDd77sKRHp9PeWpOMwkfWSzrHTjqvSUbhU6edu7dX9
kVG4Qu7l9TgOQAZhe/kJZXOwcgo\nfG1DSaVzYDIKW3x77t6ONjAZhY1eldTgeTUyCtv9lFQ6r0xGIbLvJ0bpkYwCRFz9HiAiowARGQWI\nyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChA\nREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYB\nIjIKEJFRgIiMAkRkFCAiowARGQWI/Af6v15amKhTUgAAAABJRU5ErkJggg==\n", 89 | "text/plain": [ 90 | "" 91 | ] 92 | }, 93 | "execution_count": 5, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "mol = Chem.MolFromSmiles(\"[Na]OC(=O)Cc1cc(O)ncc1\")\n", 100 | "\n", 101 | "mol" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 6, 107 | "metadata": { 108 | "collapsed": false 109 | }, 110 | "outputs": [ 111 | { 112 | "name": "stderr", 113 | "output_type": "stream", 114 | "text": [ 115 | "[2016/May/13 12:36:57 DEBUG ] Broke 1 bonds to Group I and II metals\n" 116 | ] 117 | }, 118 | { 119 | "data": { 120 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAGN0lEQVR4nO3d23KbShBAUXEq///L\nnAf5IoOELg3MdM9alYfEdlWQbbZ6BIJpnucLAJ/6r/UGAOQmowAhMgoQIqMAITIKECKjACEyChAi\nowAhMgoQIqMAITIKECKjACEyChAiowAhMgoQIqMAITIKECKjACEyChAiowAhMgoQIqMAITIKECKj\nACEyChAiowAhMgoQIqMAITIKECKjACEySjLTND39CJxJRslHN+mKjJLPPM+PSjp9O3mTGJmMUsc0\nTfO3RUmFlePIKCndHUjneW6yMQzuX+sNgA9dS7pI58YBqOtfpJbdySh13Fb1p57Xj6yDC3uxqCex\njWNNcBoZJbfbGfNa1atFYY2iHMdKByDENAoQIqMAITIKECKjACEyChAiowAhMgoQIqMAITIKECKj\nACEyChAiowAhMgoQIqMAITIKECKjACEyCk1N0++fxcfXX0mX3NIO2pmmy+3tJxb/JAnTKDSyjuY8\nGzkzklGAEIt66JXJNAkZhV4tlvyq2iuLeoAQGYVG1geUHKnPyaIe2lmUVENzmmY/OYAAi3qAEBmF\nRhx5r0JGAUJkFFpwUL4QGQUIkVE4nVG0FhkFCJFRgBAZhXNZ0ZfjzaC8wzsXYUVGeZk7XsA9FvW8\nxh0vduG5pyIZBQiRUTiLUbQoGQUIkVE4iReSq5JRXuOOFzHT5BLpZTnhiZe54wXcI6O8QzphxaIe\nDmdFX5uMsskJ9vCMjAKEyCiPPToWb0SFGzIKx/LCaHkyygMbo6gowA0ZBQiRUd5hFH2TFf0InH7P\nPXIZNjkQNwwZhR2so/kzhBpIy5NRXmZEvbHoplCOTEZZkcuVjWHzqXmeDaS1ySjcse+wqaS1+dHy\n16ini56wSFfSqkyjjCiySIcFGeWP6XK5k5Mqo+hPPQ+N5qOp09K+Kqff82uEnXye56Mf4zWX736K\nvGSU52rs92c+SSjpUGSUL48qM8KIChEyCod4Np
CevDkcSEbZUmYUbfJANkvq4td1yCiXS6Fcvuvo\nlymVdAQySn3dPkkoaQ0yyrgHl855gNuvhCppATIKh9tupZJmJ6OjG3bkPPmBa2VhMsp95fPaFZFN\nTUZHp5WnsbSvysQxursXiCszinayov/7X29d5qXKRWDG4gpPo9h4R83iy2oEtFvXqfPR93j7s/RJ\nRouapsvfS4q8GEcXc2vO9z4dGa1iMWzO8+XulUNH0uGK/srIWYyM9uc2iNuvot3ab6c0kJ5ASSuR\n0c4s9q31rvZTzyN3wcIl7edx9bEV7MAJTz1ZR3N9Fsw8f/05WParC/eTS8qTUR7KXlI4h4yyJWlJ\nuz249K719z7hT6M+r422cNjRIepxJKp/MtpCqt2i8OGmFBzT759FfU/WB5T62IFyLe3LrOjJwjTa\nmUVJu9ntzaQNGUg7J6P96XV3SV3SvFt+paQ9s6jnDf2v7rPnkoxkFHJwQdJuySjv6X8gXSszopZ4\nEAXJKG/rtqRlcvlj/WhqPb4iZJRPdFtSOJ+M8qHeSup0UVqRUT7XW0mL8a3NQkapzCjKCWSUkE4G\n0nq5dLJ9It7FRFTDdzc9LXixttInGWUH55R0Hc3b/7HeQEoWfvPYzb4h247mCRvQkBV9LqZRdvPx\nTHp3bV4jiIxARjnbB2Pmi1Jfg4q8ZJQ9rUN2XDRf3AA4ml849nebzlZH8PP+YnthNB3TKPtrnjAz\nKWdy+j01dfK+AEYgo9ARK/qMZJSycg6k6TYYGaW0nCUlGRmluEQldVgsKRmlvkQlJSMZZQj9l9Qo\nmpeMAoTIKKNoPpBO3xpuA0ewjmAsp62d37qYgBV9at4MylgOep/oyVdgoSsyynDiJRVNbllKMKjX\nS3p0NK3oszONwh3Nr/VHIp4GGVcPY2AP20CQaZRx9XxraBLxTMjomt8amuxkFNrfGprUZBQul09L\n6tbQXLw2Cq8zZnKXaRS+tL01NHnJKPxyuigfkFGAEBfKAwiRUYAQGQUIkVGAEBkFCJFRgBAZBQiR\nUYAQGQUIkVGAEBkFCJFRgBAZBQiRUYAQGQUIkVGAEBkFCJFRgBAZBQiRUYAQGQUIkVGAEBkFCJFR\ngBAZBQiRUYAQGQUIkVGAEBkFCJFRgBAZBQiRUYCQ/wFSZ+mI37GexAAAAABJRU5ErkJggg==\n", 121 | "text/plain": [ 122 | "" 123 | ] 124 | }, 125 | "execution_count": 6, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "mol = break_bonds.apply(mol)\n", 132 | "\n", 133 | "mol" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 7, 139 | "metadata": { 140 | "collapsed": false 141 | }, 142 | "outputs": [ 143 | { 144 | "name": "stderr", 145 | "output_type": "stream", 146 | "text": [ 147 | "[2016/May/13 12:36:57 DEBUG ] Fragment contains a non-organic element\n" 148 | ] 149 | }, 150 | { 151 | "data": { 152 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAF2klEQVR4nO3d3ZaaWhCFUT0j7//K\nnotOJ0aEVpfCrtpzjtzkH9vms7YgnC+XywmAV/139AYA1CajABEZBYjIKEBERgEiMgoQkVGAiIwC\nRGQUICKjABEZBYjIKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjIKEBERgEiMgoQkVGAiIwCRGQU\nICKjABEZBYjIKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjIKEBERgEiMgoQkVGAiIwCRGQUICKj\nABEZBYjIKBzqfP774+bXl3+SIf06egNgYufz6XJZ/SlFmEbhIMtoXi5GzopkFCBiUQ+jMpkWIaMw\nqpslv6qOyqIeICKjcJDlASVH6muyqIfj3JRUQ2s6XzxzAAGLeoCIjAJEZBQO4gSmLmQUICKjcATn\nNjUiowARGYXdGUV7kVGAiIwCRHwYlGf45GLOir4dGeVh7ngB91jU8xh3vIAVMgo7MsJ3JKMAERmF\nvRhFm5JRgIiM8hh3vIg5HteVE554mDteBM5nd5poS0Z5hhDAgkU9QERG2eQE+3ewou9NRgEiMso6\nx+LhATLK86z04YqMssIo+ibeGG1PRnmSvMK/ZBQgIqPcszZyGkWfZEU/A59igvc7Owo3ExmF1DKa\n1xOogbQ9TzALVvSbtqO59lfsaI2ZRmHV3bW5IHLDiyT/mngUfWHMfOoft691ZRplUh+N5tLlclHS\nrmSUf5xPpzs7epdR9DqdnyvaWi6VtCvnjfLXDDv55dtH/4u1E542fou6ZJSf9djvZ3iR4BAyym9r\nlVGfFxhIpyKj8BE/lXTnzeGDZJQtbUbR0R7I8n7V1CWjnE7jVWY3H11fb6/flbQNGaW/A18klHQG\nMsqq9iPqCA9QSRuQUYaoSWM/HlBS0upklOZGOJFLKHuT0dmNUJkZbJdUZ0uTUdiJknZl4uB0ujrv\n5+v7oc0oOuCsvX2Zly4XgZmLKzzNaOMacW0CWtTXTOoZqEVGp3DTzY1Qupjbp/0YSl/7cmS0o+9o\n/mnnnFkccEX/xcjZjIxWtnZI4nsHfW0/NZDuQEk7kdHxXMdxuZ9t/+6bNC7pOI9rjK3gDWR0MDcj\nynJi2e+M8dolLb3x1OK80ZHcjeZxJxO6wDA8QkbZUrSkwx5cetbya1/w2ehPRmFoujk+740e4WbP\nGHs+qv4maXWO6Y9PRo9QbZ+oVdI2K3qqsKgfyfKA0jBzSNE3SXtw1ZLBmUYHc7PHjNHQL7Vm0ht1\nt/yLpf3IZHQ8A+8r45d08M2jJYt6qMHSflgyynMqvknaZkRt8SAaklGeNmxJ2+Tyj+Wj6fX4mpBR\nXjFsSWF/MsqLRiup00U5iozyutFK2owvbRUySmdGUXYgo0QGGUj75dLJ9oU4/Z7Ugefk/1jwZm1l\nTDLKG+xT0o37Qp86DqRU4TuPt3lvyLajucMGHMiKvhbTKG/z8kx6d23eI4jMQEbZ2wtj5oPGv3IK\nLcko77QM2eei+eAGwKf5huP9rtN51BH8ut/Y3hgtxzTK+x2eMDMpe3L6PT0N8rkAZiCjMBAr+opk\nlLZqDqTlNhgZpbWaJaUYGaW5QiV1WKwoGaW/QiWlIhllCuOX1Chal4wCRGSUWRw+kJ6/HbgNfIJ1\nBHPZbe381MUErOhL82FQ5vKhz4nufAUWhiKjTCcvqWhyzVKCST1e0k9H04q+OtMo3HH4tf4oxMsg\n8xphDBxhGwiZRpnXyLeGphCvhMzu8FtDU52MwvG3hqY0GYXT6dWSujU0J++NwuOMmdxlGoXfjr01\nNHXJKPzldFFeIKMAERfKA4jIKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjIKEBERgEiMgoQkVGA\niIwCRGQUICKjABEZBYjIKEBERgEiMgoQkVGAiIwCRGQUICKjA
BEZBYjIKEBERgEiMgoQkVGAiIwC\nRGQUICKjABEZBYjIKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjIKEBERgEiMgoQkVGAyP9WDq91\n9ucCCwAAAABJRU5ErkJggg==\n", 153 | "text/plain": [ 154 | "" 155 | ] 156 | }, 157 | "execution_count": 7, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "mol = [x for x in Chem.GetMolFrags(mol, asMols=True) if not unsalt.is_nonorganic(x)][0]\n", 164 | "\n", 165 | "mol" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 8, 171 | "metadata": { 172 | "collapsed": false, 173 | "slideshow": { 174 | "slide_type": "fragment" 175 | } 176 | }, 177 | "outputs": [ 178 | { 179 | "name": "stderr", 180 | "output_type": "stream", 181 | "text": [ 182 | "[2016/May/13 12:36:57 DEBUG ] 0 positive/H, 0 positive/quat and 1 negative (of which 1 are acid) charges identified\n", 183 | "[2016/May/13 12:36:57 DEBUG ] Overall H balance: +1; formal charge: 0\n" 184 | ] 185 | }, 186 | { 187 | "data": { 188 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAF40lEQVR4nO3d23baSBCGUZiV939l\n5sKxTRDi9IO6q7T3mptJZhL5oM/VqJGOp9PpAMCr/ht9AAC1yShAREYBIjIKEJFRgIiMAkRkFCAi\nowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowAR\nGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWI\nyChAREYBIjIKQx2Pv/9c/Pryv2RKf0YfAOzY8Xg4nVb/lSJMozDIMpqnk5GzIhkFiFjUw6xMpkXI\nKMzqYsmvqrOyqAeIyCgMsryg5Ep9TRb1MM5FSTW0puPJVw4gYFEPEJFRgIiMwiA2MHUhowARGYUR\n7G1qREYBIjIKmzOK9iKjABEZBYh4MyjP8M7FnBV9OzLKwzzxAq6xqOcxnngBK2QUNmSE70hGASIy\nClsxijYlowARGeUxnngRcz2uKxueeJgnXgSOR0+aaEtGeYYQwIJFPUBERrnJBvt3sKLvTUYBIjLK\nOtfi4QEyyvOs9OGMjLLCKPomXhhtT0Z5krzCv2QUICKjXLM2chpFn2RFvwfexQTvd3QVbk9kFFLL\naJ5PoAbS9nyBWbCiv+l2NNf+FydaY6ZRWHV1bS6IXPBDkn/teBR9Ycx86g93rnVlGmWnPhrNpdPp\npKRdySj/OB4OV070LqPoeTo/V7S1XCppV/aN8msPJ/np20f/irUNTzd+i7pklPt6nPd7+CHBEDLK\nX2uVUZ8XGEh3RUbhI+6VdOPD4YNklFvajKKzfSDL51VTl4xyOMxXmc18dH19e/2upG3IKP0N/CGh\npHsgo6xqP6LO8AEqaQMyyhQ1aezuBSUlrU5GaW6GjVxC2ZuM7t0MldmD2yXV2dJkFDaipF2ZO
Dgc\nzvb9fH0/tBlFJ5y1b9/mpctNYPbFHZ526mIXzk9T2gS0qK+Z1FegFhndi7VuXnAzt0+7G0qf+3Jk\ntKnvaP60c4dlnHBF/8XI2YyMFrd2VeL7HH3hVDWQbkBJO5HREZYn0MWvnMdxeard/t13aFzSeT6u\nOY6CN5DR+SyTenHCbXL+VS9p6YOnFvtGJ3M1moP2E7rBMDxCRrmlaEmnvbj0rOXnvuBXoz+L+kGc\nDTzGlaj5yeggy0tMs6r+Iml1runPz6Ke+2ot7dus6KlCRiezvKA0xyhSq6TNuGvJ5Czq53Nx0kzQ\n0C+lV/d1j/yLpf3MZHSE5dkwYmfoC+Yv6eSHR0sW9VCDpf20ZJTnVHyRtM2I2uKDaEhGedq0JW2T\nyx93X/5hBjLKK6YtKWxPRnnRbCW1XZRRZJTXzVbSZnxqq5BROjOKsgEZJTLJQNovlzbbF2L7PamB\ne/LvFrxZW5mTjPIG25R0Gc3zv7HfQEoVvvN4m/eG7HY0NziAgazoazGN8jYvz6RX1+Y9gsgeyChb\ne2HMfND8d06hJRnlnZYh+1w0HzwA+DTfcLzfeTpHXcGv+43thdFyTKO83/CEmUnZku339DTJ+wLY\nAxmFiVjRVySjtFVzIC13wMgordUsKcXIKM0VKqnLYkXJKP0VKikVySi7MH9JjaJ1yShAREbZi+ED\n6fHbwGPgE6wj2JfN1s5P3UzAir40bwZlXz70PtGN78DCVGSU3clLKpqcs5Rgpx4v6aejaUVfnWkU\nrhh+rz8K8WOQ/ZphDJzhGAiZRtmvmR8NTSF+ErJ3wx8NTXUyCuMfDU1pMgqHw6sl9WhoDl4bhccZ\nM7nKNAp/jX00NHXJKPyyXZQXyChAxI3yACIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCR\nUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCI\njAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJE\nZBQg8j9UN7R2PtxHmQAAAABJRU5ErkJggg==\n", 189 | "text/plain": [ 190 | "" 191 | ] 192 | }, 193 | "execution_count": 8, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | "source": [ 199 | "mol = neutralise.apply(mol)\n", 200 | "\n", 201 | "mol" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 9, 207 | "metadata": { 208 | "collapsed": false 209 | }, 210 | "outputs": [ 211 | { 212 | "name": "stderr", 213 | "output_type": "stream", 214 | "text": [ 215 | "[2016/May/13 12:36:57 DEBUG ] apply> mol = 'O=C(O)Cc1ccnc(O)c1'\n", 216 | "[2016/May/13 12:36:57 DEBUG ] apply> starting pass 1...\n", 217 | "[2016/May/13 12:36:57 DEBUG ] rule 2 '2-hydroxy pyridine -> 2-pyridone' applied on pass 1\n", 218 | "[2016/May/13 12:36:57 DEBUG ] ...total of 1 hits in pass: will continue...\n", 219 | "[2016/May/13 12:36:57 DEBUG ] apply> starting pass 2...\n", 
220 | "[2016/May/13 12:36:57 DEBUG ] ...total of 0 hits in pass: finished.\n" 221 | ] 222 | }, 223 | { 224 | "data": { 225 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAF7ElEQVR4nO3d25LaOBSGUTM17//K\nngtqCMENbfP7IG2tdZVKKgkH87EFavs2z/MEwLf+ufoGAPRNRgEiMgoQkVGAiIwCRGQUICKjABEZ\nBYjIKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjIKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjI\nKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjIKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjIKEBE\nRgEiMgoQkVGAiIwCRP69+gbQjdvt9vj1PM8X3hJoiozys+do3j2n83a7KSnceTEwTb9F891fcfDA\nZBod0LKYk0U6BAwUxX0xZm76xx0/4GVQyqHRfPc/OoQYnNdA9y7/Al1JGZzPRiu4tmLzPCspI3P0\n9+2Efq2cdpWUYZlG+cv5n65C70wQHcsHwH2jaSBlTKbRgRw9afqQlDHJaFmXLM+VlAE54nv1rlaP\nel74zCopQzGNVtNCv8ykDMX5RgEiMtql9me9+0B69a2AM8goR1FSBiGj/Wl/FH1QUkYgoxxLSSlP\nRnvUxyj6oKTUJqOdud2mThb0MAoZ5QwGUgqz/Z6TXLgn//IzW1NbN9/5MpVY0Z98gtQ7l4bmUKZR\nTrX7TLr1DCx+UJXdyShn+zpkLg1Nm2SURh13oj8DKftyMHWjwAejz15C5tLQ9Ms0yjVetkCdXzQz\nKXuxb5TDfdgxOv/vzNvz/L/bzUpORvtQbEU/NbOmVlJyMsqxGsklHEdG+yBExzGQEpJRDvThuntN\njahKSkJGYZqUlICMcrbWRtEHJeU7MspRms3lB0rKF2QUICKjnKr9EdVAylYy2pbl6/fxOx/+qEHt\n5/IDJWUTGW2O128LlJT1ZLQ589x9SXvZLvqZkrKSjMJbSsoaTpTXovtAupzbvKJH9PysPx8Ty0Ok\n3glsOiGjjfqxpMtXTYNqrOgfLj4t6ctBIJRNsqiHX1y2tP/xjbTNN8+xmUbbVekl0+ko+nD4TPpu\n5U4PZLRp3ZW091x+sE9J3z2d3/2zfR0cdcloW5avpsfvfPgjzrG1pI+PAv78hX2fsy4+LB+AjLKb\nYl8ufeHlI9RB7jUyCmvd3w+e3xWO7eZyu4Zv6psko/3xUrrW8xf3h8+bL5+Oe+KbNMpqq5gGS1p+\nRV/+DvI1+0a71N03+FCYjMLvjKJ8IKO9amogVRlGJqMda6qkA/ImwZ2M9q2FkpavSfk7SEhGu9dC\nSX+kPgxCRkmN2UpvEjzIaAUNDqRlKlPmjnAcGS2iwZLCIGS0jnl+/RFvQjZysYaMltLIJdjKVKbC\nfeB4Tk1SzYXXDmqh4Cco8ybBXmS0oHNKuoxmtbi8OQFMrTvJDmSUVepHE75leVLW1wPpj2vz4Y6T\nd+cibPAchVzNNFrWyqW9MRNCMlrW8ooXk2iuZBRlCxkt7mULlGjC7mS0puchVDrhULbfj2WQrZ0R\nK3o2ktGC7A+HM8noQOT1d0ZOtpNRWEFeeU9Gq3FSooiHiO1kFH5jFOUjGQWIyGgpVvSH8NDxkYwC\nRGS0DiMnXEJG65NXOJSM1qGUcAkZreLNphyjKBzNGZ5gmqZpej5pizcetpDREt7vD9eDVV4eQPvt\n2cKivjQ5WGP5KM3z5IyCrCajABEZ7Z/TDMOlZBQgIqMAERntnBV9bvmFkkePLWx4gkVJNZQtZLRn\nhqYdeST5lkV9RfIKJ5LRnmklNEBGyzGKwrlkFCAio+UYReFcMgoQkVGAiIwCRGQUICK
jABE/DNon\nPwAOzZDRDrlwELTEor43LhwEjZFRgIiMAkRkFCAiowARGe2NCwdBY2x46pALB0FLZLRP0gnNsKgH\niMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgo\nQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERG\nASIyChCRUYCIjAJE/gPNuLlnqo19NQAAAABJRU5ErkJggg==\n", 226 | "text/plain": [ 227 | "" 228 | ] 229 | }, 230 | "execution_count": 9, 231 | "metadata": {}, 232 | "output_type": "execute_result" 233 | } 234 | ], 235 | "source": [ 236 | "mol = rules.apply(mol)\n", 237 | "\n", 238 | "mol" 239 | ] 240 | } 241 | ], 242 | "metadata": { 243 | "kernelspec": { 244 | "display_name": "Python 3", 245 | "language": "python", 246 | "name": "python3" 247 | }, 248 | "language_info": { 249 | "codemirror_mode": { 250 | "name": "ipython", 251 | "version": 3 252 | }, 253 | "file_extension": ".py", 254 | "mimetype": "text/x-python", 255 | "name": "python", 256 | "nbconvert_exporter": "python", 257 | "pygments_lexer": "ipython3", 258 | "version": "3.5.1" 259 | } 260 | }, 261 | "nbformat": 4, 262 | "nbformat_minor": 0 263 | } 264 | -------------------------------------------------------------------------------- /standardiser/docs/Charge-separated_systems.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true, 8 | "slideshow": { 9 | "slide_type": "skip" 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "%run notebook_setup.py\n", 15 | "\n", 16 | "sys.path.append('../..')" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": { 23 | "collapsed": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "from standardiser import rules, neutralise, standardise\n", 28 | "\n", 29 | "from standardiser.rules_demo import rules_table, show_change" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 
| "outputs": [], 39 | "source": [ 40 | "for module in [rules, neutralise, standardise]: module.logger.setLevel('DEBUG')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": { 46 | "slideshow": { 47 | "slide_type": "slide" 48 | } 49 | }, 50 | "source": [ 51 | "# Rules for conjugated charge-separated systems\n", 52 | "\n", 53 | "### Introduction\n", 54 | "\n", 55 | "These rules address cases in which positive and negative formal charges are in conjugation, and the molecule can be neutralised _via_ successive rearrangement of adjacent double and single bonds.\n", 56 | "\n", 57 | "Some issues relating to these rules are discussed below." 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "### Possible redundancy of rules\n", 65 | "\n", 66 | "Recall that bonds to Group I & II metals are broken and a round of protonation/deprotonation-based neutralisation is done _before_ these rules are applied, and that a further round of neutralisation is carried out afterwards. The neutralisation step before rule application is worthy of note, as it can mean some rules for charge-separated systems are apparently redundant in some situations (unsubstituted analogues, in particular)." 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": { 72 | "slideshow": { 73 | "slide_type": "slide" 74 | } 75 | }, 76 | "source": [ 77 | "Below is an example of where the cation in the charge-separated species bears a proton and the `neutralise` module thus produces the same overall effect as the `rules` module..." 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "1) Application of _rules_ module..." 
85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 4, 90 | "metadata": { 91 | "collapsed": false 92 | }, 93 | "outputs": [ 94 | { 95 | "name": "stderr", 96 | "output_type": "stream", 97 | "text": [ 98 | "[2016/Mar/24 16:22:42 DEBUG ] rule 15 'Fix 1,3 charge-seperated systems (non-aromatic)' applied\n" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "smiles = 'CC[N-]C(C)=[NH+]C'\n", 104 | "\n", 105 | "HTML(show_change(smiles))" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "2) Application of _neutralise_ module (_i.e._ simple addition/removal of protons)..." 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 5, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [ 122 | { 123 | "name": "stderr", 124 | "output_type": "stream", 125 | "text": [ 126 | "[2016/Mar/24 16:22:42 DEBUG ] 1 positive/H, 0 positive/quat and 1 negative (of which 0 are acid) charges identified\n", 127 | "[2016/Mar/24 16:22:42 DEBUG ] Overall H balance: 0; formal charge: 0\n" 128 | ] 129 | }, 130 | { 131 | "data": { 132 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAESElEQVR4nO3dy1LCWBRA0dDV///L\n6QG2Ii8hOxAurFUOBCcZpLbn5rmb53kCYKl/tt4AgLHJKEAiowCJjAIkMgqQyChAIqMAiYwCJDIK\nkMgoQCKjAImMAiQyCpDIKEAiowCJjAIkMgqQyChAIqMAiYwCJDIKkMgoQCKjAImMAiQyCpDIKEAi\nowCJjAIkMgqQyChAIqMAiYwCJDIKkMgoQCKjAImMAiQyCpDIKEAiowCJjAIkMgqQyChAIqMAiYwC\nJDIKkMgoQCKjAImMAiQyCpDIKEAiowCJjAIkMgqQyChAIqMAiYwCJDIKkMgoQCKjAImMAiQyCpDI\nKEAiowDJv1tvAHyo3W73/fs8zxtuCZGMwmMd5vKQdL4NGYXVHATTpPlBZBTudmG+nA6CKZ0fREZh\nmqZpt5uOpsajbw7Tab7kkIzCl9OSHpJOLnHBE3yZ54urdbhCRgESGYUfBlIWkFH4RUm5l4wCJDIK\nxwyk3EVG4QyXN3G7nTvVAArTKEAio+BIKImbQbmJh2PCJTLKL7c8HHO3e6tD6tdvpYc/yeiHKs8S\nnuf5zUoKhYy+v7PFFME9oyidjL6PZ76swkAK32R0bBue+XnvkppSuZ2Mjm3bio1eUq1kFa4bJdmX\ndOutWJm8chcZBUhklGrQgfTSyGkU5V4yygrmeR4wpLAOGWUdYz2j08jJimQUfsgrC8goqxlqIB1m\nQ3l9MsqahijppWtdjaIsI6OsbIiSwopklM9y9bYr+WcJGWV9Iw6kQ9/VyrZklIcYsaSwjIzyKC9Y\n0ssnl4yiLCejAImM8kAvOJDC6mSUx3qdtbIVPQ8iowCJjPIRjKI8jpeI8J5GfAQqg/KvmLH9X8vj\naJ7u2AZPHsSOxWCOpsy79l8l5REs6nme00cofX9z5U/fH/dkkFfjFBNPtfiI5Tx//RSDvjaKFyej\nPNXmF+QrKauTUT6OkrIux0Z5tv1Aero8f2bZ9iV1uolVyCgbOFvS01NMMASLej6UpT1rkVG2sfm5\npklJWYmMsplXODKppHSOsoO7m0hMowCJjIKlPYmMwjQpKYGMwhclZRkZhR9KygIyCpDIKPxiIOVe\nMgrHlJS7yCicoaTcTkbhPCXlRjIKkMgoXGQg5RYyCtcoKX+SUfiDknKdjMLflJQrZBQgkVG4iYGU\nS2QUbqWknCWjcAfvGuGUjAIkMgqQyChAIqMAiYwCJDIKkMgoQCKjAImMAiQyCpDIKEAiowCJjAIk\nMgqQyChAIqMAiYwCJDIKkMgoQCKjAImMAiQyCpDIKEAiowCJjAIkMgqQyChAIqMAiYwCJDIKkMgo\nQCKjAImMAiQyCpDIKEAiowCJjAIkMgqQyChAIqMAiYwCJDIKkMgoQCKjAImMAiQyCpDIKEAiowCJ\njAIkMgqQyChAIqMAiYwCJDIKkMgoQCKjAImMAiQyCpDIKEAiowCJjAIkMgqQyChA8h+NJB2QMU3F\nJwAAAABJRU5ErkJggg==\n", 133 | "text/plain": [ 134 | "" 135 | ] 136 | }, 137 | "execution_count": 5, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | } 141 | ], 142 | "source": [ 143 | "neutralise.apply(Chem.MolFromSmiles(smiles))" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "For full-substituted species, this is not an issue, as the `neutralise` module will 
have no effect..." 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "1) Application of _rules_ module..." 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 6, 163 | "metadata": { 164 | "collapsed": false 165 | }, 166 | "outputs": [ 167 | { 168 | "name": "stderr", 169 | "output_type": "stream", 170 | "text": [ 171 | "[2016/Mar/24 16:22:42 DEBUG ] rule 15 'Fix 1,3 charge-seperated systems (non-aromatic)' applied\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "HTML(show_change(\"C[N-]C(C)=[N+](C)C\"))" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "2) Application of _neutralise_ module (has no effect here)..." 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 7, 189 | "metadata": { 190 | "collapsed": false 191 | }, 192 | "outputs": [ 193 | { 194 | "name": "stderr", 195 | "output_type": "stream", 196 | "text": [ 197 | "[2016/Mar/24 16:22:42 DEBUG ] 0 positive/H, 1 positive/quat and 1 negative (of which 0 are acid) charges identified\n", 198 | "[2016/Mar/24 16:22:42 DEBUG ] Overall H balance: 0; formal charge: 0\n" 199 | ] 200 | }, 201 | { 202 | "data": { 203 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAETUlEQVR4nO3dwXLaQBREUSbl//9l\nZYErcYwtZDrovRnOWVFs0Ma3ehCYsW3bBYBH/aq+AIC5yShAREYBIjIKEJFRgIiMAkRkFCAiowAR\nGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWI\nyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChA\nREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyCitjTGqLwHukFH6GmNs21Z9FXCHjAJEZJSmTFFmIaMA\nERmlI1OUicgoQERGaccUZS4yChCRUXoxRZmOjAJEZJRGTFFmJKN0McZFQ5mRjAJEZJQWxrhYokxK\nRgEiMko9U5SpyShAREYpZooyOxmlkoayABkFiMgoZUxR1iCjABEZpYYpyjJklBoayjJkFCAiowAR\nGQWIyChAREYBIjIKEJFRnmiM+8/A7GSU59JNlvdWfQEUGI+07Qcfl//40fpte6EvLPll09cko6+o\nz5/6SoXV0JflUM/TXQcprEpGOcNtScd4f+bPg6mZoq/MoZ4a1+ascajX0BdnjXISR3tWJaOc53ax\nLbDhTFFklCdaspvwiYzC40xRLjJKNxO9f6qhXMkoNb7LpTtRTEdGqTF7Lsdo9GUwasko7fQv7Bof\nd+V/kVHK9M8lHCGjdNS5sKYon8golTrnEg6SUZrqWVhTlFsySrGeuYTjZJR635W0W2FNUb4ko7S2\nbY/95AmcR0ZpodvwvGWK8h0Zpbtt2wxSOpNRutgZpOUlNUXZIaM04p1QZiSjzKF2kJqi7JBRetnJ\nZfnRHr4ko7Qjl8xFRpmJwtKQjNLR/tH+5IuBfTIKEJFRmnJ+ZxYySl8nl/T2pWScI2QU/tJNHvBW\nfQGw56FB+rN7UB9vWV2/kOomFj8io7Q2xnBrnuYc6uEf/f9lH91Yo/RVNUVvj/afwmof85GMwn26\nyQ6HepqqfVfU0Z7jZJSOOtxZqn59piGj8O62m0rKETJKOx2mKBwnowARGaUXU5TpyCiNaCgzklGA\niIzShSnKpGQUICKjtGCKMi8ZpZ6GMjUZBYjIKMVMUWYno1TSUBYgowARGaWMKcoaZBQgIqPUMEVZ\nhoxSQENZiYwCRGSUs5miLEZGASIyyqlMUdYjo5xHQ1mSjAJEZJSTmKKsSkYBIjLKGUxRFiajnEFD\nWZiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiM\nAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRk\nFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkR+A0/T\nA50hAPU2AAAAAElFTkSuQmCC\n", 204 | "text/plain": [ 205 | "" 206 | ] 207 | }, 208 | "execution_count": 7, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "neutralise.apply(Chem.MolFromSmiles(\"C[N-]C(C)=[N+](C)C\"))" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "The `neutralise` module will not attempt to charge balance these 
systems as they appear to be potential zwitterions; see the [documentation](02_neutralise.ipynb) for that module for further details." 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": { 227 | "slideshow": { 228 | "slide_type": "slide" 229 | } 230 | }, 231 | "source": [ 232 | "### Interaction of `neutralise` module and 'charge-separated' rules\n", 233 | "\n", 234 | "The pre-application of the `neutralise` module could conceivably lead to problems in aromatic systems where potentially undesirable imine species might be produced. However, it doesn't seem to be a problem in practice as the undesirable species are fixed by the subsequent application of other rules.\n", 235 | "\n", 236 | "This is illustrated below..." 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 8, 242 | "metadata": { 243 | "collapsed": false 244 | }, 245 | "outputs": [ 246 | { 247 | "data": { 248 | "image/png": 
Re9vr5LLkcko03RGSf9mc2PZkyTRFxnl\nywHhWk+cfze3sdXre/8LlLTGo+ApGeUI14D+PCl9lbSjXWUPbjFxhHl+edr1+6johYzCLoyo45BR\n2tXFQCqXyCgb7tu1fOXBt/bQRUkZnIyyrZ12KSmNk1G2NfUm0e5K6kp/KDLKh+2UuzZLKpdM1o3y\nwHdvbTqrZtcPklItWiOjPLJZ0pu/3t99enWxfbg/rTGijsZFPR+zrLF/Y7H9S1tp5OJeLrmSUZ5o\nJ1uLBneJkckoz700ch32qSSnl3RzFDWiDkhG2XDfgeUrD751sBZKCpOM0jUlpQUySt+UlNPJKN1r\np6ReGB2TdaNUcPp60gbfYcVhZJQijv0gqdtoGkJHJqPw3Cqbl0k0+Z+Mwq37C/RVNgWUWzIKX5Z6\nmjV5iYzCF/XkPRY8UdDBn3TC4GSUmnSTw8goNbWzJp/yZBQgIqOUZSDlGO7UU9n9m0RvwuruPDkZ\nZSy6yce5qKc4l/bsTUapzwTKrvx6RICIaRQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMA\nERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkF\niMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgo\nQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJE/gDTNuhw7h1BTgAAAABJ\nRU5ErkJggg==\n", 249 | "text/plain": [ 250 | "" 251 | ] 252 | }, 253 | "execution_count": 8, 254 | "metadata": {}, 255 | "output_type": "execute_result" 256 | } 257 | ], 258 | "source": [ 259 | "mol = Chem.MolFromSmiles(\"[n-]1c(=[NH+]C)cccc1\")\n", 260 | "\n", 261 | "mol" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "The initial application of the `neutralise` module removes the charges _via_ protonation/deprotonation to give a potentially undesirable hydropyridine-imine..." 
269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 9, 274 | "metadata": { 275 | "collapsed": false 276 | }, 277 | "outputs": [ 278 | { 279 | "name": "stderr", 280 | "output_type": "stream", 281 | "text": [ 282 | "[2016/Mar/24 16:22:42 DEBUG ] 1 positive/H, 0 positive/quat and 1 negative (of which 0 are acid) charges identified\n", 283 | "[2016/Mar/24 16:22:42 DEBUG ] Overall H balance: 0; formal charge: 0\n" 284 | ] 285 | }, 286 | { 287 | "data": { 288 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAFHklEQVR4nO3d0XaaQBSGUejq+78y\nvbA1VIlJ+AFnzux91aarCSzhy0FHnJdlmQDY69e7NwCgbzIKEJFRgIiMAkRkFCAiowARGQWIyChA\nREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYB\nIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIK\nEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFR\ngIiMAkRkFCAiowCR3+/eAGjUPM+3PyzL8t4toXEyCh/u6ZxW9ZznWUl5wfHB0NbdnD4fPJWUFxwc\njOWb3dz8j04WNjkyqG/zUn3f93G+8MxhQUG7R87vfGenDA8cE1RwXjc3f5azhjUHBL066lJ93492\n4nBnwROduWA5pxWj/IiM0p8z6vaj2XZZFgMpdw4FenJsvMKnBZSUG9MoYznwGVUzKTcySn3nvRil\npEwySlWXvY6vpHj46caXtbIEircwjdK9FtYnmUlHJqN0r5F4Kemw3P2ePnRRqFtJ370VXE1G4UhK\nOiAZhYMp6WhklA50cUW/pqRDkVE4hZKOQ0YBIjIKZzGQDkJGaV13T4yuKekIZLQg521TlLQ8GS3I\nedsaj0htMlpTmfO26yv6tTKPCM9ktCznbWs8IlXJaGXOW7iAjNKufq/oN3979bs7vCajxRlI4Wwy\nWp+SXszUORoZHYKSwnlkdBTdlbTYTFdsd1iT0YF0V9IeyeWAZHQsy7IIKRxLRoezLFP7JS020xXb\nHR7I6IjaKek8z55noHc+YHlQt5K+ZUJad7PYjGbqHJOMcoUfpbNYjIrtDs9kdFxnD6T/yjlP5aZO\nWJPRoR1e0vXznP++7SgBNXUOS0ZHl5d0K50wEBllZ0nv9Tw2ncVmumK7wyYZZZp2lfTwOHS98kku\nRyaj/HV9BB66ec+QJNEXGeU6n3Xzwe29/wVKWmMv+JKMcq59y576KmlHm8oZZJTjHbLsqa+SMjIZ\n5RiWPT3wO2AcMsp+Z6ezi4G0/S3kbDLKhufFTw9fuQX0gnp0UVIG50Z5bHu9iHNZrrtyd9N+Giej\nbGvnnqRThyU1QQ9FRulDmyWVSyYZ5YWmBtLJB0nRKhnllfZK2tb2bDKijkZG6Uw7JZVLbmSUL7ST\nrbsGN4mRyShfM3Jt2hxFjagDklE2PHegtTIYSGmHjNIrJaURMkrHlJQWyCh9a6qknhgdk1uT0L3D\nPyZ6hwbfYcVlZJQK2vkgKQYko/Atq2z+7ANRKE9GYcPzNfoqmwLKf2QUPtzradbk+2QUPqgnO1jw\nRDXP1+P3r7z4J9hNRil
IHLmSjFJQU2vyKU9GASJeYqKmz97aZErlcDJKWZslffirqpJzUQ8QkVEq\n81oTF5BRirOinrO5PSJAxDQKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAi\nowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowAR\nGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWI\nyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIn8ATkDsgY76o1cAAAAASUVORK5CYII=\n", 289 | "text/plain": [ 290 | "" 291 | ] 292 | }, 293 | "execution_count": 9, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "neutralise.apply(mol)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "However, this is then fixed by application of the appropriate 'hydropyridine-imine -> aminopyridine' transform during the rule-application step, so the desired parent is actually obtained..." 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 10, 312 | "metadata": { 313 | "collapsed": false 314 | }, 315 | "outputs": [ 316 | { 317 | "name": "stderr", 318 | "output_type": "stream", 319 | "text": [ 320 | "[2016/Mar/24 16:22:42 DEBUG ] Starting fragment 1 'C[NH+]=c1cccc[n-]1'...\n", 321 | "[2016/Mar/24 16:22:42 DEBUG ] 1) Check for non-organic elements...\n", 322 | "[2016/Mar/24 16:22:42 DEBUG ] 2) Attempting to neutralise (first pass)...\n", 323 | "[2016/Mar/24 16:22:42 DEBUG ] 1 positive/H, 0 positive/quat and 1 negative (of which 0 are acid) charges identified\n", 324 | "[2016/Mar/24 16:22:42 DEBUG ] Overall H balance: 0; formal charge: 0\n", 325 | "[2016/Mar/24 16:22:42 DEBUG ] 3) Applying rules...\n", 326 | "[2016/Mar/24 16:22:42 DEBUG ] apply> mol = 'CN=c1cccc[nH]1'\n", 327 | "[2016/Mar/24 16:22:42 DEBUG ] apply> starting pass 1...\n", 328 | "[2016/Mar/24 16:22:42 DEBUG ] rule 7 'hydropyridin-2-imine -> 2-amino-pyridine (N-subst.)' applied on pass 1\n", 329 | "[2016/Mar/24 16:22:42 DEBUG ] ...total of 1 hits in pass: will continue...\n", 330 | "[2016/Mar/24 16:22:42 DEBUG 
] apply> starting pass 2...\n", 331 | "[2016/Mar/24 16:22:42 DEBUG ] ...total of 0 hits in pass: finished.\n", 332 | "[2016/Mar/24 16:22:42 DEBUG ] 4) Attempting to neutralise (second pass)...\n", 333 | "[2016/Mar/24 16:22:42 DEBUG ] 0 positive/H, 0 positive/quat and 0 negative (of which 0 are acid) charges identified\n", 334 | "[2016/Mar/24 16:22:42 DEBUG ] Overall H balance: 0; formal charge: 0\n", 335 | "[2016/Mar/24 16:22:42 DEBUG ] 5) Checking if frag is a salt/solvate...\n", 336 | "[2016/Mar/24 16:22:42 DEBUG ] ...fragment kept.\n" 337 | ] 338 | }, 339 | { 340 | "data": { 341 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAFIUlEQVR4nO3d0XKiShSGUZia939l\n5sIzHkeFqD9C7+61riZJVcQEvuwWcOZlWSYAPvXr7A0AqE1GASIyChCRUYCIjAJEZBQgIqMAERkF\niMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgo\nQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERG\nASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIy\nChCRUYCIjAJEZBQg8vvsDYBGzfN8+ceyLOduCY2TUfjfNZ3TTT3neVZSNtg/GNptN6f1wVNJ2WDn\nYDh/yzlP7yzYlZQ19gyGcDt0frzLKylP2S3o1i7pfPieDhnu2SfoyjfS+fAQjhr+4Uw9NczzfRbv\nPnMJqL5xPJffU8a/J9XvLcvODZ1XHm9ZlrUvMSYZpYxl+aGkez/cai6VlFsyCquUlFfIKJUcPJBO\nSsoLZJRilJTWyCj8TEnZIKPUc/xAuk1JByejlHT89aHbrVTSkbkfA96wfQuTG5zGZBqFN5hJeSSj\n1NBOnZSUOzIKb1NSbskoBTy+L8nplJQrGYUPaSUXMgqfc1k+k4xSVyONUlJklNY1+MLoHSUdnIx2\naITjtv22Xilp92S0Q47b4zlxPzIZ7VM3x203U2c3vxEeyWi3HLfHU9IxyWjPej1uWx5Re/2Zs0FG\nYWdrJfX+T72S0c6VHo5anjq3yeVQZLR/pUsK7ZPRIZQt6dOlcdURlV7J6CjKlbSzVxI7ezrcktGB\nlCvpig6eAl2R0bEsy1I6pGY6GiSjw2ntfyd+qrNcdvZ0uCOjIypRUqhCRgelpLAXGaWMokvjopvN\n62R0XM0OpLpDLTI6tGZLCoXI6OiqlNSISrNklLZK2lkuO3s6PPX77A2gCZeSnnu8b99hJUY0S0b5\nz8GZeozmNZQmOGqxv3KceZ6ud8Rv73h9lLSPZ8GPTKN81+3QuSzTNL2Ulcu7qGgQJcgoX3Gt58cl\nVFKqsJuys0tA99qt6pa07pbzLhc8sbNl2fNsVS/vkUrPZJQnHsN1/czGl75ESWmcjPKccMGLZJTn\nmrq1qdxA6oXRocgoNZQrKeNwwROr1u4QPatmLoGiTTLKlqclvfvwyKqWKGn7W8i+LOopppHV/fzX\n2RvC+Uyj/KCpc00XZ82kt9E0b3Ilo/xs5JK+m04r+gH5lVPYN5p1t05/9/vL6IBMoxS21
0xqtU5C\nRqnt45JKJ3uxAKEHL5b02+m0oh+TaZQebM+k13p+r3GufBqZP5704+Bh0MsCXJhG4VXhSXx6ZRql\nK/sOpLrJK2SU3oQltVTnXTJKh94qqZGTkIzSp1dO3F84BAjJKN26LamlOt8jo/TsgCtGQUYBIt62\nGSAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAi\nowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowAR\nGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWI\nyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEPkDnsUdda4XbhQAAAAASUVORK5C\nYII=\n", 342 | "text/plain": [ 343 | "" 344 | ] 345 | }, 346 | "execution_count": 10, 347 | "metadata": {}, 348 | "output_type": "execute_result" 349 | } 350 | ], 351 | "source": [ 352 | "standardise.apply(Chem.MolFromSmiles(\"[n-]1c(=[NH+]C)cccc1\"))" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "Note that this is as would be obtained by application of the rules in the absence of the neutralisation step..." 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 11, 365 | "metadata": { 366 | "collapsed": false, 367 | "slideshow": { 368 | "slide_type": "-" 369 | } 370 | }, 371 | "outputs": [ 372 | { 373 | "name": "stderr", 374 | "output_type": "stream", 375 | "text": [ 376 | "[2016/Mar/24 16:22:42 DEBUG ] rule 16 'Fix 1,3 charge-seperated systems (aromatic 1)' applied\n" 377 | ] 378 | } 379 | ], 380 | "source": [ 381 | "HTML(show_change(\"[n-]1c(=[N+](C)C)cccc1\"))" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": { 387 | "slideshow": { 388 | "slide_type": "skip" 389 | } 390 | }, 391 | "source": [ 392 | "### Aromatics and imines\n", 393 | "\n", 394 | "In some cases it is not clear whether, when the starting structure contains an amide anion, that the uncharged imine produced is more or less desirable that the original zwitterion or than the cationic methyl-pyridine that would be produced by simple protonation of the amide anion. 
\n", 395 | "\n", 396 | "Examples are shown below; note that the `neutralise` module does not touch these molecules as it perceives them to be zwitterionic..\n" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 12, 402 | "metadata": { 403 | "collapsed": false, 404 | "slideshow": { 405 | "slide_type": "skip" 406 | } 407 | }, 408 | "outputs": [ 409 | { 410 | "name": "stderr", 411 | "output_type": "stream", 412 | "text": [ 413 | "[2016/Mar/24 16:22:42 DEBUG ] rule 17 'Fix 1,3 charge-seperated systems (aromatic 2)' applied\n" 414 | ] 415 | } 416 | ], 417 | "source": [ 418 | "HTML(show_change(\"C[n+]1c([N-](C))cccc1\"))" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 13, 424 | "metadata": { 425 | "collapsed": false 426 | }, 427 | "outputs": [ 428 | { 429 | "name": "stderr", 430 | "output_type": "stream", 431 | "text": [ 432 | "[2016/Mar/24 16:22:42 DEBUG ] rule 20 'Fix 1,5 charge-seperated systems (aromatic 2)' applied\n" 433 | ] 434 | } 435 | ], 436 | "source": [ 437 | "HTML(show_change(\"C[n+]1ccc([N-]C)cc1\"))" 438 | ] 439 | }, 440 | { 441 | "cell_type": "markdown", 442 | "metadata": {}, 443 | "source": [ 444 | "As an alternative to attempting to neutralise these molecules, it might be preferable (in some cases?) to protonate the amide anion and return a cationic parent." 
445 | ] 446 | } 447 | ], 448 | "metadata": { 449 | "kernelspec": { 450 | "display_name": "Python 3", 451 | "language": "python", 452 | "name": "python3" 453 | }, 454 | "language_info": { 455 | "codemirror_mode": { 456 | "name": "ipython", 457 | "version": 3 458 | }, 459 | "file_extension": ".py", 460 | "mimetype": "text/x-python", 461 | "name": "python", 462 | "nbconvert_exporter": "python", 463 | "pygments_lexer": "ipython3", 464 | "version": "3.5.1" 465 | } 466 | }, 467 | "nbformat": 4, 468 | "nbformat_minor": 0 469 | } 470 | -------------------------------------------------------------------------------- /standardiser/docs/Hydroxy_pyridine_within_ring.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%run notebook_setup.py\n", 12 | "\n", 13 | "sys.path.append('../..')" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "from standardiser import rules\n", 25 | "\n", 26 | "from standardiser.rules_demo import rules_table, show_change" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "rules.logger.setLevel('DEBUG')" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### 4-hydroxy pyridines\n", 45 | "\n", 46 | "If the rule '[4-hydroxy pyridine -> 4-pyridone (within-ring)](03_rules.ipynb#4-hydroxy_pyridine_-_4-pyridone_within-ring)' is not enabled, the more general rule '[4-hydroxy pyridine -> 4-pyridone (any)](03_rules.ipynb#4-hydroxy_pyridine_-_4-pyridone_any)' alone can give undesirable effects.\n", 47 | "\n", 48 | "The following molecule is an example (note that the 'within-ring' 
version of the rule is temporarily disbled for this demo)..." 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAFzElEQVR4nO3dbW6jMBiF0TDq/rfM\n/LA0qvIJvGBfZ85ZQJRW7ZPrQpplXdcbAEf9Gf0EAOYmowAlMgpQIqMAJTIKUCKjACUyClAiowAl\nMgpQIqMAJTIKUCKjACUyClAiowAlMgpQIqMAJTIKUCKjACUyClAiowAlMgpQIqMAJT+jnwBfYlmW\n6x7cx4CTTEY5wbIsl5bu6seHCod6gBIZparDVFzX9dI/GkCFjAKUyCglT6dofTk+PoJBSiwZ5WSn\nnPFFk4nIKMd1voCurWSSUQ56dZw/K6yiySxklJloK4FklCOunqKNaDIFGWUy2koaGWW3PlO0EU3y\nySj79Gxo81hSbSWKjDIlJSWHjLJD/ynaiCbJZJRZaSshZJStRk3RRjSJJaNMTFtJIKNsMnaKNqJJ\nJhnls4SGNm5+IpCMApTIKB/kTNHGICWNjAKUyCjvpE3RxiAliowClMgoL2VO0cYgJUfErwSxQqK5\nxURPlS9jjQKUyCjvzHJSNkUZSEYBSmSUD/IHqSnKWDLKZ/klhYFklLmZogwno2xikMIrMsrETFES\nyChbpQ1SDSWEjLJDWkkhgYwyJVOUHDLKPgYp3JFR5mOKEkVG2c0ghd9klCMGltQUJY2MMhMNJZCM\ncpCjPTQyyjRMUTLJKMcZpHCTUYq6ldQUJZaMApTIKFUdBqkpSjIZBSjxIs85Lh2kfkpJ9jP6CfAN\nrj50O9STzKEeoERGqeowFd2gSjIZBSiRUUqeTtH6cnx8BIOUWDLKyU4544smE5FRjut8AV1bySSj\nnOnEsIoms5BRDlqWAXfFayuBZJQjluX2mNDTz/iiyRRklMloK2lklN1eTNFLzviiST4Z5QRPw3qW\nx5JqK1FklH0uLeZ2SkoOGaWqQ1hFk2Qyyg4hU7TRVkLIKCXdwiqaxJJRtoqaoo22kkBG2erFTU49\nn4BokkhGOa7/OHXzE4FkFKBERpmMQUoaGQUokVHuLcvt97YL3HldB2n7drz/pgR+j+jI59TDa3f3\nIgTe80UAGeWJdU0vRpufv/+n1PZB+vnLSv7KySOjzOru//J1/mAoB3n+kVGeyx+kg90d9vmPucTE\nN+g9ReEXGeWlNkiB9xzqmd6FU/TulcTg5RlHIabnRM9YDvVslXnA11CGk1G28qdSeEpGmZgpSgIZ\nZYeoQaqhhJBRgBIZZZ+QQWqKkkNGAUpklN2GD1JTlCgyClAioxyxrrdRH39kipJGRjloyAfJaSiB\nZBSgREY5rvMgNUXJJKMAJTJKSbdBaooSS0YBSmSUqg6D1BQlmYwClHiR5xyXDlI/pSTzkXac4OpD\nt0M9yRzqAUpklKoOU3HIG09hIxkFKJFRSp5O0fpyfHwEg5RYMsrJTjnjiyYTkVGO63wBXVvJJKMc\n9Oo4f1ZYRZNZyCgz0VYCyShHXD1FG9FkCjLKZLSVNDLKbn2maCOa5JNR9unZ0OaxpNpKFBllSkpK\nDhllh/5TtBFNkskos9JWQsgoW42aoo1oEktGmZi2kkBG2WTsFG1Ek0wyymcJDW3c/EQgGQUokVE+\nyJmijUFKGhkFKJFR3kmboo1BShQZBSiRUV7KnKKNQUqOiF8JYoVEc4uJnipfxhoFKJFR3pnlpGyK\nMpCMApTIKB/kD1JTlLFklM+SS6qhDCejACUyy
iaZg9QUJYGMMqtluWkoCWSUrTIHKQwno+yQU9Jl\nuVmihJBRgBIZZZ+EQWqKEkVGAUpklN3GDlJTlDQyyhEJR3sIIaPMxBQlkIxy0JBBqqEE+hn9BPgq\nn8L6roISyaS8JZkS72oHh3pKXGsCGQUokVGqDFL+czJKqGW5/Y6zUBPLlXpOsHmQfr4Y5XoV05FR\nznHF9fp1db89E3CoByiRUaK1QQrJZBSgREZJZ5ASTkYBSrwhGqDEGgUokVGAEhkFKJFRgBIZBSiR\nUYASGQUokVGAEhkFKJFRgBIZBSiRUYASGQUokVGAEhkFKJFRgJK/QDrntt7HHe8AAAAASUVORK5C\nYII=\n", 61 | "text/plain": [ 62 | "" 63 | ] 64 | }, 65 | "execution_count": 4, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "mol = Chem.MolFromSmiles(\"Oc1c2ccccc2nc2ccncc12\")\n", 72 | "\n", 73 | "mol" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 5, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [ 83 | { 84 | "name": "stderr", 85 | "output_type": "stream", 86 | "text": [ 87 | "[2016/Mar/24 16:26:11 DEBUG ] apply> mol = 'Oc1c2ccccc2nc2ccncc12'\n", 88 | "[2016/Mar/24 16:26:11 DEBUG ] apply> starting pass 1...\n", 89 | "[2016/Mar/24 16:26:11 DEBUG ] rule 4 '4-hydroxy pyridine -> 4-pyridone (any)' applied on pass 1\n", 90 | "[2016/Mar/24 16:26:11 DEBUG ] ...total of 1 hits in pass: will continue...\n", 91 | "[2016/Mar/24 16:26:11 DEBUG ] apply> starting pass 2...\n", 92 | "[2016/Mar/24 16:26:11 DEBUG ] ...total of 0 hits in pass: finished.\n" 93 | ] 94 | }, 95 | { 96 | "data": { 97 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAF0klEQVR4nO3dzXLaShSFUSvl939l\nZaDULS4W1s+RunfDWiN7klAp52O3hGGa5/kLgLP+9H4AAGOTUYASGQUokVGAEhkFKJFRgBIZBSiR\nUYASGQUokVGAEhkFKJFRgBIZBSiRUYASGQUokVGAEhkFKJFRgBIZBSiRUYASGQUokVFCTdPXNP3v\n26cvVr+F9r57PwDexLSrZ9uf5u0DvxmOjHKBaZrmG/o3z1/TJKykk1HG4yBPFBml6qYpulgdpI/f\nSirducUEUCKjlKxO0X23m/ZaBinEklEudusZHwK5Nsp5d18V3fz657fQnjXKSa+O87dOUad7Asko\nI3GdlEAyyhldpihkklEGY5CSRkY5zBSFRzLKBRo31CAlioxyjNUJT2SUA3KO8wYpOWQUoERG2Stn\nii4MUkLIKAOb54vfBgVOkFF2SZuikENG2Zbc0HmeDVL6klGAEhllQ/IUXRik9CWjACUyym/yp+jC\nIKUjGWVUukkIGeWl5Cn682EYpPQio7y0GqaEhq4K6TsfSEYZj2ISRUb5zSgnZWGlIxllMIpJGhll\nQ/4gFVb6klG25ZRUMQkkowwj+QVYfDIZZZecQQppZJQxmKLEklH2ShukGkoIGeWAXiVVTJLJKEMS\nVnLIKMe0H6SKSTgZJZo7S+STUQ5Lu9cEfckoZ7QpqSnKEGSUkWgogWSUk+4epIrJKGSUYQgrmWSU\n8+4bpIrJQGSUkptK6s4SA5FRgBIZparBi59MUZLJKECJJ3mucesg9VNKsu/eD4B3cPeh26GeZA71\nACUySlWDqejNUEgmowAlMkrJqzdhuvwvMkiJJaNczO0gPo2Mcl7jYhqkZJJRrmSK8oFklJO6FNMg\nJZCMcsY0eRMm+EdGGYxBShoZ5bBp+vo5Ok1RPpaMcoHVM/59DFKiyCjHrE7R9pSUHDJKVUhYoRcZ\n5YCoYhqkhJBRSqLCCl3IKHsFFtMgJYGMstfai5ziwgrtySjnJTTUIKU7GQUokVGGZ5DSl08G5X3s\njOn2pYiEqxWMw+9B82xp0X8/F6PcR7rrl/of0zzEPwTNWaPw2tNzyChPKbTl2igr5vlruIuNrpDS\ni4zyJrxTH7041LNuGaRjdcktJrqQUd6BKUpHDvW8NOIVUmjPGmV4N07Rp2cSg5c1Msqzx1bohn8C\nNjnUs1fmAd9VUbqTUfZyqRRWySgDM0VJIKMcEDVINZQQMgpQIqMcEzJITVFyyChAiYxyWPdBaooS\nRUYBSmSUMzoOUlOUNDLKSfO8943pLqShBJJRgBIZ5bzGn9thipJJRgFKZJSSZoPUFCWWjAKUyChV\nDQapKUoyGQUo8STPNW4dpH5KSeazmLjA3Yduh3qSOdQDlMgoVQ2mYuPX+cMhMgpQIqOUrE7R+nL8\n+ScYpMSSUS52yRlfNBmIjHJe4xvo2komGeWkV8f5q8IqmoxCRhmJthJIRjnj7im6EE2GIKMMRltJ\nI6Mc1maKLkSTfDLKMS0buvhZUm0liowyJCUlh4xyQPspuhBNkskoo9JWQsgoe/WaogvRJJaMMjBt\nJYGMskvfKboQTTLJKNsSGrrw4icCyShAiYyyIWeKLgxS0sgoQImM8pu0KbowSIkiowAlMspLmVN0\nYZCSI+K/BLFCornHQA+VN2ONApTIKL8Z5aRsitKRjAKUyCgb8gepKUpfMsq2/JJCRzLK2ExRupNR\ndjFI4RUZZWCmKAlklL3SBqmGEkJGOSCtpJBARhmSKUoOGeUYgxSeyCjjMUWJIqMcZpDCIxnljI4l\nNUVJI6OMREMJJKOc5GgPi+/eD4B3c2Fbn4anKUomGeW8ZZA+pU3p+DQO9ZQ0O9qbosSSUYASGaWq\nwSA1RUkmowAlnuS5xq2D1E8pyWQUoMShHqBERgFKZBSgREYBS
mQUoERGAUpkFKBERgFKZBSgREYB\nSmQUoERGAUpkFKBERgFKZBSgREYBSmQUoERGAUpkFKBERgFKZBSgREYBSv4Cx0LOquVXzj0AAAAA\nSUVORK5CYII=\n", 98 | "text/plain": [ 99 | "" 100 | ] 101 | }, 102 | "execution_count": 5, 103 | "metadata": {}, 104 | "output_type": "execute_result" 105 | } 106 | ], 107 | "source": [ 108 | "# Temporarily remove rule '4-hydroxy pyridine -> 4-pyridone (within-ring)', then apply rules...\n", 109 | "\n", 110 | "original_rules = rules.rule_set\n", 111 | "\n", 112 | "rules.rule_set = [x for x in original_rules if x['name'] != '4-hydroxy pyridine -> 4-pyridone (within-ring)']\n", 113 | "\n", 114 | "rules.apply(mol)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "Note that the 'across-ring' product had been produced, instead of the desired 'within-ring' product. \n", 122 | "\n", 123 | "If the restrictive 'within-ring' version of the rule (which is applied before the more general version) is re-enabled, the desired product is obtained..." 
124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 6, 129 | "metadata": { 130 | "collapsed": false 131 | }, 132 | "outputs": [ 133 | { 134 | "name": "stderr", 135 | "output_type": "stream", 136 | "text": [ 137 | "[2016/Mar/24 16:26:11 DEBUG ] apply> mol = 'Oc1c2ccccc2nc2ccncc12'\n", 138 | "[2016/Mar/24 16:26:11 DEBUG ] apply> starting pass 1...\n", 139 | "[2016/Mar/24 16:26:11 DEBUG ] rule 3 '4-hydroxy pyridine -> 4-pyridone (within-ring)' applied on pass 1\n", 140 | "[2016/Mar/24 16:26:11 DEBUG ] ...total of 1 hits in pass: will continue...\n", 141 | "[2016/Mar/24 16:26:11 DEBUG ] apply> starting pass 2...\n", 142 | "[2016/Mar/24 16:26:11 DEBUG ] ...total of 0 hits in pass: finished.\n" 143 | ] 144 | }, 145 | { 146 | "data": { 147 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAFt0lEQVR4nO3d4W7iOhSFUTLq+79y\n5kekClEosU8Sb9O1HqCXkXo/tp22LOu63gDo9W/0CwCYm4wClMgoQImMApTIKECJjAKUyChAiYwC\nlMgoQImMApTIKECJjAKUyChAiYwClMgoQImMApTIKECJjAKUyChAiYwClMgoQImMApR8jX4BfIhl\nWc774j4GnGQyygGWZTm1dGd/fahwqAcokVGqLpiK67qeemkAFTIKUCKjlDydovXl+PMrGKTEklEO\ndsgZXzSZiIzS7+IH6NpKJhml06vj/FFhFU1mIaPMRFsJJKP0OHuKbkSTKcgok9FW0sgoza6ZohvR\nJJ+M0ubKhm5+llRbiSKjTElJySGjNLh+im5Ek2Qyyqy0lRAyyl6jpuhGNIklo0xMW0kgo+wydopu\nRJNMMsp7CQ3d+OEnAskoQImM8kbOFN0YpKSRUYASGeU3aVN0Y5ASRUYBSmSUlzKn6MYgJUfE/xLE\nConmHhO9VD6MNQpQIqP8ZpaTsinKQDIKUCKjvJE/SE1RxpJR3ssvKQwko8zNFGU4GWUXgxRekVEm\nZoqSQEbZK22QaighZJQGaSWFBDLKlExRcsgobQxSeCCjzMcUJYqM0swghXsySo+BJTVFSSOjzERD\nCSSjdHK0h83X6BfApzmwrQ/D0xQlk4zSbxukD2lTOv4ah3pKLjvam6LEklGAEhml6oJBaoqSTEYB\nSrzJc4xTB6nvUpJ5Us8Bzj50O9STzKEeoERGqbpgKvqNKZLJKECJjFJy2a2lQUosGQUokVH6XfwA\n3SAlk4wClMgonZZlwE/FG6QEklF6LMvNj8PDRkaZjEFKGhmlmSkK92SU+RikRJFR2oRMUSUlh4wC\nlMgoDUKm6MYgJYSMApTIKHtFTdGNQUoCGWWvtIZCCBllbgYpw8koQImMMj2DlLF8MiifY2dM39/x\nugamhc+t5dHWou/vi+8H9A9P6tMe3J/1N6Tv0xz1DyaGNQqvhb91kMHdKE+s6226y0Y3pIxijdIg\nOVMXfzAUfJNRntsG6UOXHg64aTxiYggZ5ROYogzkbpSXZrwhhetZo0zvxCn68E5i8PKMoxDTc6Jn\nLId69so84Gsow8koe7kqhadklImZoiSQURpEDVINJYSMAp
TIKG1CBqkpSg4ZBSiRUZoNH6SmKFFk\nFKBERumxrnv/nNLhTFHSyCidhvyZZA0lkIwClMgo/S4epKYomWQUoERGKblskJqixJJRgBIZpeqC\nQWqKkkxGAUq8yXOMUwep71KS+Ug7DnD2oduhnmQO9QAlMkrVBVNxyC+ewk4yClAio5RcdmtpkBLL\nIyYOdmDsPFZiCh6A0u/6B+ge2RPIoZ5OigYbGWUmbkgJJKP0MEXhm4wyGYOUNDJKM1MU7skobRIa\napASRUaZkpKSQ0ZpkDBFIY2MMiuDlBAyyl6mKDwlo0zMICWBjLKLKQqvyCjvJTfUIGU4GQUokVHe\nSJ6iG4OUsWQUoERG+U3yFL1foAYpA8kobUJqldx3/hoZ5aWnqYrdfbEvjI8no7wUG6anfbdPGUVG\naRabVxhCRvlNYDFNUdLIKD0C8wqjyChvRBXTFCWQjPLe05IOyquGEkdG6XdxSZflJpgEklF2iTra\nfzNFSSCjlFyW159TdFluGkoCGWWvzEEKw8koDUY9a3oxRU/9b8JeMgpQIqO0uX6QmqKEk1GOcV5J\nFZNwMkqzsc+aTFHSyCg9kn6vCQaTUWZiihJIRuk0ZJBqKIFkFKBERunnhhRuMkrRq5IOeTEwhIwC\nlMgoVU7x/HEySqhlud3HWaiJ9TX6BfAJdg/S93emrlWZjoxyjDMeK62rn7dnAg71ACUySrRtkEIy\nGQUokVHSGaSEk1GAEh/zDVBijQKUyChAiYwClMgoQImMApTIKECJjAKUyChAiYwClMgoQImMApTI\nKECJjAKUyChAiYwClMgoQMl/PCyxplQ+TjEAAAAASUVORK5CYII=\n", 148 | "text/plain": [ 149 | "" 150 | ] 151 | }, 152 | "execution_count": 6, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "# Restore rule '4-hydroxy pyridine -> 4-pyridone (within-ring)', then apply rules...\n", 159 | "\n", 160 | "rules.rule_set = original_rules\n", 161 | "\n", 162 | "rules.apply(mol)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "Note that the desired 'within-ring' product has been produced, instead of the 'across-ring' product." 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "An example of where this problem manifested is [CHEMBL348887](https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL348887).\n", 177 | "\n", 178 | "Note that if only the more general rule is enabled, the problem might not be obvious. This is because which substructure (_i.e._ the within-ring or across-ring version) is affected is arbitrary, depending on the order of the atoms in the input molecule. 
" 179 | ] 180 | } 181 | ], 182 | "metadata": { 183 | "kernelspec": { 184 | "display_name": "Python 3", 185 | "language": "python", 186 | "name": "python3" 187 | }, 188 | "language_info": { 189 | "codemirror_mode": { 190 | "name": "ipython", 191 | "version": 3 192 | }, 193 | "file_extension": ".py", 194 | "mimetype": "text/x-python", 195 | "name": "python", 196 | "nbconvert_exporter": "python", 197 | "pygments_lexer": "ipython3", 198 | "version": "3.5.1" 199 | } 200 | }, 201 | "nbformat": 4, 202 | "nbformat_minor": 0 203 | } 204 | -------------------------------------------------------------------------------- /standardiser/docs/Miscellaeny.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Miscellany" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "%run notebook_setup.py\n", 19 | "\n", 20 | "sys.path.append('../..')" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": { 27 | "collapsed": true 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "from standardiser import rules" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "rules.logger.setLevel('DEBUG')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "### Fix 1,7 conjugated cation (aromatic 4)\n", 50 | "\n", 51 | "The following molecule fails, despite its similarity to the [1,5 conjugated cation](https://neo4j-vm.windows.ebi.ac.uk:9999/notebooks/standardiser/standardiser/docs/03_rules.ipynb#Fix_1_5_conjugated_cation_aromatic_3) case..." 
52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": { 58 | "collapsed": false 59 | }, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAFU0lEQVR4nO3dy27bSBRFUbOR//9l\n9oBpDzqRLPLU47K41ihxDIgBhJ1TlORs+75/AXDVP7MvAODeZBQgIqMAERkFiMgoQERGASIyChCR\nUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCI\njAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiPyafQE8wrZtUx533/cpj8uj\nyCjdbds2K2cTH5rncKgHiMgofc3dg/u+z7qfwHPIKEBERumowq1Jg5TeZBQgMn8ssKpXU7T3Nnz1\noJ7qdOK5RRcFs1XwkliDQz1AREZpr+bu81oTncgoQERGaazmFD0YpPQgowARGaWlylP0YJDSnIzS\nTP2GHpSUtmQUICKjtHGXKXowSGlIRgEiMkoD95qiB4OUVmSU1B0belBSmpBRgIiMErnvFD0YpORk\nFCAio1x39yl6MEgJyShAREa5aI0pejBIScgoV6zU0IOScpmMAkRklNPWm6IHg5RrZBQgIqOcs+oU\nPRikXCCjNCA9PJmMcsKrKXrHEffqgu/4d2EuGeWEZRLz5tbE2nct6EFGaWOZwsJZMso5C+TSFKUt\nGaWZBQoLF8gop906l6YozckoV7wq6a0LC9fIKA9iitKDjHLR20E6/nJ+pqF0IqO0t+9fNUsKPcgo\n193oTqgpSj8yShcGKc8ho0Te3AmtU1JTlK5klFSdXMIUMkpHFQpritKbjNLAm1zOLamGMoCM0kaF\n4QlTyCjdzSqsKcoYMkoz74/2sCoZZU2mKMPIKC0VuUOqoYwkozRWpKQwjIyyGlOUwWSU9gxSHkVG\n6WLW5jNFGU9GASIyyjpMUaaQUdahoUwhowARGWVxpii9yShAREbp6M93jw5+P6kpygAySl/eh8/y\nfs2+ABZ3fKLp1CL8rLwffZMpygAyykx/Lexn6fv5mzZLmCEc6ulu1kfs931XUgaQUUb4s6Tb9vsr\n37+Am3KoZ47j5H72tikUZI0yiJ+ex6pklHGuvpqUPKLbo3Qno3Q0vpswnowCRGSUxTnX05uM0pF8\n8QQyChCRUXqp855Q53q6klGAiIwCRGSULuqc6KE3GeUR3B6lHxkFiMgo7TnR8ygyylM419OJjAJE\nZBQgIqM0VvnGqHM9PcgoQERGASIySkuVT/TQiYzyLG6P0pyM0pIpygPJKEBERnkc53raklGAiIwC\nRGSUJ3KupyEZ5RFEk35+zb4A6OsI6O6tWHRjjXLRn/Pu+ytv/miwbdv2fddQupJRrit+UD4a+upP\n3R6lFYd6rtv3oh+id5BnJBllNe9H6Pf3fOksjcgokVeDdMpx+cM4ftJZ+JyMkvprSf/32wFV/TCO\nGkpzMsoKHOSZSEZp4BikUzjIM52M0saUkhqhVOCfaO7qw4Z6htObNcr9OMhTioxyMw7yVCOjjBN+\n5MkIpSYZZZzvl6EuVM4IpSwZZagjcWdnqVeTqExGmeDzWfrJwDRCmUtGmeOTWWqEcguegkx2+W7p\nl4ZSgzXKZP/N0nNBdJCnDj/9nhJO/Sx6/zUIpTgTUciPG9MIpSAZpZxXB3x3QqnJoZ5y/nrA11DK\n8tSkqO/zu4M8xckopRmh1OdQT2kaSn0yChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCI\njAJEZBQgIqMAERkFiMgoQ
ERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJE\nZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQg8i8l7UKqXRR3rAAAAABJRU5ErkJggg==\n", 64 | "text/plain": [ 65 | "" 66 | ] 67 | }, 68 | "execution_count": 4, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "mol = Chem.MolFromSmiles(\"[nH]1ccc2cccc[n+]12\")\n", 75 | "mol" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 5, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [ 85 | { 86 | "name": "stderr", 87 | "output_type": "stream", 88 | "text": [ 89 | "[2016/Mar/24 17:05:48 DEBUG ] apply> mol = 'c1cc[n+]2[nH]ccc2c1'\n", 90 | "[2016/Mar/24 17:05:48 DEBUG ] apply> starting pass 1...\n", 91 | "[2016/Mar/24 17:05:48 DEBUG ] ...total of 0 hits in pass: finished.\n" 92 | ] 93 | }, 94 | { 95 | "data": { 96 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAFU0lEQVR4nO3dy27bSBRFUbOR//9l\n9oBpDzqRLPLU47K41ihxDIgBhJ1TlORs+75/AXDVP7MvAODeZBQgIqMAERkFiMgoQERGASIyChCR\nUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCI\njAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiPyafQE8wrZtUx533/cpj8uj\nyCjdbds2K2cTH5rncKgHiMgofc3dg/u+z7qfwHPIKEBERumowq1Jg5TeZBQgMn8ssKpXU7T3Nnz1\noJ7qdOK5RRcFs1XwkliDQz1AREZpr+bu81oTncgoQERGaazmFD0YpPQgowARGaWlylP0YJDSnIzS\nTP2GHpSUtmQUICKjtHGXKXowSGlIRgEiMkoD95qiB4OUVmSU1B0belBSmpBRgIiMErnvFD0YpORk\nFCAio1x39yl6MEgJyShAREa5aI0pejBIScgoV6zU0IOScpmMAkRklNPWm6IHg5RrZBQgIqOcs+oU\nPRikXCCjNCA9PJmMcsKrKXrHEffqgu/4d2EuGeWEZRLz5tbE2nct6EFGaWOZwsJZMso5C+TSFKUt\nGaWZBQoLF8gop906l6YozckoV7wq6a0LC9fIKA9iitKDjHLR20E6/nJ+pqF0IqO0t+9fNUsKPcgo\n193oTqgpSj8yShcGKc8ho0Te3AmtU1JTlK5klFSdXMIUMkpHFQpritKbjNLAm1zOLamGMoCM0kaF\n4QlTyCjdzSqsKcoYMkoz74/2sCoZZU2mKMPIKC0VuUOqoYwkozRWpKQwjIyyGlOUwWSU9gxSHkVG\n6WLW5jNFGU9GASIyyjpMUaaQUdahoUwhowARGWVxpii9yShAREbp6M93jw5+P6kpygAySl/eh8/y\nfs2+ABZ3fKLp1CL8rLwffZMpygAyykx/Lexn6fv5mzZLmCEc6ulu1kfs931XUgaQUUb4s6Tb9vsr\n37+Am3KoZ47j5H72tikUZI0yiJ+ex6pklHGuvpqUPKLbo3Qno
3Q0vpswnowCRGSUxTnX05uM0pF8\n8QQyChCRUXqp855Q53q6klGAiIwCRGSULuqc6KE3GeUR3B6lHxkFiMgo7TnR8ygyylM419OJjAJE\nZBQgIqM0VvnGqHM9PcgoQERGASIySkuVT/TQiYzyLG6P0pyM0pIpygPJKEBERnkc53raklGAiIwC\nRGSUJ3KupyEZ5RFEk35+zb4A6OsI6O6tWHRjjXLRn/Pu+ytv/miwbdv2fddQupJRrit+UD4a+upP\n3R6lFYd6rtv3oh+id5BnJBllNe9H6Pf3fOksjcgokVeDdMpx+cM4ftJZ+JyMkvprSf/32wFV/TCO\nGkpzMsoKHOSZSEZp4BikUzjIM52M0saUkhqhVOCfaO7qw4Z6htObNcr9OMhTioxyMw7yVCOjjBN+\n5MkIpSYZZZzvl6EuVM4IpSwZZagjcWdnqVeTqExGmeDzWfrJwDRCmUtGmeOTWWqEcguegkx2+W7p\nl4ZSgzXKZP/N0nNBdJCnDj/9nhJO/Sx6/zUIpTgTUciPG9MIpSAZpZxXB3x3QqnJoZ5y/nrA11DK\n8tSkqO/zu4M8xckopRmh1OdQT2kaSn0yChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCI\njAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJE\nZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQg8i8l7UKqXRR3rAAAAABJRU5ErkJggg==\n", 97 | "text/plain": [ 98 | "" 99 | ] 100 | }, 101 | "execution_count": 5, 102 | "metadata": {}, 103 | "output_type": "execute_result" 104 | } 105 | ], 106 | "source": [ 107 | "rules.apply(mol)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "The desired result was obviously..." 
115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAFbklEQVR4nO3c227bOhRF0eig///L\nPA8CiqJOXEuLm6SkMZ56CRoXMCYWJTlba+0LgLP+m/0CAK5NRgEiMgoQkVGAiIwCRGQUICKjABEZ\nBYjIKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjIKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjI\nKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjIKEBERgEiMgoQkVGAyK/ZL4BH2LZtyvdtrU35vjyK\njFJu27ZZOZv4rXkOh3qAiIxSa+4ebK3Nup7Ac8goQERGKbTCpUmDlGoyChCRUaq8maJF8/Cnf9Yg\npZSMUuJ9Q4tO+m9yqaTUkVGAiIzS35QpujNIGU9GASIySmcTp+jOIGUwGQWIyCg9TZ+iO4OUkWSU\nblb4zNInlJS+ZJQRxhdWKxlGRuljkeP8nxztGUNGASIySgcLTtGdQcoAMkrqKneWXikpXcgohVYo\nrFZSTUaJrBDKhMiSk1GqrFNYraSUjHLeOqFMiCwhGaXEaoXVSurIKCct+5DTTzz8RBEZ5Yw1Q5lQ\nUk6TUTpbubBaSQUZ5bCVQ5kQWc6RUXpav7BaSXcyyjHrhzIhspwgo3RzlcJqJX3JKAdcJZQJkeUo\nGeWAN4m5VmFv8x9hBTJKH5dLz+VeMMuSUY6595nXFOUEGQWIyCiH3XWQmqKcI6OccdeSwgkyCl9f\npigBGeWkOw1SDSUhowARGeW8ewxSU5SQjAJEZJRIa+3Se9QUJSejpFr7unRJISSjPJcpShcySgdX\nHKQaSi8ySh9XLCl0IaM8kSlKRzJKNwYpzySjPI4pSl8ySk/rD1INpTsZpbP1Swp9ySgPYopSQUbp\nzyDlUWSUEgtuPlOUIjIKEJFRHsEUpY6Mcn8aSikZBYjIKDdnilJNRgEiMkqh16dHBz9PaooygIxS\ny3P43N6v2S+Am9s/0fTTIjwY2cNJNkUZQEaZ6WDljn31ZgkzhEM95WZ9xL61pqQMIKOM4IeVcGMy\nChCRUQYxSLkrGWWc0tvm314GdXmUAWSUQq/d9AAS9yOj3IThySwyys3JK9VklELyxRPIKPdheDKF\njFLlzUfpB5NXSskoQERGASIySolZJ3rnd8aTUR5BXqkjowARGaW/uffoDU8Gk1GeQl4pIqMAERnl\nhgxPRpJROlvnw0uv5JUKMgoQkVHuyfBkGBmlp5VP9FBERnkWK5XuZJSelpqiiskYMgoQkVEex0ql\nLxkFiMgod2Z4MoCM8kTySkcyChCRUW7O8KSajHLSa5p+/8mbv4L7kVHOu0oc23efCrBS6UVGOa+1\ny5QU6vya/QJ4ouk/wWTfod+uVDhKRonsg/Q1Ryuv1G3bBJSOZJTUtyX967evd5/2X4yvmYbSnYwy\n1F6wKYd6B3mKyCgdJPeaxiTVCKWOO/X0cahRf35x9e3+bds0lFLeXiyhaJMKKANYoyxh36R9Z6mG\nMoZro6yi490nd5MYyRplnE/GZmtf4Wc09xGqoQzj1MNQHz4uem5OGqFMIaNM8OHJ/dDFTVdCmcU7\njzk+n6X/fIsaocwlo8z0ySx9X0kjlOm8BZksmaUaygq8C1nC0SA6yLMODzyxhEM/i94jTSzFGmUh\n/9yYRigLklGW89MB35VQ1uRQz3K+PeBrKMvy1mRRv8/vDvIsTkZZmhHK+hzqWZqGsj4ZBYjIKEBE\nRgEiMgoQk
VGAiIwCRGQUICKjABEZBYjIKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjIKEBERgEi\nMgoQkVGAiIwCRGQUICKjABEZBYjIKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjIKEBERgEiMgoQ\n+R+9EVSPMR17rwAAAABJRU5ErkJggg==\n", 127 | "text/plain": [ 128 | "" 129 | ] 130 | }, 131 | "execution_count": 6, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "Chem.MolFromSmiles(\"[nH+]1ccc2ccccn12\")" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "This will require a new '`Fix 1,7 conjugated cation (aromatic 4)`' rule to be coded and tested." 145 | ] 146 | } 147 | ], 148 | "metadata": { 149 | "kernelspec": { 150 | "display_name": "Python 3", 151 | "language": "python", 152 | "name": "python3" 153 | }, 154 | "language_info": { 155 | "codemirror_mode": { 156 | "name": "ipython", 157 | "version": 3 158 | }, 159 | "file_extension": ".py", 160 | "mimetype": "text/x-python", 161 | "name": "python", 162 | "nbconvert_exporter": "python", 163 | "pygments_lexer": "ipython3", 164 | "version": "3.5.1" 165 | } 166 | }, 167 | "nbformat": 4, 168 | "nbformat_minor": 0 169 | } 170 | -------------------------------------------------------------------------------- /standardiser/docs/Multiple_possible_tautomers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%run notebook_setup.py\n", 12 | "\n", 13 | "sys.path.append('../..')" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "from standardiser import rules" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "rules.logger.setLevel('DEBUG')" 36 | 
] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "# Multiple possible standardised tautomers\n", 43 | "\n", 44 | "### Introduction\n", 45 | "\n", 46 | "In some cases, more than one standardised tautomer is conceivable. In the current system only one, essentially arbitrary, tautomer is returned.\n", 47 | "\n", 48 | "For example..." 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAFiElEQVR4nO3dXXObOACGUdjZ//+X\n2Qtv3RTXNvYr9IHOmVxkkjQhM+oTCYRZt21bAPjWP60PAGBsMgoQkVGAiIwCRGQUICKjABEZBYjI\nKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjIKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjIKEBE\nRgEiMgoQkVGAiIwCRGQUICKjABEZBYjIKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjIKEBERgEi\nMgoQkVGAiIwCRGQUICKjABEZBYjIKEBERgEiMgoQkVGAiIwCRP5tfQDwvXVdb+9s29b2SJiZjDKY\nezqXH/Vc11VJacXgYwB/Tefj1xjMNGHk0akj6Xz8J8Yz9Rl29OJnN5dvT3cqKfUZc7T0xZTzyPc0\nqqnJgKO2M9L5+CMMbKox2qihQjoff6KxTR02PHGWs9MplHTCQKSwmlviX5dUZ6nDzaCUt21bnX5t\n27a7vn/8s1CKjFJS/QmgktKcjDI8JaUtGeUKlJSGZJQpKCnnkVGKeXZitE6/3oZSSTmJjHIdSkoT\nMsq5Kl+7V1Lqk1HK6Gevu5JSmYxyQUOUdP2l7WGQc08913QL5YsJ8tsvOEP9l2ihgl4WYgztxTX6\ntgPs7QFUOELpvDyzUa6syZRzkc7JyCgX97qkBTsrndOSUa7vvJJKJ4tzo+S6PTG6U+rFSaWTHbNR\nWJZ3c1Lp5AUZZRafboGSTg7qa9nFcEZZ0d8d2QJ1e6fP46dD7mJiLkfuX6r2EBSuQUaZzouSdjuJ\npmcyyvfGjU4P99RzGTJKeUPktf8jZBQyChCRUfjfEJNoOiSjfGm4rU5wEhkFiMgoLItJNAEZ5Rui\nA3cySknyyoRkFNSfiIzyMdGBn2QUICKjFGOWypxklNmpPyEZ5TOvnw1X+WCgBx4iwlFeWQ7+SkZ5\natfNnw8puszE80q/C63IKL896+ZO8mB3uB4Znd66Lstyy+fxMiop3PmfMKWfs85gAIxe0tGPn06Y\njc7kXs9C7TAnhUVGr6/QxPMZJQUZ7di67sO3+8jrRN4+e37glJTJyeiwHpO6C1nFro1Y0uEOmG65\ni2lMf41m0+3xnvzOtGSUYpSUOVnU9220Km3b9jhR7pAVPQXJaN8eLzF173Z2oZNG3WbHismpZJTy\n2pb054kFAaUCGR3TY6j6mQEuy1K9pB+l04qesmR0WLtL8/N14ddvb9lOY/4sc6LiE9IifzjMRinL\nbJQTFVnazz3nZgAyyrm+Lmnp11G5f1tTUQqTUU73XUmLt86tAZxERqmhyRaoGR6CQg+MJ67j4ENQ\nFiWlKIOJsX297UlJKcVIYjylrt0rKUU4N8oYb
HuiW/4a068K6TQhJWcM0cDBx6PUGZtKSsjLNtPG\n602c21Zv5e7VpgnJKG20fujJH5SUhIzCsigpARmlma4mpMtt22lXB8QgZJSWugtXdwfEAGQU/qSk\nfEhGaazHavV4TPRLRmnPrk2GZuMxPNHZUwLpltkoPGFpzzEyCs8pKQfIKLykpLwjo/COkvKSjMIB\nrjXxnIwCRGQUICKjABEZBYjIKHxlXX+/7T7++JVcmieDwuceHx3lUv7EzEbhQ4/RtLF0bjIKELGo\nh9LMTCcjo1DabsmvqldnUQ8QkVH40OMFJVfq52ZRD5/blVRD5+YhIgARi3qAiIwCRGQUICKjABEZ\nBYjIKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjIKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjI\nKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjIKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjIKEBE\nRgEiMgoQkVGAiIwCRGQUICKjABEZBYjIKEDkP52EiVv2xX+qAAAAAElFTkSuQmCC\n", 61 | "text/plain": [ 62 | "" 63 | ] 64 | }, 65 | "execution_count": 4, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "mol = Chem.MolFromSmiles(\"Oc1nc(O)ccc(C)1\")\n", 72 | "mol" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [ 82 | { 83 | "name": "stderr", 84 | "output_type": "stream", 85 | "text": [ 86 | "[2016/Mar/24 16:48:13 DEBUG ] apply> mol = 'Cc1ccc(O)nc1O'\n", 87 | "[2016/Mar/24 16:48:13 DEBUG ] apply> starting pass 1...\n", 88 | "[2016/Mar/24 16:48:13 DEBUG ] rule 2 '2-hydroxy pyridine -> 2-pyridone' applied on pass 1\n", 89 | "[2016/Mar/24 16:48:13 DEBUG ] ...total of 1 hits in pass: will continue...\n", 90 | "[2016/Mar/24 16:48:13 DEBUG ] apply> starting pass 2...\n", 91 | "[2016/Mar/24 16:48:13 DEBUG ] ...total of 0 hits in pass: finished.\n" 92 | ] 93 | }, 94 | { 95 | "data": { 96 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAFrUlEQVR4nO3d23KbOgCGUdjT939l\n9oUb1wGf4BcgobWmF27TYpKhnyWO4zRNAwBb/Xf2CgC0TUYBIjIKEJFRgIiMAkRkFCAiowARGQWI\nyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChA\nREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYB\nIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgMifs1eAro3jeHsxTdO5awKbySinGcfx\nXs97TwdJpTWjTZZTPDZ0+aX7a9sn9ZNRTvCmocu/eX9tW6VOMsrRvm/o8h8+/tamSyVklENtbujT\nRd1f24w5kYxynIINXS75/tomzcFklIPs19DlG91f27w5gBOeOF/Zs0cfFyKpHMBolCN8ORTdtXpO\n9WcnMsrutk3nd0rqYfsW6IdNin0VyVbZU52UlLLsG2VHpYI1W4g9nlRFRtnLfoM+B5Goioyyi8Mm\nzpLK6ewkorwadj6+Py5fwxpyGUajFFZJoWpYBzrh7veUVElD4UgySjENNXSaptlJVLCZSX3zKrk4\np6GGQlky2rwansOhofTM1n9NRya13Ya2u+ZUxWj0mpxNCYeR0evbNakGdCCjfSmbVA2Fwb5RbjYk\n9RoNvcZ3wbmMRhmG9aNU9YE7GWXuY1I1FB75/8C3KjnPvzifCoRcDMq3pmmSG1iSUdZ5czW6q9Tp\nk4zSO7cpISSjFKNH9ElGASIyympGnfBIRsEHAxEZpSQ9okMyyhZyCXcyCsPgg4GAjAJEZJSNXg3f\nDOvojYzCXz4A2MaN8mDuqveyYicySnm3YV27DXr6zOpBVXlBRtmu9Vx+NPvWPGOVp2QU/nn/wfD0\nuQDTMAyS2jcZhS1+pfZx7i+p/bnyjIxjvBq+tTjfL7DOktofo1H4q0z3jVL7I6MwDD8Nve3xLDaI\nXiZVTK+ovWkXFWp9Xj9bT0fkWcVolN4tWy+prCKjdO3jeFlS+UhG6dfafQ7LpE4/X9j29o+L3rIE\n6tDGrivq19zu0ZIrNruhyTeLHcf5Aagqf0p8w2iUHo1j0Vn5bFEfh5nLaE6TkrZLRunO7r16euqo\nRF6XjFJGK7cpOXrMV/0PhJzbNrOvqu6FbN7MHmSUXmgoO5FRinnzdKbjV2amrobeDig9qmv9WMe+\nUa6vxkbNSlrd+rGCjHJxNTb0ptLVYjWTeq6s3oZyITJKSVUdlx8M+DiEjFLeOI5VxRR2Zd8oJT2e\nge/pxHSigctOaMX7q5hUlauSUcpYeyXoT1U9n5jmySgFhFfTO4GSptk3yvk8TJOmySipsjd2klSa\n44QnIrveHG+a/v0ax19VXZ5Pdf+TN1+CPcgo2x15g9FbTH+/+zHvDB/IKBudfpPm5W2S4BQyyhan\nNxTq4RAT61XT0FcPgjNK5UgyykqV3TTpaUntReVIJvWsUVlDoQYyytdqbahjTZxLRrmCKvNOL2o5\nVkDtah2KwumMRvmChsJrMsonGgpvyShvaSh8IqO8pqHwBRnlNQ2FL8goQERGASIyChCRUYCIjF7a\nx+dp3B7NMXtAB7CGG+V1bHY+k9ObYBOj0V49vUmnMSmsJ6MAEZP6qzPAhJ3J6NV5ngbszKQeICKj\nvVoeUHKkHjYxqe/YrKQaCpt4iAhAxKQeICKjABEZBYjIKEBERgEiMgoQkVGAiIwCRGQUICKjABEZ\nBYjIKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjIKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjI\nKEBERgEiMgoQkVGAiIwCRGQUICKjABEZBYjIKEBERgEiMgoQk
VGAiIwCRGQUICKjABEZBYjIKEBE\nRgEiMgoQ+R8zwnFRd5EMYAAAAABJRU5ErkJggg==\n", 97 | "text/plain": [ 98 | "" 99 | ] 100 | }, 101 | "execution_count": 5, 102 | "metadata": {}, 103 | "output_type": "execute_result" 104 | } 105 | ], 106 | "source": [ 107 | "rules.apply(mol)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "Clearly, the tautomer where the carbonyl is adjacent to the methyl group would be as valid as the one generated above. " 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "### N-heterocycles\n", 122 | "\n", 123 | "This issue is also relevant for N-heterocycles such as imidazoles, for which there are currently no standardisation rules. In these cases there may be no significantly preferred tautomer, or, if there is, it will depend on subtle electronic effects, intramolecular H-bonds _etc._\n", 124 | "\n", 125 | "Any standardisation rules in these cases would thus be directed more at 'canonicalisation' than at correcting the representation, which is more what the ruleset as currently conceived is aimed at." 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "### Alternative strategies\n", 133 | "\n", 134 | "An alternative strategy to the one adopted (_i.e._ return one standardised tautomer only) would be to generate all possible 'standardised' tautomers and then either...\n", 135 | "\n", 136 | "* return them all to the user\n", 137 | "* select one 'canonical' example, using rules similar to those used to identify canonical SMILES\n", 138 | "* attempt to identify (very approximately) the lowest-energy tautomer\n", 139 | "\n", 140 | "Obviously this is a much more heavyweight solution, and, for current purposes, would probably be overkill." 
141 | ] 142 | } 143 | ], 144 | "metadata": { 145 | "kernelspec": { 146 | "display_name": "Python 3", 147 | "language": "python", 148 | "name": "python3" 149 | }, 150 | "language_info": { 151 | "codemirror_mode": { 152 | "name": "ipython", 153 | "version": 3 154 | }, 155 | "file_extension": ".py", 156 | "mimetype": "text/x-python", 157 | "name": "python", 158 | "nbconvert_exporter": "python", 159 | "pygments_lexer": "ipython3", 160 | "version": "3.5.1" 161 | } 162 | }, 163 | "nbformat": 4, 164 | "nbformat_minor": 0 165 | } 166 | -------------------------------------------------------------------------------- /standardiser/docs/Neutralisation_strategy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%run notebook_setup.py\n", 12 | "\n", 13 | "sys.path.append('../..')" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "from standardiser import rules, neutralise" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "for module in [rules, neutralise]: module.logger.setLevel('DEBUG')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "# Neutralisation Strategy\n", 43 | "\n", 44 | "This document explains why neutralisation steps involving proton addition/removal are carried out both before and after application of the rules. Originally, neutralisation was only carried out after application of the rules.\n", 45 | "\n", 46 | "This is based on a real example. 
The amino-pyrimidine moiety is protonated and this originally prevented the '2-hydroxy pyridine -> 2-pyridone' transform from working as intended. The '4-hydroxy pyridine -> 4-pyridone' transform was thus applied instead, which is not what might be expected..." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 4, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAHEklEQVR4nO3dUXeaShSAUbkr//8v\ncx9IrAsEkQPMzJm9Vx/axEaa4pczoDiM4/gA4Kj/Sm8AQNtkFCBERgFCZBQgREYBQmQUIERGAUJk\nFCBERgFCZBQgREYBQmQUIERGAUJkFCBERgFCZBQgREYBQmQUIERGAUJkFCBERgFCZBQgREYBQmQU\nIERGAUJkFCBERgFCZBQgREYBQmQUIERGAUJkFCBERgFCZBQgREbp3jD8+zX7+PKWsPBTegOgqGF4\njOPqH2EH0ygdW0ZzHI2cfEtGAUIs6mGdyZQdZBTWzZb8qso7FvUAITJKx5YnlJyp53sW9fRtVlIN\n5XvDaL8BCLCoBwiRUYAQGaV7nsZEjIwChMgoQIiM0jdPFCVMRgFCZBQgREYBQmSUjjkwyhlkFCBE\nRgFCZBQgREbplQOjnERGAUJklE65HglnkVGAEBkFCJFRejQM3j6H08goQIiM8sbyevCuEA9rZJT3\ndBN2klHem71/eyYOjHIuGQUIkVFWJR5I4UQyyhYlhY9klCO0FZ5klA/eDqTTB1uMqfNLnE5G+bXR\nxLfZGceGYwon8pOZxyN87c2ppPXvSsMwPB4P+zzn+im9AZQXv37x9NerjenwNzBPAbWu51z2p96d\nfg34emI6q+fyU3Z+TiGj/bq0dwVjulHP5S3t/8TZjTp1zxsR3dmp/fWc/S0PAYLsQz26883crl4+\nH6vn7Ct4FBBhB+rOx4ZeEdnTYxqv5+yreSBwmL2nL0Ua+vLFo/vb4Xp+vGsnnThMRjtStqF/d3Gk\nVqfMnnsibizlADtNL3aMY7ceMN2z4527ct95v3d+H8hBRrtQVUP/7nF1LD29nrMvrqScS0bzq7Ch\nL3f9L6aX1nPtTtdvo6TsJaPJ1dzQl20ocHqnie8MTXCFp7SGYWioFPf/OB/H8Tn/rtzAxavYRUZz\nmgLaSkNLUVJOIaMJ7X5mzz2b80HZ5xiN47gdSiXlIxnNxrMjv/UxlErKNhlNJU1Dt9fap1NSImQ0\njz2vd2yioUUoKYfJaC/2nHS6X1VZV1KOkdE8Ns47V1Wrjwpu7Z6SwoyM5tdWQ4szcvItGU1lOZBq\n6AFKyldkNDkNPUZJ2U9Gs/n4ypx6VD4pf9y05bf5+ZGNT5GPjFKXyts6I448ZDSlhgbS1ln785BR\nKEJ8M/kpvQFcYhpIG1odt2saSJffaaHsh4xSRqbKvy3p7I/Ls0/Tb7J8D7pmUZ9Wi0dIM7V1zTj+\npvP5G1ono7l5mN7EuaaeyWhmHtt3+mq0NIdmIqMUkGPxvvwXPD+y8SnykdHkDKRwNRmlFjlGVDok\no3CcSZ+HjPbAuh4uJaPczeKdZGQUDnr7GlA6JKNdqH9db0SlXTIKECKjvah/IIVGySi3SrN4d2CU\nJxntiIc9XEFGKS/NiEqfZBQgREa5T5qp04FRXsko5eVoK93yXkxcrrn3Mtlh8M4CPMkop1n
L5euw\nmWZdD08y2pflQb2dh/nWB8p/n9jTR+/8TD4y2p3tbq7lcv2vfB3E1kva9MZzBRntzts3VX/97C3b\n8Pvmz3pEAs7UU8Y4js+YQtNktEf1XKZESUlARjulpMc4MMqSjFJenSUdhqHCraJCTjH1q56B9FHN\n6fvXbhbfGFoho12rr6QFXqsunQTJaF+WlaiqG9tPxjrXXzzLj8C0zj5EdS4t6XP0PHAXNRx2oEKm\nUapzxUwaqSdsk9FetHWJzOdB2+A2n1hPZ+1ZI6NdaKuhk2mDD2/5KRV+vNRzWs5b17Nkn+hCixl9\nKrXxa6/6V1Jm7BD5Nd3QyZ3/hNn4uXYbDxye7A3JJWjo5Op/yJ56zm7vscPEsVHacNFTSg8/e9S1\n/njyEzWzNKPouV5PuZ9yDsqDqHOmUbpwbjpfVXI1AAry35+WUfS6dL67Lw+lfvm/z6nzhp71pNEv\n79SjqVOuN0p7lq8nmn1kHAv8FJmuUEWHZDShYRjST0V1BquqCw9yGxnNppOlZbXBmjaszm3jIjIK\nJ5sOKShpP2Q0lU5G0Unlqap88ziRjNKwylNV+eZxFhnNo6tRtBVK2gMZTaLbhtbfKSed0pNRmlf/\njw8nnXLrdIRJpttRtDmdv7osK9Mo3EdDU5LR5hlFoSwZBQiR0bYZRaE4GW2bhrZr4zpV25ewmp4+\n5bx/PWQUijmQwulcv2dQVUVGoZgDKbT8qJCMQpM8BbUe3tIOSlp74+jtKVVDqyKjUNjbks7+ODvF\npKFVsaiHlmhohWQUynPavWkW9VCF/SV9vZnJtAZeAwMQYlEPECKjACEyChAiowAhMgoQIqMAITIK\nECKjACEyChAiowAhMgoQIqMAITIKECKjACEyChAiowAhMgoQIqMAITIKECKjACEyChAiowAhMgoQ\nIqMAITIKECKjACEyChAiowAhMgoQ8j+ngS93E0LGLAAAAABJRU5ErkJggg==\n", 59 | "text/plain": [ 60 | "" 61 | ] 62 | }, 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "mol = Chem.MolFromSmiles(\"Oc1[nH+]c(N)nc2n(C)cnc12\")\n", 70 | "mol" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 5, 76 | "metadata": { 77 | "collapsed": false 78 | }, 79 | "outputs": [ 80 | { 81 | "name": "stderr", 82 | "output_type": "stream", 83 | "text": [ 84 | "[2016/Mar/24 16:48:58 DEBUG ] apply> mol = 'Cn1cnc2c(O)[nH+]c(N)nc21'\n", 85 | "[2016/Mar/24 16:48:58 DEBUG ] apply> starting pass 1...\n", 86 | "[2016/Mar/24 16:48:58 DEBUG ] rule 3 '4-hydroxy pyridine -> 4-pyridone (within-ring)' applied on pass 1\n", 87 | "[2016/Mar/24 16:48:58 DEBUG ] ...total of 1 hits in pass: will continue...\n", 88 | "[2016/Mar/24 16:48:58 DEBUG ] apply> starting pass 2...\n", 89 | "[2016/Mar/24 16:48:58 DEBUG ] ...total of 0 hits in pass: finished.\n" 90 | ] 91 | }, 92 | { 93 | "data": { 94 | 
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAG0ElEQVR4nO3d23KbShBAUUj5/3+Z\nPODDUYRAQHOZ6Vmr8mBbcQrFeKu5qh+GoQPgqD9PLwBA3WQUIERGAUJkFCBERgFCZBQgREYBQmQU\nIERGAUJkFCBERgFCZBQgREYBQmQUIERGAUJkFCBERgFCZBQgREYBQmQUIERGAUJkFCBERgFCZBQg\nREYBQmQUIERGAUJkFCBERgFCZBQgREYBQmQUIERGAUJkFCBERuEZfb/4lZWHxo/HPxRCRuExB1LY\n990w/P5R0kLIKDzmQAqH4ZpFIUBGoUrjWEoJfp5eAEL6vh/8MtVsHEjnP8P1KVVDiyKjdRuGQUlr\n97Gkb5++HWLyAy+KjXqoiYYWSEarNw6kTy8FIQ67V01G4T4rrdw+Y07njSpvIexWS8Ie0vLZHs/K\nISa43Dg2amhWNuqTsIe0WNN1R2Qlo3koaYFsyLdARuEqGtoIGU2lkYH08L2R7qSh7ZBRqlTyi8V4\nKpKGtkNGs2lkIC32fHUHlBokowkNw1BmYtIzhLbJeaPU6ti9ka7jCohmyWhOS4lJZu+9ka6joS2z\nUZ9WsXsP89HQxskoddv7anH6HT00FBv1mSXbtF96LrtKOv4L09+P/OeMZ0RoKF5Ik0uT0YueyGt/\nd/37hlAmVoX8EpT0nqfQ913XbRowNZRX1oYmVF3S+xd+un7h42+HhvLGvlGK9sgLwFTJ1+vBxi9q\nKHPWiVZUN5CWdqtjB5RY4oSnVtT161/glekCyhIZpTjVDc40TkYpS8kNbeTuWewloxSkqIYqJhvJ\nKKUoqqGwnYxShL7vNZRKySjPK/NkzI97Qu0eZU5G21LOO769LECJDYXtXMXUnHJ2QTqhnRxktDmF\n3D3PEEoaMso/lrbxzy2ehpKJjLZoZSBditvyLtT3m3d8VXtDx6NMVT8FziWjjdq7ab/8N/9/YOkQ\n9mtx6gqQYrKFjHKapdzMbzdXvxzPgnPIaLtue+vQj7fvrIdi8oXzRpuWZDSER8loW+bdvLOkaS4B\num2QpwoyCmsUk69klEvkmDphCxltiLLBFWSUW9k9Sj4y2oqbr6NPk0v4SkYBQmS0CSXc0qlett9Z\nJ6NwkLwyklHuZrcpychofk9t0WfKpV0irJBROE5e6WQUIEhGk3OMHq4mozwg025TkNHMHh9F5ZIW\nyGhuEgaXk9G0vBcb3ENGeYbtfdKQ0ZyMojeYvwpMX1l5iHxklGvlnjrzPjN2kNGEjKK3cXcSOhmF\nR4hvJj9PLwAnq2gUHbf3a1naJeNAOn8SQtkOGU0uQafK97Gkb5/Ojz6NH/jhJGCjPpV5NEs4wlPC\nMpRjGH7TOX1A7WQ0PxW7gWNNLZPRPFa235X0LCv/i7tGS3NoJjLairGkpcW0rr6/7QCdp3D6yspD\n5COjeXxN0jAMdWWrKI/fLotiyWgqWyr5SElrz7eGskJGsym2pPXSUNbJaEKbS3rP4nxReNM1lK9k\nNKctB5RuPkenxqsANJQtZDStLQeUyjnbscCBVEPZyJWC+X29HvTBXoz1HBdvKmkJ66SGsp2MNqHA\nkr4G9ONDS4/eQEPZRUZbUVRJN94w5ZGeupkLe1ljGvI1lDeUdGUI/fpdB75xLw3lACtNW54taTxS\n1/X0WN+hk9EGPVLS0yN1bk8NoURYe1p0827Q7sopL95TDSXICtSoe0p6Z6GO9VRDibMOtevSN7F4\n8B0ytvdUQzmF1ah1p4+l5bzF0EpPHVDiRDLKaSUtJ6Bv3no6DaGmUU7hnUFZfIvg7YoN6GhqpSGU\nK8goXRcoaeEBHU1Tp4ByBRnl197CVBHQdeMdsLSVIBnlCDfvgIn7jXJEXQ0t/Ab71E5GAUJklA/m\no5thDpbIKJ810k3b+8TJKJ+V8zZNUDgZpQmm
Tq4joywykMIWMsqaFkpqUCVIRgFCZJQv0gykpk4u\nIqN8V9c1Swdkf35cS0b5YN7N9CWFw2SUhiy+FqTZc8ETZBQgREZpiamTC8goQIiMAoTIKHRdZ3uf\n42SUxsglZ5NRgBAZBQiRUfiP7X0OkVHaI5ecSkYBQmQUXrgFC/vJKE2SS84jowAhMgoQIqMAITIK\nECKjACEyChDy8/QCwNNer2hyIhT7ySht6/t/0vn2KWxgo56GzaPpcnv2k1GAEBkFCJFRgBAZBQiR\nURo2P6DkSD37OeGJtr2VVEPZT0ZpnnQSY6MeIERGAUJkFCBERgFCZBQgREYBQmQUIERGAUJkFCBE\nRgFCZBQgREYBQmQUIERGAUJkFCBERgFCZBQgREYBQmQUIERGAUJkFCBERgFCZBQgREYBQmQUIERG\nAUJkFCBERgFCZBQg5C/V7wN7yYlV8wAAAABJRU5ErkJggg==\n", 95 | "text/plain": [ 96 | "" 97 | ] 98 | }, 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "rules.apply(mol)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "As stated above, in the standardization procedure as originally implemented, neutralization was applied only _after_ the rules. If it is applied before the rules, the above molecule is handled correctly (_i.e._ the neutral 2-pyrimidone) is obtained..." 
113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 6, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [ 122 | { 123 | "name": "stderr", 124 | "output_type": "stream", 125 | "text": [ 126 | "[2016/Mar/24 16:48:58 DEBUG ] 1 positive/H, 0 positive/quat and 0 negative (of which 0 are acid) charges identified\n", 127 | "[2016/Mar/24 16:48:58 DEBUG ] Overall H balance: -1; formal charge: 0\n" 128 | ] 129 | }, 130 | { 131 | "data": { 132 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAG90lEQVR4nO3d23KbWBBAUTGV//9l\n5oFYUYFAmD7iXHqtyksc28IK2m4ugmme5wcAV/1XewEA+iajACEyChAiowAhMgoQIqMAITIKECKj\nACEyChAiowAhMgoQIqMAITIKECKjACEyChAiowAhMgoQIqMAITIKECKjACEyChAiowAhMgoQIqMA\nITIKECKjACEyChAiowAhMgoQIqMAITIKECKjACEyChAiowAhMkp60/Tvz+rj28+EjT+1FwCqmqbH\nPO/+FU4wjZLYNprzbOTkt2QUIMRGPewzmXKCjMK+1Sa/qvKOjXqAEBklse0BJUfq+T0b9eS2KqmG\n8nvTbL0BCLBRDxAiowAhMkp6TmMiRkYBQmQUIERGyc2JooTJKECIjAKEyChAiIySmB2jlCCjACEy\nChAiowAhMkpWdoxSiIwChMgoSbkeCaXIKECIjAKEyCgZTZPb51CMjAKEyChvbK8H7wrxsEdGeU83\n4SQZ5b3V/dtHYscoZckoQIiMsmvggRQKklGOKCl8JKNcoa3wJKN88HYgXT7YY0wdX6I4GeWvgya+\nzc48dxxTKMhvZh6P8LU3l5K2vypN0/R4PKzzlPWn9gJQX/z6xcuXNxvT6WdgXgJqu56yrE/ZFb8G\nfDsxXdVz+09WfoqQ0by+2ruKMT2o5/Yzrf/EWY2SuudGRHd26nw9V1/lJUCQdSijO2/m9u3N52v1\nXH0HrwIirEDpfGzoNyJbPKbxeq6+mxcCl1l7cqnS0JdvHl3fLtfz40M76MRlMppI3Yb+PMSVWhWZ\nPc9E3FjKBVaaLE6MY7fuMD2z4pXdcj/5uHc+D4xBRlNoqqE/j7g7lhav5+qbKyllyej4Gmzoy0P/\ni+lX67l6UCWlIBkdXMsNfVmGuw/vKCkFucLTsKZp6qKhi5t/nc/z/Bx+9z/Hxas4RUbHtAS0l4ZW\noaSUIqMDOn1mzz2L80HFc4zmef5YSSXlIxkdjbMjf+VMJZWUYzI6lGEa+nFzu6Az1/BXUg64bPM4\nvN/xsudlpw+em6Wknjy2TKNZnDnodL+mRuOPI6eZlLdkdBwHh56bqtVHVQ86fdjA7+dZ5D4yOr6+\nGlrd846ncJKMDmU7kGroNUrKeTI6OA29zDPHSTI6mjNvzmlE75Py9mnu5ImnMBmlLX21VTd5yOiQ\nOhpIe2cXKg8ZBQiS0TEZSG9jIEVGqaOvfaDHlDQ5GR1WjwPpSG0lDxkdmyTdxECamYyOzGv7Tsbo\ntGSUCsbYeN/+BP3/TFwho4MzkMK3ySitGGNEJSEZhetM+jxkNAPb9fBVMsrdbLwzGBmFi9zhjoWM\nptD+dr0RlX7JKECIjGbR/kAKnZJRbjXMxrsdozzJaCJe9vANMkp9w4yo5CSjACEyyn2GmTrtGOWV\njFLfGG0lrT+1F4DxdX
cvkxMmdxbgSUYpZi+Xr8PmMNv18CSjuWx36p3czbc/UP77hzN9XG60p6SM\nREbTOe7mXi73v+TXQey9pF0vPN8go+ks7wrd68A9fXje/FmPGIAj9dQxz/MzptA1Gc2oncuUKCkD\nkNGklPQaO0bZklHqa7Ok0zQ1uFQ0yCGmvNoZSB/NHL5/7Wb1haEXMppaeyWt8F516SRIRnPZVqKp\nbhyfjFXWTzzrj8D0zjpEc75a0ufoeeEhWtjtQINMozTnGzNppJ5wTEaz6OsSmc+dtsFlLlhPR+3Z\nI6Mp9NXQxbLAl5e8SIUfL/VcNudt17NlnUihx4w+1Vr4vXf9KykrVojxdd3QxZ0/wmr83PscLxye\nrA2DG6Chi2//IGfqufp8rx0W9o3Shy+dUnr57FHX+uPJb9SRDTOKlvV6yL3IMSgvouRMo6RQNp2v\nGrkaABX57x+WUfR76Xz3WF5Kefm/H1PyhpY6afSXD+rVlJTrjdKf7fuJVh+Z5wq/RZYrVJGQjA5o\nmqbhp6I2g9XUhQe5jYyOJsmmZbPBWhaszWXjS2QUClt2KShpHjI6lCSj6KLxVDW+eBQko3Ss8VQ1\nvniUIqPjSDWK9kJJM5DRQaRtaPudctBpeDJK99r/9eGg09iSjjCDSTuKdif5u8tGZRqF+2jokGS0\ne0ZRqEtGAUJktG9GUahORvumof06uE7V8SWsltOnHPdvh4xCNRdSuBzrdwZVU2QUqrmQQpsfDZJR\n6JJTUNvhlnZQ096No4+nVA1tioxCZW9Luvrr6hCThjbFRj30REMbJKNQn8PuXbNRD004X9LXTzOZ\ntsB7YABCbNQDhMgoQIiMAoTIKECIjAKEyChAiIwChMgoQIiMAoTIKECIjAKEyChAiIwChMgoQIiM\nAoTIKECIjAKEyChAiIwChMgoQIiMAoTIKECIjAKEyChAiIwChMgoQIiMAoTIKECIjAKE/A/bphp7\nyTpz2QAAAABJRU5ErkJggg==\n", 133 | "text/plain": [ 134 | "" 135 | ] 136 | }, 137 | "execution_count": 6, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | } 141 | ], 142 | "source": [ 143 | "mol2 = neutralise.apply(mol)\n", 144 | "\n", 145 | "mol2" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 7, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [ 155 | { 156 | "name": "stderr", 157 | "output_type": "stream", 158 | "text": [ 159 | "[2016/Mar/24 16:48:58 DEBUG ] apply> mol = 'Cn1cnc2c(O)nc(N)nc21'\n", 160 | "[2016/Mar/24 16:48:58 DEBUG ] apply> starting pass 1...\n", 161 | "[2016/Mar/24 16:48:58 DEBUG ] rule 2 '2-hydroxy pyridine -> 2-pyridone' applied on pass 1\n", 162 | "[2016/Mar/24 16:48:58 DEBUG ] ...total of 1 hits in pass: will continue...\n", 163 | "[2016/Mar/24 16:48:58 DEBUG ] apply> starting pass 2...\n", 164 | "[2016/Mar/24 16:48:58 DEBUG ] ...total of 0 hits in pass: finished.\n" 165 | ] 166 | }, 167 | { 168 | 
"data": { 169 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAG3ElEQVR4nO3d23abSBBAUWlW/v+X\nmQcyjC0JBBSX6uq9Vx5iO45RrBwXzUXPYRgeAOz1z90bANA2GQUIkVGAEBkFCJFRgBAZBQiRUYAQ\nGQUIkVGAEBkFCJFRgBAZBQiRUYAQGQUIkVGAEBkFCJFRgBAZBQiRUYAQGQUIkVGAEBkFCJFRgBAZ\nBQiRUYAQGQUIkVGAEBkFCJFRgBAZBQiRUYAQGQUIkVGAEBkFCJFRuMfzOfuehQ+Nvx9/kYSMwm12\npPD5fAzD319KmoSMwm12pHAYztkUAmQUmjSOpWTw5+4NIOT5fA7+M7VsHEjfv4fLU6qGpiKjbRuG\nQUlb97GkL2++HGLyDU/FTj20REMTktHmjQPp3VtBiMPuTZNRuM5CK9fPmNN5o8qbhGW1IqyQ5md/\nvCqHmOB049iooVXZqS/CCmla03VHVCWjdShpQnbkeyCjcBYN7YSMltLJQLp8A6QkNLQfMkqTEnZz\nMp6KpKH9kNFqOhlI056v7oBSh2S0oGEYciamPENon2SUVi0MpLdc4fN8PjW0T06/r2nu9mvFzD3M\n8T0/S3r2P4WryHomo2V1UtIFPx/7qUnV0M7JKG1beazpvKRqKDJaWbGBdO6xbD1q/ympe1I4nhGh\nofhBWlyZjJ79QKazxFb+jzCEMvFUqK9ASa98CGt6qqH85NnQhaZLetfGz+2zaygvrI2S2u6Gxm/x\nObbyZTjVUN55TvSiuYH0kFsdT4eeDjki/3BAiU9klIwOj/4hPTWK8pGdetI5Y3Ce/sID51MY+elK\nLpctPuw7Y9RAyjvTKIlcuYD73xf6dV9BiWQHGSWLuwa9n19UUtlBRkkhyc7ye1IzbBXJySj3S9LQ\nFwk3iZzctrkvCV8MLmdD53TyGi1sIqPdSRWBthoKH9mp706Su+dZeaQMGeWXuVn16GuKDKHUIaM9\nWhhI5+I2vxSw+Qyh1hs6Lo82/RA4lox2auuu/fyf/HzS5e/P/fVnBIhiZJTDzPXROe3UJqP92voS\nRoEv9DedhU4V8sOA/znhqWtGQ4izUMWlyqyNZjhpjCRMowAhMsopCi2Dwhcy2hFlgzPIKJcqc2uP\ny85zID8Z7cXFh0TK5BK+klGAEBntgrNz4DwyCjtZHmUko1zNsinFyGh9d+3RyyWdkFHYz4ozDxkF\nCJLR4hyjh7PJKDewbEolMlrZ7aOoXNIDGa1NwuB0MlpWmRskQ3Iyyj3s71OGjNZkFL3A+0+B6T0L\nH6IeGeVctafOuo+MDWS0IKPoZdydhIeMAgT9uXsDOFhDo+i4v9/K1s4ZB9L3B2FK7YeMFlegU/l9\nLOnLm6pamJ36Ut6jmeEIT4ZtgPPIaH0qdgHHmnomo3Us7L8r6VEW/hWtnXRLRnsxljRbTNvq+8sC\n6Hs3p/csfIh6ZLSOr0kahqGtbKVy++2ySEtGS1lTyVtK2nq+NZQFMlpN2pK2S0NZJqMFrS7pNZvz\nRfKmayhfyWhNaw4oXXyOTotXAWgoa8hoWWsOKOU52zHhQKqhrORKwfq+Xg96Yy/Geo6bN5U0w3NS\nQ1lPRruQsKQ/A/rxQ3MfvYCGsomM9iJVSVfeMOWWnmooW8loR74G4oKCLAyhXz9rxydu5YZY7OBJ\n05d7SxqP1Hk93dd3eMhoh24p6eGROranhlAiPHt6dPEy6OPMKS/eUw0lyBOoU9eU9MpC7euphhLn\nOdSvMTsnff9P/cu/fem1PdVQDuFp1LvDx9IbA/pioacOKHEgGeWwkuYJ6IuXnk5DqGmUQ3hlUGZf\nIni9tAEdTa00hHIGGeXxCJQ0eUBH09QpoJxBRvlra2GaCOiy8Q5Y2kqQjLKHC89h4n6j7NFWQ5Pf\nYJ/WyShAiIzywfvoZpiD
OTLKZ5100/4+cTLKZ3lepgmSk1G6YOrkPDLKLAMprCGjLOmhpAZVgmQU\nIERG+aLMQGrq5CQyyndtXbO0Q/XHx7lklA/eu1m+pLCbjNKR2Z8FZVYuuIOMAoTIKD0xdXICGQUI\nkVGAEBmFx+Nhf5/9ZJTOyCVHk1GAEBkFCJFR+I/9fXaRUfojlxxKRgFCZBR+cAsWtpNRuiSXHEdG\nAUJkFCBERgFCZBQgREYBQmQUIOTP3RsAd/t5RZMTodhORunb8/krnS9vwgp26unYezRdbs92MgoQ\nIqMAITIKECKjACEySsfeDyg5Us92Tniiby8l1VC2k1G6J53E2KkHCJFRgBAZBQiRUYAQGQUIkVGA\nEBkFCJFRgBAZBQiRUYAQGQUIkVGAEBkFCJFRgBAZBQiRUYAQGQUIkVGAEBkFCJFRgBAZBQiRUYAQ\nGQUIkVGAEBkFCJFRgBAZBQiRUYAQGQUI+ReiKxR1Hm/iFgAAAABJRU5ErkJggg==\n", 170 | "text/plain": [ 171 | "" 172 | ] 173 | }, 174 | "execution_count": 7, 175 | "metadata": {}, 176 | "output_type": "execute_result" 177 | } 178 | ], 179 | "source": [ 180 | "rules.apply(mol2)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "The procedure was thus be enhanced by including a neutralization step _before_ application of the rules as well as after.\n", 188 | "\n", 189 | "Recall that neutralization _after_ the rules is still required as some rules 'expose' a removable proton on cations (_i.e._ the various 'conjugated cation' rules)." 
190 | ] 191 | } 192 | ], 193 | "metadata": { 194 | "kernelspec": { 195 | "display_name": "Python 3", 196 | "language": "python", 197 | "name": "python3" 198 | }, 199 | "language_info": { 200 | "codemirror_mode": { 201 | "name": "ipython", 202 | "version": 3 203 | }, 204 | "file_extension": ".py", 205 | "mimetype": "text/x-python", 206 | "name": "python", 207 | "nbconvert_exporter": "python", 208 | "pygments_lexer": "ipython3", 209 | "version": "3.5.1" 210 | } 211 | }, 212 | "nbformat": 4, 213 | "nbformat_minor": 0 214 | } 215 | -------------------------------------------------------------------------------- /standardiser/docs/Rule_application_strategy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%run notebook_setup.py\n", 12 | "\n", 13 | "sys.path.append('../..')" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "from standardiser import rules" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "rules.logger.setLevel('DEBUG')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "# Rule application strategy\n", 43 | "\n", 44 | "This document discusses aspects of the rule application strategy, as this is not always straightforward.\n", 45 | "\n", 46 | "The document was originally created to illustrate a problem with an earlier version of the code. There, the transform for each rule was applied repeatedly, with the *first* product of each reaction being taken as the input to the next, until the reaction no longer produced a product. 
This was to handle cases where a moiety requiring rule-based standardisation occurrred multiple times in a molecule, and it worked for most such cases.\n", 47 | "\n", 48 | "However, that approach failed for molecules such as this one (which is a simplified version of real examples)..." 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAILElEQVR4nO3d3XKjOBCAUdja939l\n9sITr8f8xNCAuuVzKhe7qakZYqTPQnbwOE3TAMBR/7Q+AIDaZBQgREYBQmQUIERGAUJkFCBERgFC\nZBQgREYBQmQUIERGAUJkFCBERgFCZBQgREYBQmQUIERGAUJkFCBERgFCZBQgREYBQmQUIERGAUJk\nFCBERgFCZBQgREYBQmQUIERGAUJkFCBERqE74/j/19v353+SsH9bHwBwqnEcpmn1f7mA1Sh0ZB7N\nabLkvJqMAoS4qO/C63Jj+4LOJd6XszK9gIzWZy+Mz82fVglzUV+cvTBoTUahI/MnUVcn13NR3zsr\n02/zVlINvZ6M9s5e2BdaS+f8+yJ7Bhf1ACEyWpy9MGjNRX199sJ48iTagox2wV4YtOOiHnphKdqI\njAKEyGh93sMETckodMEVfTsyWpzJA63JKECIjAKEyGhlruh5MBKaklGAEBkFCJFRKM4VfWsyWpbJ\nAznIKNTml9iak1EobBzHyUVJazJakyt6SENGAUJktCTbYQyu6NOQ0XpMHkhFRgFCZLQYS1HIRkah\nJE+oecgoQIiMVmIBAgnJKNTjCTWVf1sfALDD6INg85HRMixAvtNbN59jwHjIQ0Yhl7VuvpmmSUmT\ncBpqMGH6No7D81d8d51oAyMDq9ECbId16fWsTtMwDEdqaE2agYwm9ZrOxyQxWzowSyc9MDMTmadz\n/gecrzzmN31dvA3sDek0MNqyGm3s13S+cgWXzfbtsx/n9obTZWC0JaMNvJRz99A3YVKZpq2Snn6W\nNk69gdGQjN7h7SWil6Fu0LPDdiuVtBUP+iXWu3nW3+/Etfdch87/4+J/d+vsGxv38zv1+8zfevT6\nnXH88zVNf32d7rHuOP/v5ZDHpf2N/9zW2Tc27ieju20M0dO7abZwgLFxMxnd7c6lh3VHFakWpJ/8\nAU4ko9kpaRU3b0gqaR4yekSqpYfZ0sq8m9lKyj1k9CAlJQMDIwMZLcOEYZGB0ZyMHnfzgvRXJszN\n8jzYStqWjIbYCyMJJW1IRvdJ/qqC2XKbe35h6UTGxnVktB4lZZG3QLUioyUpKYuUtAkZ3SfPCFTS\nhtau6DM85Ep6PxktTElZ5NTfTEZ3SPiqQv6Sjj/aHsYNUg2P/AOjJ27bXF62W/mufcy6+2DeLNvA\n6JjHcYdUy403bW/l++EnSvUxbzc2RhP+cO7xfAOr0U8lnzw3Lz3WlpzbrICycUZOIaP9uHRKHOvm\nXK/zNsmz6VyvD3gqMhqSdvLM7Z1OZ3WzM4XO+NPGqVfYU8hoV4KX9h9ucQZZH93PY34pGf1IoTXI\nrpK2WnJWntVj0Y/Frvlo1yCjx6Vt6yclff73jce1cBi15vb6pXHSkcA9vP2+T5/8RmDzhHkTeFvl\nnsbSktHfFV1rrEUq1eTppaQd/AgcJ6OfqDpJ8uRyQ/WSpn
paogkZPajoEjWnEiXtLJed/Thtyegv\n1kdb9mm/KO3kKVFSWCSjACEyusXvftwp84K0s5FQ9LDTklESyVxSWCOjX6TEGkRJKUdGV3V2HVdI\nlZIaCTzIKBmlKmlnuezsx8lARvepOwTLHXmGkm5/ilStx5PruDXJsnLR6ZIPkqIEGSW1G0r6yV1W\nK96PalEfP0U2MrqgvxeX6h754IOkSE9G6Z8PkuJSMrpgbbaYP634IKlTeA64iIwus+7IxgdJkZYB\nsaWPCdPHT/GQ4YOkhrIPadHDzs9qdIulRzY+SIqEvP3+FxneBM6r6Ufzw6g1MHT/OjL6u3IT5pXJ\nc53SA4MTyehHTBgWTdNkXCCjn1JSFk3TkH9cuCi5lIzuUK6kJs898pR0+14qXMQr9fs8LuKkiTeP\nkjYZGPe8YZYNMrpbwwkDD7vS6aLkajJ6RImSmjw3u3RUvGRzHKw6k5HRg5KUNMN70Xk6a1TMtzdf\n/k4nOh0ZPa5VSe2FZXZ4VLym01mtRUZDbiupdBZybFRcthtgb+dyMprXTzmP7IWZPG1l2PPxzqfb\nyGjUuRNm6cpODUtqu9sz+CCpG8noCYIltSnGMR/eKtD9qK7mwT3NgZI+ZsHpZ8Cc6VXkbU9GxXU8\nsmfKsx3mtPbhbXszeFaV9CIu6s9kO4ygc7vJPUy2Yj7/5AwlzWN+mfL2nedZvfqMGRVX8JgWMI7D\nYy9ssB1W03wTvOH+j1FxOhf1Sc1evj8y7r1Em0eGd5L+HIlRcTIZTeSKdz6ZM8wZFeeS0ca8afR7\n5FmQDkp6Kne/v8r8N/HevjOOfybV8+si5W7a37E898kfHvvseY6mMhm90PYQvTSds39LSVmipGeQ\n0QulGqJKmkSqUTEMCQ+oHhn9Ii7ikki3IWlgxMjotdKNz3QH9BXm3UxXUgJk9HLpwpXugEjAqAiQ\n0a9kzjBnVBwlo3fIOD4zHhOtGRWHyOhNMu6FmTPMGRX7yehVvKpAVUq6k4x+NxOGRZ7z95DRr6ek\nECOjKCmEyCjDMCgpHCej/LAdBofIKPCBx40dH19v35//yS/jts3Ab+afwOfa5YXVKLBpHk076X+T\nUYAQF/VAzNevTGWUdWuftze/yrNZ9s3mg+HLyCgrvKoAn7E3yhKvKvA0P/WeU/9mNQr85q2kGvo3\nGeUQK9Nvs5ZOd4SUUQ76+lcV4MneKECIjLLEqwrwMRf1rPCqAnxmnEwPgAAX9QAhMgoQIqMAITIK\nECKjACEyChAiowAhMgoQIqMAITIKECKjACEyChAiowAhMgoQIqMAITIKECKjACEyChAiowAhMgoQ\nIqMAITIKECKjACEyChAiowAh/wEUmkA/Qrcj6AAAAABJRU5ErkJggg==\n", 61 | "text/plain": [ 62 | "" 63 | ] 64 | }, 65 | "execution_count": 4, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "mol = Chem.MolFromSmiles(\"Oc1nc(CCc2c(O)ncnc2O)nc(O)c1\")\n", 72 | "mol" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "# Reaction defining rule (for rule '2-hydroxy pyridine -> 2-pyridone')...\n", 84 | "\n", 85 | "rxn = [x['rxn'] for x in rules.rule_set if x['name'] == '2-hydroxy pyridine 
-> 2-pyridone'][0]" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 6, 91 | "metadata": { 92 | "collapsed": false 93 | }, 94 | "outputs": [ 95 | { 96 | "data": { 97 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlgAAADICAYAAAA0n5+2AAAKnUlEQVR4nO3d4XKjNhQGUNPp+78y\n/bHr2sGIAL5IV+Kcmcw0yW7tlczVhwBpmud5fgDAcVPrNwBZ/dP6DQAAjEbAAgAIJmABAAQTsAAA\ngglYAADBBCwAgGACFgBAMAELACCYgAUAEEzAAgAIJmABAAQTsAAAgglYAADBBCwAgGACFgBAMAEL\nACCYgAUAEEzAAgAIJmABAAQTsAAAgglYAADBBCwAgGACFgBAMAELACCYgAUAEEzAAgAIJmABAAQT\nsAAAgglYAADBBCwAgGACFgBAMAELACCYgAUAEEzAAgAIJmABAAQTsAAAgglYAADBBCwAgGACFgBA\nMAELACCYgAUAEEzAAgAIJmABAAQTsAAAgglYAADBBCwAgGACFgBAMAELACCYgAUAEEzAAgAIJmAB\nAAQTsAAAgglYAADBBCwAgGACFgBAMAELACCYgAUAEEzAAgAIJmABAAQTsAAAgglYe0zT6+v9ZwBX\nUXega/+2fgPpTdPjMc/l7wGiqTvQPTNY31ieXVKXM/wY2rEv6g50wQzWN5xRtuMMP4Z27I/+gS6Y\nwWI8zvBjaEeA08xglTzP5Of55yDj7DE/fRRDO7aj7kD3BKw1y8ska8VNwQOuUrpUq+5ANwSsJfeg\n5PXeN87wY2hHgEtM86yi/k+4ykvfXEO75tNXn7hJDwrc5P7UV1G7F30DQGcErMfDAA7koBbBMG4f\nsCYFLTf9cx1tC3CZdAFruezOcnHpyKV5pml6uAUtMQGAO/F5h6GkfIpwzxPKy5ClLg3GYMPNTI/H\nwycexpEyYD2fHN8aX5e/ewWu6e/vt0uV2av9au+mom8A6F3KgHXGazz+8x/TYorrfcA2gB9XClXR\nSyjpm0rMEKbicw/jSRuwlusfHv/7P4vVe+BSyI4rzSq6bAsAn1IErK17rqJuaH+GquXMFnFctu2H\n+30ArtU8YP227V/0eDvPs4H8pKOB12XbnLQ1wPWaBiy3geRWuiR4dhLQZVv4JPDCmJoFrFrhSvE6\nZ7mv8rvypcD135e4bFuf4wGgjiYBa+uJNLW/vaP9sB24fr//ymVb7siJBYytesAqDaRXhau1wduA\nfq2fzfo5S6Xd2/CZb8s9iHAvVQNW7XDFcVcG3ddr7HuqkDhmS+o78hStkz4YT7WAJVzlV6sv1j4H\nBphYpdkSbXyd8r2I+9rbMQBjmeYKR3M5XNUrJmuvpZi9ZAi6+uOco/e8aec43+xkkKEuBjA1CgUN\nnyLsqog0Ueu+JX3Rl+2nNn/vRzMlx2ztxflNE5b6Qf/AGC4PWGaOzlm2Uekemm/bMVNfGFjKovd8\n1NbH1HwIZ+vnQD8uDVgKxDmlglv6s0t721z/5FOaLfm2m/T1d0p7ccb8v4UsGFGaZRqu1styDUff\nT+kejt/+bLZ/Ny9XDOS9fP4BRnFpwFoWcMV8W1T77JntytoPBv3rZkuErO88++Wa/7dZLBjNPzVf\nTJEoq1FE53k+NOPFeJ4D9m8/Y92Vh2ipH/QP9KlqwGot6+DiDJWlGrMlv/2MdUIWsMetAlZGLcLV\nslhnK97Z3k8rtQdy7f6pxfpwQhaM4fKApSiUmbniN0JWPjWaR8iC/lWawcoTIhQo2FbjGJmm6f8v\n1glZ0DeXCB9tCla22atsRTvb+6kh0+WoaO+B6vnZf3711M8ZtpR6PO55fEBvqgSsK2/YPWNrGYNa\ng
03rcKVA96HV5ahvPx9bgaq0FEE2GcLUVttkbTfgj2Z7EWby27Y0kWEoQ7gqa/u+crfNtVoP5t+s\nkfXNbgJHX+uOtA30aZorHrWtB5E1tQaQva9V01p/tOqjUttka7OrZOmLPXuH3uUE5PPf3bZ+rfVD\ngrYyhQYFt57B2lug9izOuSekJSiGKWmbPLZmst6/r/n6eUyPlrO8OdsEKLltwPq2iK+d5Zd+n3XA\nuGpLliO22iZru0Vr3QdLW9u2tHz9mlq/PtC/qgErw4B+la3A1VOhztJH9xrgPmdGWvdB6xOEDCHr\nXab38njkez/Ap1vOYNXa949td7/valvby1EAfKf6Olitx02D9+9qNI9w9cfd/r1HWIYA6NmtFho1\nmH1qsUaZcLVNO7xkWAQ4W39kez/AutsELEUpB+GqLxn6xUwW0KNbBKwMg0RmtZpGuOKsmvsjrr02\nwFHD3+Ru8M5BuPq0Z1FPXqKeLNzaeub9z2Tsh6zvC/hUPWAtHz9/fn/FY+mKEZyX8fg5GrLOzkhl\nWyYC6E+TGazWa/xQl9mrdaVV09m21m57ZqUiXgdgryYBqzRj9ayRyxmu5d/dQ2HMQbjaZhA/p9b2\nPZn6J8v7APZJdQ/WWu0ohbC/3xm8A5Qu2z7/++lokwpX+2QaxHtSd+ses+7AMc0C1tn1l34WuddZ\n7LPYepz7nNIA8s1sonDVL/30U5YtpIB+NF2mIaJYvQcrg8J5ewLvPP/8mqbn1/rj7Uv6p8xaT/m1\nWJT3ybED/ak+g7WsEc/vv60dyxuFFaTrvZp3ffawh8feM3EpKj8zWcBeqe7BOkugivHtGframk6l\n37EuwwDuWAL43i1Wcme/yHF1nmcD9Qk1LkU9L+vuvbzLS+1Lhc/Aq5+gL0PMYK0xq7Vf6bIt7UTO\nZP0Zl8uXbzmu1kzjchZYTYN+DBOwFJ+89M05Zwfx9ac9tX20Kz/Oy6ejX6/pWIJeDBOw1ihG9O5I\nyFpbqPcox0sba+v7uSQIfRsqYAlUjGjvx/nbYEU922vKzX9/tl7P1Dnow1ABa40SBJ8sqdHW/tD8\nuZjy+8/1E+Q1zaMdoaVNDgf7Z/bIgNDGK0vtXzZDX+1X2moquuys9UmCfjL1CQXDz2DB3ZQvP+0f\niM2QHFPjHG6tT+YMC6cBq8ZbB6vlfhZsMgRc67l10XJLI65XKjvPPol7nZWb39U8SGm8gEVeBoKv\nLZvv/fvoQOVJtu9dEXLntePIsQXp3CNgKT4MpOZHeXUwZ1XVphKyIL17BCzyMAh8rXoT6rPdml+S\n1VeQxpgBq3R2B5xj4C4qbTV1eckp9YlaBymMGbDIzWD9tSZNqN/y0SeQ1rgBy1kcg2vyETeg56NP\nIKXxFhoFrmftJf6Q7KBg3Bks4DrCFcAmAQsAINg9tspZrsYIMDp1D5oaP2CVdmIFGJW6B825REg7\nz43a3s+0PQ3VH/0I8GH8GSxycoY9Bv0IsMoMFvksZ0Pok34Ebmz8GazlInzOrvPTR2PQj+2oe9Dc\n+AHr8VBcgPtR96CpewQs8nGGPQb9CLDKVjkAnOUmOyhwkzsAQDABCwAgmIAFABBMwAIACCZgAQAE\nE7AAAIIJWAAAwQQsAIBgAhYAQDABCwAgmIAFABBMwAIACCZgAQAEE7AAAIIJWAAAwQQsAIBgAhYA\nQDABCwAgmIAFABBMwAIACCZgAQAEE7AAAIIJWAAAwQQsAIBgAhYAQDABCwAgmIAFABBMwAIACCZg\nAQAEE7AAAIIJWAAAwQQsAIBgAhYAQDABCwAgmIAFABBMwAIACCZgAQAEE7AAAIIJWAAAwQQsAIBg\nAhYAQDABCwAgmIAFABBMwAIACCZgAQAEE7AAAIIJWAAAwQQsAIBgAhYAQDABCwAgmIAFABBMwAIA\nCCZgAQAEE7AAAIIJWAAAwf4DedSLBfYCmKUAAAAASUVORK5CYII
=\n", 98 | "text/plain": [ 99 | "" 100 | ] 101 | }, 102 | "execution_count": 6, 103 | "metadata": {}, 104 | "output_type": "execute_result" 105 | } 106 | ], 107 | "source": [ 108 | "# Run transform...\n", 109 | "# NB As the transform is a one-component reaction, only the first component from each product tuple will exist\n", 110 | "\n", 111 | "products = [x[0] for x in rxn.RunReactants((mol,))]\n", 112 | "\n", 113 | "# Sanitize product mols...\n", 114 | "\n", 115 | "for x in products: Chem.SanitizeMol(x)\n", 116 | " \n", 117 | "# Keep unique products only...\n", 118 | "\n", 119 | "products = list({Chem.MolToSmiles(x): x for x in products}.values())\n", 120 | "\n", 121 | "# Depict...\n", 122 | "\n", 123 | "Draw.MolsToGridImage(products)\n", 124 | "\n", 125 | "# Note that, in each product, only one example of the target moiety has been fixed..." 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 7, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [ 135 | { 136 | "data": { 137 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAJRklEQVR4nO3d2XabShAFUHFX/v+X\nuQ+KZYUZiqarYe+VB0eKHQzVRz0wdH3fvwA46r/aGwDQNjEKECJGAULEKECIGAUIEaMAIWIUIESM\nAoSIUYAQMQoQIkYBQsQoQIgYBQgRowAhYhQgRIwChIhRgBAxChAiRgFCxChAiBgFCBGjACFiFCBE\njAKEiFGAEDEKECJGAULEKECIGAUIEaMAIX9qbwBZdd3v131fbzsgOzHKlK77JzoHfwW+GNQzMg7N\nvv+ncwp8EaMAIWIU2KDrfv8MXh//y4cxNwqsMVe+SG8UWGSufI0YZWTcSPQ+YJ5B/eNNRuQgSWUo\nCx7fMxWjz7bQzRSdbDQoleelqkH9gxmqwxnE6FPJUDYyV77GoP6RNAN2MVe+qOvtkadZzlAJCzsZ\n1PNFhvL2vGWiCDH6MAtBKUN5Uwk7idEnkaGsUgn7idHHkKGsUgmHiNFH6GQoq1TCUWL0/rrO+Ris\nkaEBYrSUhdswXnmHxpUM1XggTIwWVP2kERnKJiohRowWVPeujDKUTVRCmBi9p+UMXVpx4lFUwhlc\nU1/Wu0M6LtSivdTVDLXidLFxDWSIL5VwFjFa3GSSLt+h8f3XYxUuQ3PKkJvfVMKJxGhG7/I+cEsd\nGZrW3LikCpVwLjF6hWNrTYNHMY5fHJChbKESTidGLxJctf+U/c8P6V6v16AxyNDkFjqkWz4mT6ES\nSrBPG9aNgnnyaGo51X2n5/vr1Tx9O/e4qYRC9EbPdPHk17hJfIL185aW05bltUdHMicxepoMCwjv\nxPyEqQxNaNf0znyqTszqLFMM5YjRc2TI0I++799JqtnkdHii/Ot4/vN5+fOuyfE67NwTnJ6hkfNG\nf36CI/s4c6mqGErTG40q0Q+dHMppCG25foAyyMrxRDmFuKa+AX3/dxjYdROr8zPf0m/8l5SQYZKn\n73sBeg290ZArW8vPf/T7/+lu5HSsKgqdOvr+QFUhRYnR46o/7/2zLm9BKY/Dx31whcXogov63Vvm\niNGDqmfoh+5GHqcc97m72CxcwEZdYvSIPBm6QLxerPRx/57VMf5IRYzuttpa1Dal7VqU94Famhjd\nJ2c5Jtykp6k7QPk+RfSlHi7nhKcd1u5Ed+W2bOK0p2vkmeRxklMVYnSr1QytW73ispY8GfrzP05U\ngg/UosToJskzlFq6rkuVoVQhRtc1kaFz3Q3dkHJWJ8qrFIYjfj0xumLDkzmu3Bz+Xm5QPSlyLjYu\nEK/lWKlf4ulG2Xzv8+23iSu6GVPvVh6gKMuLCYJZd0rJe/wuq59qn6+L/rLJM3TBPcogIb3RaQou\nmw0TkRM3bRm/VXQzlM0zidEJc43h87rWcrG9O3zuIp/Jd8+SqiomN8blTIWI0d0yF2LmbavorFS1\ne5mkLKatdkjTanfL55Te8o2Tqm0N5+9XBpnpje7gfJHrXdDst0yq7pqZzWBu/G5cX4IYnTZZbU0U\n353ayfW/iMcZcYDT75+iubOvM3wYLN/pI8NVAAuq773nEKM3dIP2kyFDPxY+gfJs5HbNfaDmJ0Zn\nLVSbKiwqVYbOeW+kSuAlRh+liWbfRIa+GumH5j/c9yBG76nR9tNKhsI3MbrE3eeulDlDFyrh+o3Z\nTgFfQ4zek3YClxGjK+bzKHU3ZE7aeM3cFV2Wc39+pD3idyJGD+r7jM+w+9ZQKjWRoe3m0cJTl6/f\nmFsSo1tkb+FzmmgnTWToAnmEGF031/FM3vbn4ilVs289Q3/c4FfgODF6T6vxlOFCxrtkaAOs1xfl\n1iQ3tP2WbmkfZ5TT/G2TUj87hNLE6CZz7SRn49l+W8yF+xlXfJxRbk1u9p3u+5WQGH2QJh5nBM0R\noyHZRnMLCdXE44zymzviyffH8nrjY4/mWcTofRRtDx5n1C77v
DT7d4dUHc+Buk8KOuVxRq3IXAZj\nbT1CqlF6o3dQvamc8jijVjS03li9MB5CjN5BqqbicUZ5pCqMG7Mr90nY41igqRQyVwatlIfCOJer\nmBq2fAmKpvJYrk26mJbWKtNedaXteG65DlhtnEtv9Ia0k8eSoVWI0SadeJo9tyFDa7Fbb0U7uVLa\ncf0ktVGO3ijcnwwtSoy2xNJ8Knl2tsKoS4xW0HVHnuNkaZ5JCqM6VzFVMKjqT6QuVLumwiSFkYHe\n6D7jXuTnlYW3lvX93z9zvVRNhUkOfRJ6o7uVW58d9VLXr0bXkFrxPpjnHiunNyUhRne77FbNgycm\njZuEdtKQ94EaT+CUiNeX2riWGG3AZHvQTlo0d9v8n3id/rwccJp9NmL0iIU7Tl5DO8lgXAOHhyk/\n39W/1iZzZGhCYvSgySSdW4J/TY3mDtNO8igxvTM5ezP5+tT2qI0KxOhFBnNhk2/RnGsmyt/JuHr7\nOxlaixg97t2EDnzXwNcPWW8GmkpDzl0+MpZPS4yGHEvS8Q/5fLn8bLh3U9FgUlnokA5W51/FRh5K\noi4xus/cSuvyW3t+/sSz4T5jOk0lp+Wh/ffrJSJVYVQnRvOaO29Uh7RdpSOVKsRoau+4lJj5HZje\nGUXqkUeo+kDNQIzCOSIT5X3/ep80+hqtyLsUOD+HIbvJpqL9PMdcqqqBPPRGm2R69DkGR3nLDWu4\nmBvlQUvMlSckRrN7dzxrbwW5qIpUxGirNCRIQowChIhRaJLhSB5itAEaDGQmRhsmXiEDMQoQIkbb\nMNfxdALhkxmOJCFGG3fKHU+BADHaDB1PyEmMtkPHkxHj+gzEaPvE67MZplQnRgFCxGhTdDwZUxW1\niVGAEDF6C/ojUI8Yhfb5HK1KjLZGg4FkxOhdiFeoRIwChIjRBs11PD3p7MkMR+oRowAhYrRNOp6Q\nhhiFuzCur0SMAoSIUbgRsz01iFGAEDEKEPKn9gZwhu+FBcM6uJYYbV/X/ROdg78ChRnUN24cms56\ngWuJUYAQMQoQIkbhdrru98/g9fG/JMwSE9yLJcfL6Y02brygpNk8mSXHGvRG2zdoJzIUriVGb0F0\nspGeaQFiFJ5k8IkrVc9gbhQgRIzCjVhyrMGgHu7FkuPlut5eBggwqAcIEaMAIWIUIESMAoSIUYAQ\nMQoQIkYBQsQoQIgYBQgRowAhYhQgRIwChIhRgBAxChAiRgFCxChAiBgFCBGjACFiFCBEjAKEiFGA\nEDEKECJGAULEKECIGAUI+R8ols9bhpgMpQAAAABJRU5ErkJggg==\n", 138 | "text/plain": [ 139 | "" 140 | ] 141 | }, 142 | "execution_count": 7, 143 | "metadata": {}, 144 | "output_type": "execute_result" 145 | } 146 | ], 147 | "source": [ 148 | "# Re-run reaction on first product...\n", 149 | "\n", 150 | "prod = rxn.RunReactants((products[0],))[0][0]\n", 151 | "prod" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 8, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [ 161 | { 162 | "name": "stdout", 163 | "output_type": "stream", 164 | "text": [ 165 | "Sanitization error: Can't kekulize mol \n", 166 | "\n" 167 | ] 168 | } 169 | ], 170 | "source": [ 171 | "# An attempt to 
sanitize this 'molecule' fails...\n", 172 | "\n", 173 | "try:\n", 174 | " \n", 175 | " Chem.SanitizeMol(prod)\n", 176 | " \n", 177 | "except ValueError as err:\n", 178 | " \n", 179 | " print(err.args[0])" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 9, 185 | "metadata": { 186 | "collapsed": false 187 | }, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAJO0lEQVR4nO3d23ajRhRAQSkr///L\n5IGRRhHievpO1ZqHxJPYWHRvNyDDc5qmBwBX/VN7AwD6JqMAITIKECKjACEyChAiowAhMgoQIqMA\nITIKECKjACEyChAiowAhMgoQIqMAITIKECKjACEyChAiowAhMgoQIqMAITIKECKjACEyChAiowAh\nMgoQIqMAITIKECKjACEyChAiowAhMspdPZ87H3k+//6Bdf/W3gBo0vP5mKbVf4UPVqOwsIzmNFmT\nskZGAUIc1P/f54rDQdzwLDBJQUY/OB12N1/7V1W5xEH9i9NhwCUyCgvLn6AOTVjnoB5++SqphrJO\nRrmrZRmXZ3XgAAf1cInz5rzI6IvTYZziCiQvz0kpPh08HaawzIwEZPQ684eZkXB7DuqvckzHzEi4\nPRkNMH8AGY1SUh6Gwd3J6DEbk8QU4mEY3JqMHmOSsMsguSsZPWxjkpg/zIyEW5LRRMwfZg2MhN2n\nTJGWjJ6xPUMamD80YZqetUdC7a9/LzJ60l5Jq88fWjDVHgl+ppcko+dtjtDq84dGlBwJHgJdl4xe\n0l5JN06HOVNWS9aRMKdz/jNNv2/7Z0eX4X6jWczzp/D9Cvxu9/DO3kh6LqlRkZuMXrU3QsuX1Jxp\nUHwYuAd/+2Q0YG+GVFmT0poLw+B1KmB6hNPp0L4AGQ2pFcr3xPj5MNPl5phIde2Ok69TqGlHlJLm\nZq2UwO4Mib/IX9PgZyjfH5z/+f2RZVgd+1fxORKydpPCrEYT2F5rXFux7naT7nxeuK/bTT9H05LR\nNOIl/Twd9oh100Fcy9IGdO30zt42KGlKMprM2ZI6HXY3aU+jz/v38udT0oRktKjPdCbv5tpHdp/H\nTvuWq874TlTSVGQ0pZ8L0kZOh9GdhG97WqOkSchoYj9LKqCcOqKfA1pm2ChpnIym91nSHO8qvXZV\n4f3/mjCtWR6v+LnbFxnNIsnb8pe5DF5VeFh6NKbkqnNjvxsVQTKay7W5sX06LMlAN2faUfJmYNv7\n3aiIkNFWOB02sEZuraCkmbjfaEYbk+f58v7INE3lbwdFXeXzeuA5OIbFaVajpZVcdcLS9tBzW7IL\nrEZLK7zq3GBBWsZalZqtlQfhnCWjuWxMnuJbsvpXSlpR9YZuDgwlPUFGx3fgdFjBraEZewNDSY+S\n0aJqHce5sFBLs0fus+1dr6QHyWgWDU6e9pcey3cvDKydb3N717cwMNrnSn1Rddva2iXatVsFNvhD\naGw57jt+KzLKXwUmzJFbBZq35SlphJcmi5bH3O5vqqS+u/DFu1O3/Boe1N1bnbZ1utkFWI2m1/jk\nma8q5NuSVHf1twKq6Ocrb4+skdE7SnsE5yGXZ7X8Evktuwtk9KaCJc33NJRTm9GyTrd849J8p99R\nATKaWONH9J9OlbTWkrPrkvZofrW95qfI6K1tT5VGnqs+2Kzu4ntZbmEXm12Lt98X0vgQ3H4DdvWN\n7+5N4P1Gp9PNrktGU+p38vzU1LfTXUl/auolJR
UZ5e/psNobsqOLjRyS+m+T0RJM/lS6KOn2hbvC\nG3NK+69tm2SUPxqf4W9dlJRbkdFkOnqr05fuNltJc+h3AFcno1BCD93XyotkNK/2f5L3ML1/aHZB\n2v4eP2WwbycTGU1jbbR1OgS7mDzNlrRH6/f9an0YtEBGb62LXG7opaS9v85sk9Gb635uN1XSwXK5\ne2taZjKaQDvPUr6nFkq6/RSpeXhU38gNchnh1iT3tbbW6HFJ1cWDpAa7xwpvMppLzxOmy832IKnL\nfv5AdUR/nIwyjuQJu3aX1Y3N6DGy7JLRqH4nzMoR/a3XIAUeJNXakLj5Hk9CRjNqv6Tj8SApyjPJ\nE9h9bFFHL/IYa5MWHiR1ZDOaHRhjDINi2t2Rfeluwgx/VaGFB0k9OhwYs5FGQgGN7sUeNTslfho+\no4+PetbdLz2WdLCRkFuLu7BfbU6JpfV3jJo8WfRVUsPgLL/FVFTLv8dCPtu/Z9VUQ7mgrR+DA9hd\nWbS29HizBsmt2V1PkNVoYru/393CL4BTxfauNyr6JaPpKSlrNna9UdEvGc3iSEmLbcwRjuiLUdLx\nyGgupgQX5Bg2z+fDSMzKOe+8+nqnC8UU2PXvdBpiuZnG2SkpPyXf9XM3jabyzOESlJSf4gPjdQZg\neghoPc6N1ucs6m3tvi1/7W+fL9MfFxu6/PRG4gXWQYWUXHI6uOvLkTXp3NPkQ2g5VLxn4wKr0ULK\nLDnna7KX1yZUsb3qfLxKmunH8DRZgUa5bXM5R+4ofOQ/mD/Z/z/z9z/Ql7WxkWn5+TBUknJQX9rl\nkqY6rFsetb0/svFXFJDjzM/XSnN319vjF1iNlnbwKRfLe2UmfVKbqdKiJE+d2e7m+pc2Kq6T0Qp2\nZ0vu61HmzGC87akuGa2j5JMjXbjvyPEFafInSrnWdJmMVpP1uaGf8+HnV1hbkJpI1W0MjNwP41PS\na2S0prQl3U3n4qv/KOnyEgTlfQ6MrOlcfj5HLRfIaN/OppNe5HvLPcnJaGUXFqQJryc4iGuchnZB\nRus7cuH+679P+tWVtEVuWNMRGa3v/UvT72mTu5trH3GmDC6Q0VZ8/mK1ZcjNWYr2RUYr+5wwnzPH\nRIJeuMMTQIiMtshS9M7s/e7IaE0mDAxARgFCZLQ5lqh3Zu/3SEYBQmS0GusOvhgSnZJRaIWCdkpG\n22I9At2R0Trkkm+e69ItGQUIkdE6fq46LFGhRzJag8M3vhgSPZPRhphG0CMZLW5t3WE9An2SUYAQ\nGYXaHIh0TkbLckQPw5FRgBAZhaociPRPRgsyYWBEMtoAeb0zu75/MlqKVsKgZBQgREZLWVuKWqVC\n52QUIERGa7MUhc7JKECIjEJSz+fOR57Pv38Ywr+1NwDu5OuKoguMQ7AahVKW0Zwma9IByChAiIN6\nSM0C82ZktIjPeeVc2PC+drGqjk5G83NVAYbm3Ghmrirwttz1fqYOwWoUCvoqqYYOQUYhqWUZl4cj\njMVBPUCIjAKEyGhmrirA6Jwbzc9VBRiajBYhnTAuB/UAITIKECKjACEyChAiowAhMgoQIqMAITIK\nECKjACEyChAiowAhMgoQIqMAITIKECKjACEyChAiowAhMgoQIqMAITIKECKjACEyChAiowAhMgoQ\nIqMAIf8BxC5FEgNvsDUAAAAASUVORK5CYII=\n", 192 | "text/plain": [ 193 | "" 194 | ] 195 | }, 196 | "execution_count": 9, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "# Re-run reaction on second product...\n", 203 | "\n", 204 | "prod = 
rxn.RunReactants((products[1],))[0][0]\n", 205 | "prod\n", 206 | "\n", 207 | "# Note that both applications of the reaction have been to the first ring..." 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 10, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "Sanitization error: Can't kekulize mol \n", 222 | "\n" 223 | ] 224 | } 225 | ], 226 | "source": [ 227 | "# An attempt to sanitize this 'molecule' also fails...\n", 228 | "\n", 229 | "try:\n", 230 | " \n", 231 | " Chem.SanitizeMol(prod)\n", 232 | " \n", 233 | "except ValueError as err:\n", 234 | " \n", 235 | " print(err.args[0])" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "As is hopefully clear from the above, the problem is that, on the second attempt, the transform is applied to the same ring as on the first, so a non-physical 'intermediate' is obtained. This may be depicted but not sanitized, which means it may not be used in any further transformations.\n", 243 | "\n", 244 | "Thus, the desired product, where the transform has been applied once to each ring, cannot be obtained by simple serial application of the transform to the first product. A slightly different strategy must thus be applied to ensure cases such as this one are handled. \n", 245 | "\n", 246 | "Now, all products are used, and the transform is applied to each until the final molecule is converged upon (or we run out of iterations in a few pathological cases)..." 
247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 11, 252 | "metadata": { 253 | "collapsed": false 254 | }, 255 | "outputs": [ 256 | { 257 | "data": { 258 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAILElEQVR4nO3d3XKjOBCAUdja939l\n9sITr8f8xNCAuuVzKhe7qakZYqTPQnbwOE3TAMBR/7Q+AIDaZBQgREYBQmQUIERGAUJkFCBERgFC\nZBQgREYBQmQUIERGAUJkFCBERgFCZBQgREYBQmQUIERGAUJkFCBERgFCZBQgREYBQmQUIERGAUJk\nFCBERgFCZBQgREYBQmQUIERGAUJkFCBERqE74/j/19v353+SsH9bHwBwqnEcpmn1f7mA1Sh0ZB7N\nabLkvJqMAoS4qO/C63Jj+4LOJd6XszK9gIzWZy+Mz82fVglzUV+cvTBoTUahI/MnUVcn13NR3zsr\n02/zVlINvZ6M9s5e2BdaS+f8+yJ7Bhf1ACEyWpy9MGjNRX199sJ48iTagox2wV4YtOOiHnphKdqI\njAKEyGh93sMETckodMEVfTsyWpzJA63JKECIjAKEyGhlruh5MBKaklGAEBkFCJFRKM4VfWsyWpbJ\nAznIKNTml9iak1EobBzHyUVJazJakyt6SENGAUJktCTbYQyu6NOQ0XpMHkhFRgFCZLQYS1HIRkah\nJE+oecgoQIiMVmIBAgnJKNTjCTWVf1sfALDD6INg85HRMixAvtNbN59jwHjIQ0Yhl7VuvpmmSUmT\ncBpqMGH6No7D81d8d51oAyMDq9ECbId16fWsTtMwDEdqaE2agYwm9ZrOxyQxWzowSyc9MDMTmadz\n/gecrzzmN31dvA3sDek0MNqyGm3s13S+cgWXzfbtsx/n9obTZWC0JaMNvJRz99A3YVKZpq2Snn6W\nNk69gdGQjN7h7SWil6Fu0LPDdiuVtBUP+iXWu3nW3+/Etfdch87/4+J/d+vsGxv38zv1+8zfevT6\nnXH88zVNf32d7rHuOP/v5ZDHpf2N/9zW2Tc27ieju20M0dO7abZwgLFxMxnd7c6lh3VHFakWpJ/8\nAU4ko9kpaRU3b0gqaR4yekSqpYfZ0sq8m9lKyj1k9CAlJQMDIwMZLcOEYZGB0ZyMHnfzgvRXJszN\n8jzYStqWjIbYCyMJJW1IRvdJ/qqC2XKbe35h6UTGxnVktB4lZZG3QLUioyUpKYuUtAkZ3SfPCFTS\nhtau6DM85Ep6PxktTElZ5NTfTEZ3SPiqQv6Sjj/aHsYNUg2P/AOjJ27bXF62W/mufcy6+2DeLNvA\n6JjHcYdUy403bW/l++EnSvUxbzc2RhP+cO7xfAOr0U8lnzw3Lz3WlpzbrICycUZOIaP9uHRKHOvm\nXK/zNsmz6VyvD3gqMhqSdvLM7Z1OZ3WzM4XO+NPGqVfYU8hoV4KX9h9ucQZZH93PY34pGf1IoTXI\nrpK2WnJWntVj0Y/Frvlo1yCjx6Vt6yclff73jce1cBi15vb6pXHSkcA9vP2+T5/8RmDzhHkTeFvl\nnsbSktHfFV1rrEUq1eTppaQd/AgcJ6OfqDpJ8uRyQ/WSpnpaogkZPajoEjWnEiXtLJed/Thtyegv\n1kdb9mm/KO3kKVFSWCSjACEyusXvftwp84K0s5FQ9LDTklESyVxSWCOjX6TEGkRJKUdGV3V2HVdI\nlZIaCTzIKBmlKmlnuezsx8lARvepOwTLHXmGkm5/ilStx5PruDXJsnLR6ZIPkqIEGSW1G0r6yV1W\nK96PalEfP0U2MrqgvxeX6h754IOkSE9G6Z8PkuJSMrpgbbaYP634IKlTeA64iIwus+7IxgdJkZYB\nsaWPCdPH
T/GQ4YOkhrIPadHDzs9qdIulRzY+SIqEvP3+FxneBM6r6Ufzw6g1MHT/OjL6u3IT5pXJ\nc53SA4MTyehHTBgWTdNkXCCjn1JSFk3TkH9cuCi5lIzuUK6kJs898pR0+14qXMQr9fs8LuKkiTeP\nkjYZGPe8YZYNMrpbwwkDD7vS6aLkajJ6RImSmjw3u3RUvGRzHKw6k5HRg5KUNMN70Xk6a1TMtzdf\n/k4nOh0ZPa5VSe2FZXZ4VLym01mtRUZDbiupdBZybFRcthtgb+dyMprXTzmP7IWZPG1l2PPxzqfb\nyGjUuRNm6cpODUtqu9sz+CCpG8noCYIltSnGMR/eKtD9qK7mwT3NgZI+ZsHpZ8Cc6VXkbU9GxXU8\nsmfKsx3mtPbhbXszeFaV9CIu6s9kO4ygc7vJPUy2Yj7/5AwlzWN+mfL2nedZvfqMGRVX8JgWMI7D\nYy9ssB1W03wTvOH+j1FxOhf1Sc1evj8y7r1Em0eGd5L+HIlRcTIZTeSKdz6ZM8wZFeeS0ca8afR7\n5FmQDkp6Kne/v8r8N/HevjOOfybV8+si5W7a37E898kfHvvseY6mMhm90PYQvTSds39LSVmipGeQ\n0QulGqJKmkSqUTEMCQ+oHhn9Ii7ikki3IWlgxMjotdKNz3QH9BXm3UxXUgJk9HLpwpXugEjAqAiQ\n0a9kzjBnVBwlo3fIOD4zHhOtGRWHyOhNMu6FmTPMGRX7yehVvKpAVUq6k4x+NxOGRZ7z95DRr6ek\nECOjKCmEyCjDMCgpHCej/LAdBofIKPCBx40dH19v35//yS/jts3Ab+afwOfa5YXVKLBpHk076X+T\nUYAQF/VAzNevTGWUdWuftze/yrNZ9s3mg+HLyCgrvKoAn7E3yhKvKvA0P/WeU/9mNQr85q2kGvo3\nGeUQK9Nvs5ZOd4SUUQ76+lcV4MneKECIjLLEqwrwMRf1rPCqAnxmnEwPgAAX9QAhMgoQIqMAITIK\nECKjACEyChAiowAhMgoQIqMAITIKECKjACEyChAiowAhMgoQIqMAITIKECKjACEyChAiowAhMgoQ\nIqMAITIKECKjACEyChAiowAh/wEUmkA/Qrcj6AAAAABJRU5ErkJggg==\n", 259 | "text/plain": [ 260 | "" 261 | ] 262 | }, 263 | "execution_count": 11, 264 | "metadata": {}, 265 | "output_type": "execute_result" 266 | } 267 | ], 268 | "source": [ 269 | "mol = Chem.MolFromSmiles(\"Oc1nc(CCc2c(O)ncnc2O)nc(O)c1\")\n", 270 | "mol" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 12, 276 | "metadata": { 277 | "collapsed": false 278 | }, 279 | "outputs": [ 280 | { 281 | "data": { 282 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAcIAAADICAIAAAD0lnbAAAAIQklEQVR4nO3d0XajNhRAUejq//8y\nffCMmxiDMRdJV2LvlYc2kzVDjHQsjA3zsiwTAGf903oDAPomowAhMgoQIqMAITIKECKjACEyChAi\nowAhMgoQIqMAITIKECKjACEyChAiowAhMgoQIqMAITIKECKjACEyChAiowAhMgoQIqMAITIKECKj\nACEyChAiowAhMgoQIqMAITIKECKjACEyCsOZ5/+/Xr6//knC/m29AcCl5nlals3/pQCrURjIOprL\nYslZmowChDioH8LP5YYjOHZYmRYgo/3zWhjHvYwNVb2Cg/rOeS0MWpNRGMj6SdTRSXkO6mEsLyXV\n0PJkFIazlc7190X2Cg7qAUJktHNeC4PWHNT3z2thPHkSbUFGh2DmQDsO6mEUlqKNyGifvMEe0pBR\ngBAZ7dDWsZsl6p05om9HRgFCZHQUFiPQiIz2Ri4hGRmF/nlybUpGh2AWQTsyChAio12x6mTNqGhN\nRgFCZLR/FiP35kMXzcloP+SSlXmeF6OiNRnthkUH5CSjnbNEhdZktA9bx26WqHfmiD4JGQUIkdGO\nWYxABjLaAbmEzGQUuuTJNQ8ZBQiR0V5ZjEASMpqdXLJmVKTyb+sNAL4wu3FhPjLaJYuR+3jp5nO/\nGwN5yGhqpsoNbXXzxbIshkcSMgrtzfP0/GTv8TIqaRIymtpjhjyWJ2bLYH4uOpdlmqYz+1dJM5DR\nvJ4Hd8+YvvwH3VmlkxHIaC4/Xxd7aaV1RzbrixS+vWxh6XQaGM3JaHs/ptmHyWDCZLN/udfHnq2w\nuwyMtjz0ta3f9vftHjBhkng09GdJS19Ee3/XGxitWI1W8qyncc5p+6tOa9JWfBi0kmX583XFX7X4\nKEsejwVpxX9ub+8bG03I6HfWQ/T5nZ0/upzZkoqS3pyMfi3JEDVb2GJsVCajX6u89NhhtuSRakF6\n5Ae4kIz2zWzJo/KpHSXNQ0bP2Fp6zPOvr1obY7a0se5mtpJSh4ye9Lakz9PxV52UP7wxptNNOd2U\ngYwOwoS5LSVtTkbPy3Ou6cGEqSzPg62kbcloiA+MkISSNiSj39k5q9D8hMNktlRU+uPzlzM2ypHR\n0Zgtt+UtUK3I6IDMlttS0iZk9Du9jECzpaitI/oMD7mS1iejwzJbbsuur0xGv+CswgnzX203o4JU\nw8OJ+5pctnlw9S/lu3WbdVcUrsw1nqvxOH4h1XLjK6UnzM6d+GpuRh07L4wm/OXcd6QCq9Gjck6S\ngy5femwtOStvBkH2yCVklEPOdXNt1Hmb9ll21Ac8FRm9i2+n01XdHEzaXO7Y2fUKewkZvZGPJT34\nEmfpzeByHvOiZPSQHtcgb71Mp1ZLzp5n9TxNPW6244mCZPR2fr5nsOHU6rGk24fGgzzLco63349s\n/w3YzRPmTeBtdfc0lpaMfjbYWiPV5BmlpAP8Cpwno0d0OUlS5XJH7yXt5XGmHBmlvS5KOlguB/t1\n2pLRDwYbbWl/nS5KCm/J6JjS5hLGI6N7xKimzAvSwT4F1OlmpyWjJJK5pLBFRm+kizWIktIdGd3U\nRXTe6nfLH3opae+PM1eRUTJKVdLBcjnYr5OBjN5Fd5MnQ0n37yLV1+NJOS5N8t7b6HRRoi428iA3\nkqILMvqFHi9K1LsKj/mRq6wOs+vH+C2ykdE3xrufYo/b/ORGUiQno1/LPJ3SblhbbiRFUTL6xsfZ\nYjpV5kZSlzBoC5HR94QyGzeSIi0DYs/HeZvq0Rvsc99vZbiR1NTtQ9rpZudnNbpnf+lhYVKfG0mR\nkLfff7D/JvBlWVq/Q/x2lr+ab0bzTwd8RffLkdHPPpV0yjCb7nBEn013JaUQGT2ki5JSn8MRJhk9\nTkl5q4td76CkKBn9QnclNXnqSLjrqUlGv7N/ENdqOsllc0p6Z
zL6tf0JYzqRjWfZ0mT0jC5KavJU\nlmS/U5+MnpSnpHKZR+aSem9WOTJ63seSckNpS+pdruXIaEiGOWMpmk2GUfH082BFSQuR0TE50m8r\nQ0nf3khKSUtwaZKox4SRLF5UHhJvi/n2J11X5XIyegElpb4f2ZynVTQtOWvypHSZPCW11hjSSxg/\n7uG+rpbbNQ/llZqXNMO1OLnWs54ndqmS1uFx7Jv7qrNPSSvwIHbm+J0zzJA81ocpz+/s/NF1/7qS\nluUUUwfmeXqcRpi+OVp3QjaV5i/4bDFO4mQ0qZ+LzmWZpunMKDdD8mj4do4jNwzPm/keyGgiq3Re\nQEmZjgwDJQ2Q0cZKpJOctkpV5y2enlDL8WHQUtZz4+U78/xnUj2/CvH5vzzefkj05xgoWrkPIyHD\nJ1j7JKMF7Y/J0nPm97+lpEyTkpYhowWlGpNKmkTzUbHkuVbuKGT0Rj7MH2pp//qkkl5KRstKNyDT\nbdAtrLv5/M7OH5VlJFxHRotLN1zTbRCN7IwEg+QbMnpLJgkfGSSHyWgNGQdkxm2iOjcUu4KMVpJx\nQCopk2FwARktpdmpA/iWksbI6L2ZPzwYCQEyenvmDw9GwlkyivnDX154OkVGmaZJSeE8GeUvKxE4\nxfVGgQNcGXebjAKfvFxu2nXyf3NQD+xaR9Mr6b/JKECIjAKEyCjbHreLeny9fH/9k3BXTjGxwVkF\nOMZqlHecVeBpves9p/5mNQp88lJSDf1NRjnFyvRupHObjHLKy6RSVW7Ma6MAITLKO84qwGEO6tng\nrAIcMy+mB0CAg3qAEBkFCJFRgBAZBQiRUYAQGQUIkVGAEBkFCJFRgBAZBQiRUYAQGQUIkVGAEBkF\nCJFRgBAZBQiRUYAQGQUIkVGAEBkFCJFRgBAZBQiRUYAQGQUIkVGAkP8AzrAuSijRUQAAAAAASUVO\nRK5CYII=\n", 283 | "text/plain": [ 284 | "" 285 | ] 286 | }, 287 | "execution_count": 12, 288 | "metadata": {}, 289 | "output_type": "execute_result" 290 | } 291 | ], 292 | "source": [ 293 | "fixed = None\n", 294 | "\n", 295 | "mols = [mol]\n", 296 | "\n", 297 | "changed = False\n", 298 | "\n", 299 | "for n_pass in range(1, 10):\n", 300 | "\n", 301 | " logging.debug(\"apply_rule> starting pass {}...\".format(n_pass))\n", 302 | "\n", 303 | " products = {}\n", 304 | "\n", 305 | " for mol in mols:\n", 306 | "\n", 307 | " for product in [x[0] for x in rxn.RunReactants((mol,))]:\n", 308 | "\n", 309 | " try:\n", 310 | "\n", 311 | " Chem.SanitizeMol(product)\n", 312 | "\n", 313 | " smiles = Chem.MolToSmiles(product, isomericSmiles=True)\n", 314 | "\n", 315 | " except ValueError as error:\n", 316 | "\n", 317 | " continue # We are assuming this simply means an unphysical molecule has been generated\n", 318 | "\n", 319 | " if smiles in products: continue # Keep only new structures\n", 320 | " \n", 321 | " products[smiles] = 
product\n", 322 | "\n", 323 | " if products:\n", 324 | "\n", 325 | " changed = True\n", 326 | "\n", 327 | " logging.debug(\"apply_rule> there are {} products: will continue\".format(len(products.values())))\n", 328 | "\n", 329 | " mols = list(products.values()) # Update list of mols\n", 330 | "\n", 331 | " else: # Finished...\n", 332 | "\n", 333 | " logging.debug(\"apply_rule> there were no products: will return\")\n", 334 | " \n", 335 | " fixed = mols[0] if changed else None # If there are multiple possible 'fixed' molecules, just take first\n", 336 | " \n", 337 | " break\n", 338 | " \n", 339 | "fixed" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "Note also that it would be possible to deal with this issue by rewriting the SMARTS transforms to be more specific (_i.e._ by excluding cases such as the present one). However, it was felt that this would likely end up being more complicated than the chosen method, as multiple complicated SMARTS could become necessary to represent fairly simple transforms." 
347 | ] 348 | } 349 | ], 350 | "metadata": { 351 | "kernelspec": { 352 | "display_name": "Python 3", 353 | "language": "python", 354 | "name": "python3" 355 | }, 356 | "language_info": { 357 | "codemirror_mode": { 358 | "name": "ipython", 359 | "version": 3 360 | }, 361 | "file_extension": ".py", 362 | "mimetype": "text/x-python", 363 | "name": "python", 364 | "nbconvert_exporter": "python", 365 | "pygments_lexer": "ipython3", 366 | "version": "3.5.1" 367 | } 368 | }, 369 | "nbformat": 4, 370 | "nbformat_minor": 0 371 | } 372 | -------------------------------------------------------------------------------- /standardiser/docs/Tautomerism_and_aromaticity.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%run notebook_setup.py\n", 12 | "\n", 13 | "sys.path.append('../..')" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "from standardiser import rules\n", 25 | "\n", 26 | "from standardiser.rules_demo import show_change" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "rules.logger.setLevel('DEBUG')" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## Tautomerism and aromaticity\n", 45 | "\n", 46 | "This page shows some transforms where the non-aromatic form is (counterintuitively) favoured over the aromatic form for certain hydroxy-substituted furans, thiophenes and pyrroles.\n", 47 | "\n", 48 | "They are taken from the article \"[So you think you understand tautomerism?](http://link.springer.com/article/10.1007%2Fs10822-010-9329-5)\" by Roger Sayle from the special \"[Perspectives in 
Drug Discovery and Design: Tautomers and Tautomerism](http://link.springer.com/journal/10822/24/6/page/1)\" issue of J. Comput. Aided Mol. Des. (2010), **24**: 485-496 (see the section \"Tautomeric preference and aromaticity\", starting on page 492).\n", 49 | "\n", 50 | "As the article states: \"Software for handling tautomers may be dismissed by its users unless it returns the type of results that they expect to see. The reason why some tautomer enumeration programs prefer to generate aromatic tautomers, contrary to experimental evidence, is to satisfy customer demand and market forces...\".\n", 51 | "\n", 52 | "As these rules are not what most people will be expecting, I have chosen not to enable them by default. They are kept in a separate rules file which may be appended to the base rule set if required (see below)." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 4, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "# Add extra rule set...\n", 64 | "\n", 65 | "rules.add_rule_set('tautomerism_and_aromaticity')" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "### Non-aromatic 1" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [ 82 | { 83 | "name": "stderr", 84 | "output_type": "stream", 85 | "text": [ 86 | "[2016/Mar/24 16:50:32 DEBUG ] rule 1 'Non-aromatic 1' applied\n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "HTML(show_change(\"Oc1occc1\"))" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 6, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [ 101 | { 102 | "name": "stderr", 103 | "output_type": "stream", 104 | "text": [ 105 | "[2016/Mar/24 16:50:32 DEBUG ] rule 1 'Non-aromatic 1' applied\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "HTML(show_change(\"Oc1[nH]cc2ccccc12\"))" 111 | ] 112 | }, 113 | { 
114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "### Non-aromatic 2" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 7, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [ 127 | { 128 | "name": "stderr", 129 | "output_type": "stream", 130 | "text": [ 131 | "[2016/Mar/24 16:50:32 DEBUG ] rule 2 'Non-aromatic 2' applied\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "HTML(show_change(\"Oc1cocc1\"))" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "### Non-aromatic 3" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 8, 149 | "metadata": { 150 | "collapsed": false 151 | }, 152 | "outputs": [ 153 | { 154 | "name": "stderr", 155 | "output_type": "stream", 156 | "text": [ 157 | "[2016/Mar/24 16:50:32 DEBUG ] rule 3 'Non-aromatic 3' applied\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "HTML(show_change(\"Oc1[nH]c2ccccc2c1\"))" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "Note that this rule is not restricted to bicyclic ring systems as currently coded." 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "### Non-aromatic 4" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 9, 182 | "metadata": { 183 | "collapsed": false 184 | }, 185 | "outputs": [ 186 | { 187 | "name": "stderr", 188 | "output_type": "stream", 189 | "text": [ 190 | "[2016/Mar/24 16:50:32 DEBUG ] rule 4 'Non-aromatic 4' applied\n" 191 | ] 192 | } 193 | ], 194 | "source": [ 195 | "HTML(show_change(\"Nc1[nH]c2ccccc2c1\"))" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "Note that this rule is not restricted to bicyclic ring systems as currently coded." 
203 | ] 204 | } 205 | ], 206 | "metadata": { 207 | "kernelspec": { 208 | "display_name": "Python 3", 209 | "language": "python", 210 | "name": "python3" 211 | }, 212 | "language_info": { 213 | "codemirror_mode": { 214 | "name": "ipython", 215 | "version": 3 216 | }, 217 | "file_extension": ".py", 218 | "mimetype": "text/x-python", 219 | "name": "python", 220 | "nbconvert_exporter": "python", 221 | "pygments_lexer": "ipython3", 222 | "version": "3.5.1" 223 | } 224 | }, 225 | "nbformat": 4, 226 | "nbformat_minor": 0 227 | } 228 | -------------------------------------------------------------------------------- /standardiser/docs/notebook_setup.py: -------------------------------------------------------------------------------- 1 | # Common setup for the IPython Notebooks 2 | 3 | from __future__ import print_function, division, absolute_import 4 | import six 5 | 6 | import warnings 7 | 8 | import sys 9 | import re 10 | import random 11 | 12 | from ipywidgets import HTML 13 | 14 | import pandas as pd 15 | 16 | from rdkit import Chem 17 | from rdkit.Chem import Draw, PandasTools, AllChem 18 | with warnings.catch_warnings(): 19 | warnings.simplefilter("ignore") 20 | from rdkit.Chem.Draw import IPythonConsole 21 | IPythonConsole.molSize = (450, 200) 22 | PandasTools.RenderImagesInAllDataFrames() 23 | -------------------------------------------------------------------------------- /standardiser/docs/standardiser.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flatkinson/standardiser/107b68d8f01c4d24111902c87751f897c442e299/standardiser/docs/standardiser.pdf -------------------------------------------------------------------------------- /standardiser/docs/standardiser.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flatkinson/standardiser/107b68d8f01c4d24111902c87751f897c442e299/standardiser/docs/standardiser.pptx 
import logging

####################################################################################################

# Defaults...

level = 'INFO'
fmt = '[%(asctime)s %(name)s %(levelname)s] %(message)s'
datefmt = '%d/%m/%y %H:%M:%S'

####################################################################################################

def run(name, level=level, fmt=fmt, datefmt=datefmt):
    """
    Create (or reconfigure) the named logger and return it.

    The logger writes through a single StreamHandler, has the given level and
    message/date formats applied, and has propagation to ancestor loggers disabled.

    Calling this function more than once with the same name is safe: the handler
    installed by the first call is reused (and given the new format) rather than a
    second handler being stacked on top, which would emit every message twice.

    :param name: logger name, as passed to logging.getLogger
    :param level: level name or number passed to Logger.setLevel
    :param fmt: log record format string
    :param datefmt: date format string used for %(asctime)s
    :return: the configured logging.Logger
    """

    logger = logging.getLogger(name)

    # logging.getLogger returns the same object for the same name, so look for a
    # handler that a previous call of this function attached before adding another.
    handler = next((h for h in logger.handlers if getattr(h, '_standardiser_handler', False)), None)

    if handler is None:

        handler = logging.StreamHandler()

        handler._standardiser_handler = True  # tag so later calls can recognise and reuse it

        logger.addHandler(handler)

    handler.setFormatter(logging.Formatter(fmt, datefmt=datefmt))

    logger.setLevel(level)

    # Handlers on ancestor loggers must not see these records as well.
    logger.propagate = False

    return logger

# run

####################################################################################################
# End
####################################################################################################
You may obtain a copy of 8 | # the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software distributed under the 13 | # License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 14 | # either express or implied. See the License for the specific language governing permissions 15 | # and limitations under the License. 16 | # 17 | ################################################################################################################################ 18 | 19 | """ 20 | Neutralise charges 21 | """ 22 | 23 | #################################################################################################### 24 | 25 | from . import make_logger 26 | logger = make_logger.run(__name__) 27 | 28 | import copy 29 | 30 | from rdkit import Chem 31 | 32 | from .utils import StandardiseException, sanity_check 33 | 34 | #################################################################################################### 35 | 36 | # Module configuration... 37 | 38 | pos_smarts = "[+!H0!$(*~[-])]" 39 | quat_smarts = "[+H0!$(*~[-])]" 40 | neg_smarts = "[-!$(*~[+H0])]" 41 | acid_smarts = "[$([O-][C,P,S]=O),$([n-]1nnnc1),$(n1[n-]nnc1)]" 42 | acid_h_smarts = "[$([OH][C,P,S]=O),$([n-]1nnnc1),$(n1[n-]nnc1)]" 43 | 44 | #################################################################################################### 45 | 46 | # Module initialization... 
# Module initialization...

pos_pat = Chem.MolFromSmarts(pos_smarts)
quat_pat = Chem.MolFromSmarts(quat_smarts)
neg_pat = Chem.MolFromSmarts(neg_smarts)
acid_pat = Chem.MolFromSmarts(acid_smarts)
acid_h_pat = Chem.MolFromSmarts(acid_h_smarts)

####################################################################################################

def formal_charge(mol):
    """Return the sum of the formal charges of all atoms in 'mol'."""

    return sum(x.GetFormalCharge() for x in mol.GetAtoms())

# def formal_charge

######

# NB This solved a problem in early iterations: newer RDKit releases may have made it unnecessary, but I haven't gone back and checked properly yet

def set_all_h_explicit(mol):
    """
    Convert every hydrogen count in 'mol' to an explicit count, in place.

    For each atom the explicit-H count is set to its current total H count and
    implicit hydrogens are switched off, so that later charge edits can adjust
    the hydrogen count directly.
    """

    for atom in mol.GetAtoms():

        atom.SetNumExplicitHs(atom.GetTotalNumHs())

        atom.SetNoImplicit(True)

# set_all_h_explicit

######

def run(mol, balance_quat_surplus=False):
    """
    Return a neutralised copy of 'mol'; the input molecule is not modified.

    The copy has all hydrogens made explicit, and charges are then removed by
    adding or removing explicit hydrogens:

    * If quaternary positive centres (quat_pat) are present, acid anions (acid_pat)
      are protonated only while negative charges outnumber the quaternary centres.
      If 'balance_quat_surplus' is true, atoms matching acid_h_pat are additionally
      deprotonated while quaternary centres outnumber the negative charges that
      were found at the start.
    * Otherwise, every atom matching neg_pat is protonated until its formal charge
      is no longer negative.
    * Finally, every atom matching pos_pat loses hydrogens until it is neutral or
      has no explicit hydrogens left.

    The result is passed to sanity_check; a StandardiseException raised there is
    logged at debug level and re-raised to the caller.
    """

    mol = copy.deepcopy(mol)

    set_all_h_explicit(mol)

    # Only the first atom of each match is of interest: it is the charged centre.
    pos = [x[0] for x in mol.GetSubstructMatches(pos_pat)]
    quat = [x[0] for x in mol.GetSubstructMatches(quat_pat)]
    neg = [x[0] for x in mol.GetSubstructMatches(neg_pat)]
    acid = [x[0] for x in mol.GetSubstructMatches(acid_pat)]

    logger.debug("{n_pos} positive/H, {n_quat} positive/quat and {n_neg} negative (of which {n_acid} are acid) charges identified".format(n_pos=len(pos), n_quat=len(quat), n_neg=len(neg), n_acid=len(acid)))

    h_added = 0  # running balance of hydrogens added (positive) or removed (negative)

    # Negative charges...

    if quat:

        neg_surplus = len(neg) - len(quat)  # i.e. 'surplus' negative charges

        if neg_surplus > 0 and acid:

            # Logger.warn is a deprecated alias; Logger.warning is the supported spelling.
            logger.warning("zwitterion with more negative charges than quaternary positive centres detected")

            while neg_surplus > 0 and acid:

                atom = mol.GetAtomWithIdx(acid.pop(0))

                atom.SetNumExplicitHs(atom.GetNumExplicitHs() + 1)
                atom.SetFormalCharge(atom.GetFormalCharge() + 1)

                h_added += 1

                neg_surplus -= 1

        if balance_quat_surplus:

            quat_surplus = len(quat) - len(neg)

            # NOTE(review): acid_h_smarts, as configured in this module, lists the same
            # '[n-]' tetrazole alternatives as acid_smarts; presumably '[nH]' was
            # intended for the protonated form - confirm before relying on this branch.
            acid_h = [x[0] for x in mol.GetSubstructMatches(acid_h_pat)]

            if quat_surplus > 0 and acid_h:

                logger.warning("Surplus of quat positive charges but with uncharged acids detected")

                while quat_surplus > 0 and acid_h:

                    atom = mol.GetAtomWithIdx(acid_h.pop(0))

                    atom.SetNumExplicitHs(atom.GetNumExplicitHs() - 1)
                    atom.SetFormalCharge(atom.GetFormalCharge() - 1)

                    h_added -= 1

                    quat_surplus -= 1

    else:

        for atom in [mol.GetAtomWithIdx(x) for x in neg]:

            while atom.GetFormalCharge() < 0:

                atom.SetNumExplicitHs(atom.GetNumExplicitHs() + 1)

                atom.SetFormalCharge(atom.GetFormalCharge() + 1)

                h_added += 1

    # Positive charges...

    for atom in [mol.GetAtomWithIdx(x) for x in pos]:

        # Stop when neutral, or when there is no hydrogen left to remove.
        while atom.GetFormalCharge() > 0 and atom.GetNumExplicitHs() > 0:

            atom.SetNumExplicitHs(atom.GetNumExplicitHs() - 1)

            atom.SetFormalCharge(atom.GetFormalCharge() - 1)

            h_added -= 1

    # Done...

    try:

        sanity_check(mol)

    except StandardiseException:

        logger.debug("Molecule failed sanity check")

        raise

    logger.debug("Overall H balance: {sign}{n}; formal charge: {chg}".format(sign="+" if h_added > 0 else "", n=h_added, chg=formal_charge(mol)))

    return mol

# run

####################################################################################################
# End
####################################################################################################
21 | """ 22 | 23 | #################################################################################################### 24 | 25 | from __future__ import print_function, division, absolute_import 26 | import six 27 | 28 | from . import make_logger 29 | logger = make_logger.run(__name__) 30 | 31 | import os 32 | import re 33 | import csv 34 | from six.moves import filterfalse 35 | 36 | from rdkit import Chem 37 | from rdkit.Chem import AllChem 38 | from rdkit.Geometry import rdGeometry 39 | 40 | from .utils import StandardiseException 41 | 42 | #################################################################################################### 43 | 44 | # Module configuration... 45 | 46 | base_rules_file_name = "base" 47 | 48 | data_dir_name = "data" 49 | 50 | max_passes = 10 51 | 52 | #################################################################################################### 53 | 54 | # Module data... 55 | # See bottom of file for module initialisation code (i.e. after functions are defined) 56 | 57 | rule_set = [] 58 | 59 | #################################################################################################### 60 | 61 | def load_rule_set(rules_file_name): 62 | 63 | with open(os.path.join(os.path.dirname(__file__), data_dir_name, "rules_{}.dat".format(rules_file_name))) as rules_file: 64 | 65 | reader = csv.reader(filterfalse(lambda x: re.match(r"^\s*(?:#|$)", x), rules_file), delimiter="\t") # SMARTS and name, tab-seperated 66 | 67 | rule_set = [{"n": n, "SMARTS": x[0], "rxn": AllChem.ReactionFromSmarts(x[0]), "name": x[1]} for n, x in enumerate(reader, 1)] 68 | 69 | return rule_set 70 | 71 | # load_rule_set 72 | 73 | def add_rule_set(rules_file_name, prepend=False): 74 | 75 | global rule_set 76 | 77 | if prepend: 78 | 79 | rule_set = load_rule_set(rules_file_name) + rule_set 80 | 81 | else: 82 | 83 | rule_set = rule_set + load_rule_set(rules_file_name) 84 | 85 | # add_rule_set 86 | 87 | ###### 88 | 89 | def setAllHsExplicit(mol): 90 | 91 | 
""" 92 | Make all hydrogens explicit. 93 | 94 | #TODO: In the earliest versions of this project, this hack was necessary to get things working as I expected. 95 | That was several versions of RDKit ago, however, and I really need to check whether this is still necessary. 96 | """ 97 | 98 | for atom in mol.GetAtoms(): 99 | 100 | atom.SetNumExplicitHs(atom.GetTotalNumHs()) 101 | 102 | atom.SetNoImplicit(True) 103 | 104 | # setAllHsExplicit 105 | 106 | ###### 107 | 108 | def apply_rule(mol, rule, verbose=False): 109 | 110 | """ 111 | Apply a single rule to the input molecule. 112 | 113 | Please see the IPython Notebook 'issue_01' for an explanation of why things are done this way. 114 | """ 115 | 116 | if verbose: logger.debug("apply_rule> applying rule {n} '{name}'...".format(n=rule["n"], name=rule["name"])) 117 | 118 | mols = [mol] 119 | 120 | changed = False 121 | 122 | for n_pass in range(1, max_passes+1): 123 | 124 | if verbose: logger.debug("apply_rule> starting pass {n}...".format(n=n_pass)) 125 | 126 | products = {} 127 | 128 | for mol in mols: 129 | 130 | for product in [x[0] for x in rule["rxn"].RunReactants((mol,))]: 131 | 132 | try: 133 | 134 | Chem.SanitizeMol(product) 135 | 136 | smiles = Chem.MolToSmiles(product, isomericSmiles=True) 137 | 138 | except ValueError as error: 139 | 140 | continue # We are assuming this simply means an unphysical molecule has been generated 141 | 142 | if smiles in products: continue # Keep only new structures 143 | 144 | products[smiles] = product 145 | 146 | if products: 147 | 148 | changed = True 149 | 150 | mols = list(products.values()) # Update list of mols 151 | 152 | if verbose: logger.debug("apply_rule> there are {} products: will continue".format(len(mols))) 153 | 154 | else: 155 | 156 | if (verbose): logger.debug("apply_rule> there were no products: will return") 157 | 158 | return mols[0] if changed else None 159 | 160 | logger.debug("apply_rule {n} '{name}'> maximum number of passes reached; current number of 
mols is {m}".format(n=rule["n"], name=rule["name"], m=len(mols))) 161 | 162 | return mols[0] 163 | 164 | # apply_rule 165 | 166 | ###### 167 | 168 | def run(mol, first_only=False, verbose=False, output_rules_applied=None): 169 | 170 | """ 171 | Apply all rules to the input molecule. 172 | """ 173 | 174 | logger.debug("mol = '{smi}'".format(smi=Chem.MolToSmiles(mol))) 175 | 176 | rules_applied = [] 177 | 178 | for n_pass in range(1, max_passes+1): 179 | 180 | logger.debug("starting pass {n}...".format(n=n_pass)) 181 | 182 | n_hits_for_pass = 0 183 | 184 | for rule in rule_set: 185 | 186 | logger.debug("Trying rule {n} '{name}' on pass {m}...".format(n=rule["n"], name=rule["name"], m=n_pass)) 187 | 188 | product = apply_rule(mol, rule, verbose) 189 | 190 | if product: 191 | 192 | logger.debug("rule {n} '{name}' applied on pass {m}".format(n=rule["n"], name=rule["name"], m=n_pass)) 193 | 194 | mol = product 195 | 196 | if output_rules_applied is not None: output_rules_applied.append(rule["n"]) 197 | 198 | if first_only: break 199 | 200 | n_hits_for_pass += 1 201 | 202 | if product and first_only: break 203 | 204 | logger.debug("...total of {n} hits in pass: {m}".format(n=n_hits_for_pass, m="will continue..." if n_hits_for_pass else "finished.")) 205 | 206 | if n_hits_for_pass == 0: break 207 | 208 | setAllHsExplicit(mol) 209 | 210 | return mol 211 | 212 | # run 213 | 214 | ###### 215 | 216 | def demo(old_mol): 217 | 218 | """ 219 | Utility function for illustrating the application of rules. 
220 | 221 | See also companion module rules_demo.py 222 | """ 223 | 224 | new_mol = None 225 | 226 | for rule in rule_set: 227 | 228 | products = rule["rxn"].RunReactants((old_mol,)) 229 | 230 | if len(products): 231 | 232 | new_mol = products[0][0] 233 | 234 | logger.debug("rule {n} '{name}' applied".format(n=rule['n'], name=rule['name'])) 235 | 236 | break 237 | 238 | if not new_mol: 239 | 240 | logger.warn("No hits for mol!") 241 | 242 | return None, None, None, None 243 | 244 | Chem.SanitizeMol(new_mol) 245 | 246 | AllChem.Compute2DCoords(old_mol) 247 | 248 | conf = old_mol.GetConformer() 249 | 250 | old_pat, new_pat = rule["SMARTS"].split(">>") 251 | 252 | old_match = old_mol.GetSubstructMatch(Chem.MolFromSmarts(old_pat)) 253 | new_match = new_mol.GetSubstructMatch(Chem.MolFromSmarts(new_pat)) 254 | 255 | coord_map = {new_idx: rdGeometry.Point2D(conf.GetAtomPosition(old_idx).x, conf.GetAtomPosition(old_idx).y) for old_idx, new_idx in zip(old_match, new_match)} 256 | 257 | AllChem.Compute2DCoords(new_mol, clearConfs=True, coordMap=coord_map, canonOrient=False) 258 | 259 | return old_mol, old_match, new_mol, new_match 260 | 261 | # demo 262 | 263 | #################################################################################################### 264 | 265 | # Module initialization... 266 | 267 | add_rule_set(base_rules_file_name) 268 | 269 | #################################################################################################### 270 | # End 271 | #################################################################################################### 272 | -------------------------------------------------------------------------------- /standardiser/rules_demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from jinja2 import Template 4 | from tempfile import TemporaryFile 5 | from base64 import b64encode 6 | 7 | from rdkit import Chem 8 | from rdkit.Chem import Draw 9 | 10 | from . 
import rules 11 | 12 | #################################################################################################### 13 | 14 | # Dir for Jinja2 templates... 15 | 16 | templates = os.path.join(os.path.dirname(__file__), 'templates') 17 | 18 | #################################################################################################### 19 | 20 | def rules_table(): 21 | 22 | def f(x): 23 | 24 | x = x.copy() 25 | 26 | x['tag'] = re.sub("[ ,]", "_", re.sub("[()>.]", "", x["name"])) 27 | 28 | return x 29 | 30 | rule_set = [f(x) for x in rules.rule_set] 31 | 32 | template = Template(open(os.path.join(templates, 'rules_table.html')).read()) 33 | 34 | return template.render(rule_set=rule_set) 35 | 36 | ###### 37 | 38 | def b64_img(mol, match): 39 | 40 | with TemporaryFile() as fh: 41 | 42 | Draw.MolToImage(mol, highlightAtoms=match).save(fh, format='png') 43 | 44 | fh.seek(0) 45 | 46 | b64_img = b64encode(fh.read()).decode('utf-8') 47 | 48 | return b64_img 49 | 50 | ###### 51 | 52 | def show_change(smiles): 53 | 54 | mol = Chem.MolFromSmiles(smiles) 55 | 56 | old_mol, old_match, new_mol, new_match = rules.demo(mol) 57 | 58 | template = Template(open(os.path.join(templates, 'show_rule.html')).read()) 59 | 60 | if not new_mol: return template.render(old_img=b64_img(mol, None), new_img=None) 61 | 62 | return template.render(old_img=b64_img(old_mol, old_match), new_img=b64_img(new_mol, new_match)) 63 | 64 | #################################################################################################### 65 | # End 66 | #################################################################################################### 67 | -------------------------------------------------------------------------------- /standardiser/standardise.py: -------------------------------------------------------------------------------- 1 | ################################################################################################################################ 2 | # 3 | # 
Copyright [2014] EMBL - European Bioinformatics Institute 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in 7 | # compliance with the License. You may obtain a copy of 8 | # the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software distributed under the 13 | # License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 14 | # either express or implied. See the License for the specific language governing permissions 15 | # and limitations under the License. 16 | # 17 | ################################################################################################################################ 18 | 19 | """ 20 | Apply standardisation procedure 21 | """ 22 | 23 | #################################################################################################### 24 | 25 | from . import make_logger 26 | logger = make_logger.run(__name__) 27 | 28 | from rdkit import Chem 29 | 30 | from . import break_bonds, neutralise, rules, unsalt 31 | 32 | from .utils import StandardiseException, sanity_check, timeout 33 | 34 | #################################################################################################### 35 | # 36 | # Module configuration... 37 | # 38 | 39 | #################################################################################################### 40 | # 41 | # Module initialization... 42 | # 43 | 44 | #################################################################################################### 45 | 46 | def verbose(verbose=True): 47 | 48 | """ 49 | Turn full debugging output on or off. 
50 | """ 51 | 52 | level = 10 if verbose else 20 53 | 54 | logger.setLevel(level) 55 | 56 | for module in break_bonds, unsalt, neutralise, rules: module.logger.setLevel(level) 57 | 58 | #################################################################################################### 59 | 60 | # Unix signals such as SIGALRM are not unavailable on Windows, so the timeout facility cannot be used. 61 | # Use of signals such as SIGALRM is also impossible whe running under mod_wsgi... 62 | # https://github.com/GrahamDumpleton/mod_wsgi-docs/blob/master/configuration-directives/WSGIRestrictSignal.rst 63 | # https://github.com/GrahamDumpleton/mod_wsgi-docs/blob/master/developer-guides/tips-and-tricks.rst 64 | # Thus, the use of the global use of the timeout wrapper is disabled here and enabled selectively below. 65 | 66 | ### @timeout() 67 | def run(input_mol, output_rules_applied=None, verbose=False): 68 | 69 | # Get input molecule... 70 | 71 | if type(input_mol) == Chem.rdchem.Mol: 72 | 73 | mol = input_mol 74 | 75 | input_type = 'mol' 76 | 77 | else: 78 | 79 | mol = Chem.MolFromMolBlock(input_mol) 80 | 81 | if not mol: 82 | 83 | mol = Chem.MolFromSmiles(input_mol) 84 | 85 | if not mol: 86 | 87 | raise StandardiseException("not_built") 88 | 89 | else: 90 | 91 | input_type = 'smi' 92 | else: 93 | 94 | input_type = 'sdf' 95 | 96 | try: 97 | 98 | sanity_check(mol) 99 | 100 | except StandardiseException as err: 101 | 102 | logger.debug("Molecule failed sanity check") 103 | 104 | raise 105 | 106 | ###### 107 | 108 | # Get disconnected fragments... 
109 | 110 | non_salt_frags = [] 111 | 112 | mol = break_bonds.run(mol) 113 | 114 | for n, frag in enumerate(Chem.GetMolFrags(mol, asMols=True), 1): 115 | 116 | logger.debug("Starting fragment {n} '{smi}'...".format(n=n, smi=Chem.MolToSmiles(frag))) 117 | 118 | logger.debug("1) Check for non-organic elements...") 119 | 120 | if unsalt.is_nonorganic(frag): continue 121 | 122 | logger.debug("2) Attempting to neutralise (first pass)...") 123 | 124 | frag = neutralise.run(frag) 125 | 126 | logger.debug("3) Applying rules...") 127 | 128 | frag = rules.run(frag, output_rules_applied=output_rules_applied, verbose=verbose) 129 | 130 | logger.debug("4) Attempting to neutralise (second pass)...") 131 | 132 | frag = neutralise.run(frag) 133 | 134 | logger.debug("5) Checking if frag is a salt/solvate...") 135 | 136 | if unsalt.is_salt(frag): continue 137 | 138 | logger.debug("...fragment kept.") 139 | 140 | non_salt_frags.append(frag) 141 | 142 | if len(non_salt_frags) == 0: 143 | 144 | raise StandardiseException("no_non_salt") 145 | 146 | if len(non_salt_frags) > 1: 147 | 148 | raise StandardiseException("multi_component") 149 | 150 | parent = non_salt_frags[0] 151 | 152 | ###### 153 | 154 | # Return parent in same format as input... 155 | 156 | if input_type == 'mol': 157 | 158 | return parent 159 | 160 | elif input_type == 'sdf': 161 | 162 | return Chem.MolToMolBlock(parent) 163 | 164 | else: # input_type == 'smi' 165 | 166 | return Chem.MolToSmiles(parent, isomericSmiles=True) 167 | 168 | # run 169 | 170 | ###### 171 | 172 | # Check for availability of timeout before enabling (see above for details)... 
173 | 174 | import platform 175 | 176 | if platform.system() == 'Windows': 177 | 178 | logger.warning("Running under Windows: must disable use of timeout") 179 | 180 | else: 181 | 182 | try: 183 | 184 | from mod_wsgi import version 185 | 186 | logger.warning("Running under mod_wsgi: must disable use of timeout") 187 | 188 | except: 189 | 190 | run = timeout()(run) 191 | 192 | #################################################################################################### 193 | # End 194 | #################################################################################################### 195 | -------------------------------------------------------------------------------- /standardiser/templates/rules_table.html: -------------------------------------------------------------------------------- 1 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | {% for rule in rule_set %} 18 | 19 | 20 | 21 | 22 | 23 | {% endfor %} 24 |
NNameSMARTS
{{rule['n']}}{{rule['name']}}{{rule['SMARTS']}}
25 | -------------------------------------------------------------------------------- /standardiser/templates/show_rule.html: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 19 | 20 |
OriginalTransformed
13 | {% if new_img is not none %} 14 | 15 | {% else %} 16 | 17 | {%endif %} 18 |
21 | 22 | -------------------------------------------------------------------------------- /standardiser/unsalt.py: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # 3 | # Copyright [2014] EMBL - European Bioinformatics Institute 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in 7 | # compliance with the License. You may obtain a copy of 8 | # the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software distributed under the 13 | # License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 14 | # either express or implied. See the License for the specific language governing permissions 15 | # and limitations under the License. 16 | # 17 | #################################################################################################### 18 | 19 | """ 20 | Remove salt/solvates 21 | """ 22 | 23 | #################################################################################################### 24 | 25 | from . import make_logger 26 | logger = make_logger.run(__name__) 27 | 28 | import os 29 | import csv 30 | import re 31 | import copy 32 | 33 | from rdkit import Chem 34 | 35 | #################################################################################################### 36 | 37 | # Module configuration... 38 | 39 | use_inchi = False ### Chem.INCHI_AVAILABLE 40 | 41 | salts_file = "salts.tsv" 42 | 43 | non_organic_elements = "[!#1&!#6&!#7&!#8&!#9&!#15&!#16&!#17&!#35&!#53]" 44 | 45 | #################################################################################################### 46 | 47 | # Module initialization... 
48 | 49 | non_organic_elements = Chem.MolFromSmarts(non_organic_elements) 50 | 51 | salts = {} 52 | 53 | salts_fh = open(os.path.join(os.path.dirname(__file__), "data", salts_file)) 54 | 55 | for record in csv.DictReader(salts_fh, delimiter="\t"): 56 | 57 | smiles, name = record["SMILES"], record["name"] 58 | 59 | if re.match(r"\s*(#|$)", smiles): continue # NB assumes SMILES is in first column 60 | 61 | mol = Chem.MolFromSmiles(smiles) 62 | 63 | if not mol: 64 | 65 | logger.warning("Bad SMILES in salt/solvate file: {smiles} {name}".format(smiles=smiles, name=name)) 66 | 67 | continue 68 | 69 | Chem.RemoveStereochemistry(mol) 70 | 71 | key = Chem.MolToInchi(mol) if use_inchi else Chem.MolToSmiles(mol) 72 | 73 | salts[key] = {'SMILES': smiles, 'name': name} 74 | 75 | #################################################################################################### 76 | 77 | def is_nonorganic(mol): 78 | 79 | if mol.GetSubstructMatch(non_organic_elements): 80 | 81 | logger.debug("Fragment contains a non-organic element") 82 | 83 | return True 84 | 85 | return False 86 | 87 | # is_nonorganic 88 | 89 | ###### 90 | 91 | def is_salt(mol): 92 | 93 | mol = copy.deepcopy(mol) 94 | 95 | Chem.RemoveStereochemistry(mol) 96 | 97 | key = Chem.MolToInchi(mol) if use_inchi else Chem.MolToSmiles(mol) 98 | 99 | if key in salts: 100 | 101 | logger.debug("Fragment matches salt/solvate '{name}'".format(name=salts[key]['name'])) 102 | 103 | return True 104 | 105 | return False 106 | 107 | # is_salt 108 | 109 | #################################################################################################### 110 | # End 111 | #################################################################################################### 112 | -------------------------------------------------------------------------------- /standardiser/utils.py: -------------------------------------------------------------------------------- 1 | 
####################################################################################################
#
# Copyright [2014] EMBL - European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the
# License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
# either express or implied. See the License for the specific language governing permissions
# and limitations under the License.
#
####################################################################################################

"""
Utilities for standardise package
"""

####################################################################################################

from functools import wraps
import errno
import os
import signal

from rdkit import Chem

from . import make_logger
logger = make_logger.run(__name__)

####################################################################################################

# Error types...

# Maps the short error code carried by StandardiseException to its human-readable message.
# NOTE(review): "moleule" in the timed_out message is a typo; left untouched here in case
# anything downstream matches on the message text.

errors = {
    "not_built": "RDKit could not build mol",
    "no_non_salt": "No non-salt/solvate components",
    "multi_component": "Multiple non-salt/solvate components",
    "sanity_check": "Molecule failed sanity check",
    "timed_out": "Time taken shows problem with moleule"
}

class StandardiseException(Exception):

    """
    Exception raised for all standardisation failures.

    name: the short error code (normally a key of 'errors').
    message: the corresponding human-readable text; also the sole element of 'args'.
    """

    def __init__(self, name):

        # Fall back to the value itself if it is not a known code. Exceptions are rebuilt as
        # cls(*args) when unpickled, and 'args' holds the message rather than the code, so a
        # plain errors[name] lookup would raise KeyError there (and for any unknown code).

        message = errors.get(name, name)

        Exception.__init__(self, message) # Sets self.args = (message, )

        self.name = name
        self.message = message

# StandardiseException

######

# Sanity-check for molecules
#
# See e.g.
# PubChem CIDs 128221 or 20643358 for examples of things that fail

def sanity_check(mol):

    """
    Check that 'mol' can be sanitized and written out as isomeric SMILES.

    Raises StandardiseException("sanity_check") if either step raises ValueError.
    """

    try:

        Chem.SanitizeMol(mol)

        Chem.MolToSmiles(mol, isomericSmiles=True)

    except ValueError:

        raise StandardiseException("sanity_check")

# sanity_check

######

# Time-out long running operation, as this usually indicates a problem with the molecules
#
# http://stackoverflow.com/questions/2281850/timeout-function-if-it-takes-too-long-to-finish

def timeout(seconds=2):

    """
    Return a decorator that aborts the wrapped call after 'seconds' seconds.

    The wrapper arms a SIGALRM alarm around the call; if the alarm fires first, the handler
    raises StandardiseException("timed_out"). The alarm is always cancelled afterwards and the
    SIGALRM handler that was installed before the call is put back.
    """

    def decorator(func):

        def _handle_timeout(signum, frame):

            raise StandardiseException("timed_out")

        @wraps(func)
        def wrapper(*args, **kwargs):

            previous_handler = signal.signal(signal.SIGALRM, _handle_timeout)

            signal.alarm(seconds)

            try:

                return func(*args, **kwargs)

            finally:

                signal.alarm(0) # Cancel any pending alarm, whether func returned or raised

                # signal.signal() returns None if the old handler was not installed from Python;
                # None cannot be passed back in, so only restore a real handler

                if previous_handler is not None:

                    signal.signal(signal.SIGALRM, previous_handler)

        return wrapper

    return decorator

# timeout

####################################################################################################
# End
####################################################################################################