├── CHANGELOG.md
├── LICENSE.txt
├── README.md
├── mmpdblib
    ├── __init__.py
    ├── __main__.py
    ├── analysis_algorithms.py
    ├── cansmirks_table.py
    ├── cli
    │   ├── __init__.py
    │   ├── click_utils.py
    │   ├── create_index.py
    │   ├── drop_index.py
    │   ├── fragdb_constants.py
    │   ├── fragdb_list.py
    │   ├── fragdb_merge.py
    │   ├── fragdb_partition.py
    │   ├── fragment.py
    │   ├── fragment_click.py
    │   ├── generate.py
    │   ├── help_.py
    │   ├── index.py
    │   ├── list_.py
    │   ├── loadprops.py
    │   ├── merge.py
    │   ├── predict.py
    │   ├── propcat.py
    │   ├── proprulecat.py
    │   ├── rgroup2smarts.py
    │   ├── rulecat.py
    │   ├── ruleenvcat.py
    │   ├── smi_split.py
    │   ├── smi_utils.py
    │   ├── smicat.py
    │   ├── smifrag.py
    │   └── transform.py
    ├── config.py
    ├── create_index.sql
    ├── dbutils.py
    ├── drop_index.sql
    ├── environment.py
    ├── fileio.py
    ├── fragment_algorithm.py
    ├── fragment_create_index.sql
    ├── fragment_db.py
    ├── fragment_records.py
    ├── fragment_schema.sql
    ├── fragment_types.py
    ├── index_algorithm.py
    ├── index_types.py
    ├── index_writers.py
    ├── properties_io.py
    ├── reporters.py
    ├── rgroup2smarts.py
    ├── schema.py
    ├── schema.sql
    ├── smarts_aliases.py
    └── smiles_syntax.py
├── pyproject.toml
└── tests
    ├── cached.fragdb
    ├── comma.smi
    ├── space.smi
    ├── space.smi.gz
    ├── support.py
    ├── tab.smi
    ├── test_analysis.py
    ├── test_data.csv
    ├── test_data.fragdb
    ├── test_data.mmpdb
    ├── test_data.smi
    ├── test_data_2019.mmpdb
    ├── test_fragment.py
    ├── test_index.py
    ├── test_list.py
    ├── test_loadprops.py
    ├── test_rgroup2smarts.py
    └── two_tabs.smi


/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | # CHANGELOG
  2 | 
  3 | ## mmpdb 3.1 - 2023-11-28
  4 | 
  5 | Extended the "generate" command to handle 2- and 3-cut transforms.
  6 | 
  7 | The `generate --explain` option now also explains why the search for a
  8 | matching query or variable part passes or fails. This proved useful in
  9 | determining that an expected fragmentation was instead being filtered.
 10 | 
 11 | The new `--min-heavies-total-const-frag N` fragmentation option
 12 | specifies the minimum number of heavy atoms allowed in the constant
 13 | part. The default value is 0.
 14 | 
 15 | Changed the fragdb schema version (in the "options" table) from 3 to 4
 16 | to support the new fragmentation option. Version 3 fragment databases
 17 | are still supported, by setting `min_heavies_total_const_frag` to 0.
 18 | 
 19 | Added two indices and a SQLite pragma for the page size. Roche reports
 20 | these improve analysis performance.
 21 | 
 22 | Fixed `--from` and `--to` support in proprulecat. These had been left
 23 | behind in the migration to click from argparse for command-line
 24 | processing.
 25 | 
 26 | ## mmpdb 3.0 - 2023-5-31 
 27 | 
 28 | A large number of changes to merge three different development tracks
 29 | and add new features.
 30 | 
 31 | The "fragments" file format has been replaced with a SQLite-based
 32 | "fragdb" file format. This makes it much easier to develop tools to
 33 | work on fragment data sets instead of processing a JSON-Lines file.
 34 | 
 35 | New functionality to create an MMP data set in a distributed compute
 36 | environment. Some of the features are:
 37 | 
 38 | - split a SMILES file into a set of smaller SMILES files
 39 | - the default "fragment" file output is now based on the input name
 40 | - fragment files can be re-partitioned by constant fragments:
 41 |     - the "fragdb_constants" command generates fragment information
 42 |     - the "fragdb_partition" command creates re-partitioned fragdb files
 43 | - the default "index" file output is now based on the input name
 44 | - there are tools to merge fragdb and mmpdb files into one
 45 | 
 46 | As a result, mmpdb can now handle significantly larger data sets.
 47 | 
 48 | Added support for Postgres for direct index database creation. (The
 49 | new distributed compute tools require SQLite.)
 50 | 
 51 | Added a new "generate" command to apply 1-cut transforms to a
 52 | structure, using MMP rules as a playbook.
 53 | 
 54 | Replaced the SHA256-based Morgan fingerprint signature with a
 55 | canonical SMARTS representing the Morgan fingerprint environment. This
 56 | is difficult to understand or depict, so also include a "pseudo"
 57 | SMILES that can be parsed by RDKit (if sanitize is disabled) and
 58 | drawn. The new environment fingerprint also includes the SMARTS of its
 59 | parent, that is, the SMARTS with a smaller radius.
 60 | 
 61 | Switched to 'click' for command-line parsing, removed the vendored
 62 | version of the peewee ORM, and switched to a modern "pyproject.toml"
 63 | project configuration with a setup.cfg which declares its dependencies.
 64 | 
 65 | 
 66 | ## mmpdb 2.2-dev (the GitHub development track)
 67 | 
 68 | The `fragment` and `smifrag` commands now support options for
 69 | supervised fragmentation based on a specified set of R-group SMILES to
 70 | use for the fragmentation. Multiple SMILES can be specified on the
 71 | command-line using the `--cut-rgroup` argument, with one SMILES per
 72 | argument, or using the `--cut-rgroup-file` argument with the name of
 73 | an R-group file. The file must be formatted with one R-group SMILES
 74 | per line.
 75 | 
 76 | All SMILES strings must contain a single wildcard atom ("*"), which
 77 | indicates the attachment point. The wildcard atom must contain only
 78 | one single bond to the rest of the R-group, and cannot contain charge,
 79 | hydrogens, isotope, or other properties.
 80 | 
 81 | The SMILES strings are converted into a SMARTS pattern which matches
 82 | the SMILES exactly (each atom must have the same valence and hydrogen
 83 | count, and the bond types must match). These SMARTS patterns are then
 84 | merged into a single recursive SMARTS with two terms: the wildcard
 85 | atom, single bonded to a recursive SMARTS term for each of the SMILES
 86 | strings.
 87 | 
 88 | The new `rgroup2smarts` command can be used to process the R-group
 89 | SMILES into SMARTS, as a way to examine the conversion process and
 90 | verify that it works.
 91 | 
 92 | ## mmpdb 2.2 - 2019-01-11
 93 | 
 94 |   This minor release contains improvements that help reduce the 
 95 |   database size. Many transformations and associated statistics inside
 96 |   the database are unlikely to ever be used, since there are other 
 97 |   transformations that will yield the same compounds. To accomplish this, 
 98 |   three new options have been introduced:
 99 | 
100 | - max-radius: The maximum radius can now be set on the command line 
101 |   during indexing. This was hardcoded in previous versions.
102 | 
103 | - smallest-transformation-only: Some transformations can be reduced to 
104 |   smaller transformations, for example p-Fluoro-phenyl >> p-Chloro-phenyl
105 |   to Fluoro >> Chloro. If this flag is set during indexing, reducible 
106 |   transformations will not be written to the database. Note that this
107 |   setting only reduces the number of transformations for a given pair. It does
108 |   not completely remove a pair.
109 | 
110 | - min-heavies-per-const-frag: For double- and triple-cuts, fragmentations
111 |   are created where the constant parts can be very small down to a single
112 |   atom. For example, if the fragmentation algorithm can generate a single-cut 
113 |   where the variable fragment is p-Fluorophenyl, it will also generate a
114 |   double cut with phenyl as variable fragment and F as one of the constant
115 |   pieces. The 'min-heavies-per-const-frag' option can be used during 
116 |   fragmentation to eliminate multiple cuts where one of the constant fragments
117 |   is very small. If this is set to 3 or 4, double- and triple cuts only happen
118 |   at positions that would be considered real scaffold changes in the middle 
119 |   of molecules. Note that in principle this option only reduces the number 
120 |   of pairs for a given transformation, effectively removing multiple-cuts 
121 |   where possible. There may be edge cases where pairs are completely removed,
122 |   because the single cut transfers too many atoms. If you use this option, you 
123 |   may want to adjust the --max-variable-heavies option during indexing.
124 | 
125 |   The last '--smallest-transformation-only' and '--min-heavies-per-const-frag 3' 
126 |   options together typically reduce the database size by ~ 70%.
127 | 
128 | 
129 | ## mmpdb 2.1 - 2018-04-27
130 | 
131 | - RDKit 2018\_03\_1 changed the SMILES output so wildcard atoms are
132 |   represented with a `*` instead of `[*]`. mmpdb works with SMILES
133 |   strings at the syntax level. Parts of the code expected only the old
134 |   RDKit behavior and crashed or gave the wrong output for the new
135 |   behavior. For example, the fragmentation algorithm raised an
136 |   AssertionError saying:
137 | 
138 | ```
139 |   File "mmpdblib/fragment_algorithm.py", line 368, in make_single_cut
140 |     constant_smiles_with_H = replace_wildcard_with_H(constant_smiles)
141 |   File "mmpdblib/fragment_algorithm.py", line 302, in replace_wildcard_with_H
142 |     assert smiles.count("[*]") == 1, smiles
143 | AssertionError: *O
144 | ```
145 | 
146 |   Version 2.1 now supports both ways of representing the wildcard atom
147 |   and will work with RDKit releases from 2017 and 2018.
148 | 
149 | - The tests are now included as part of the distribution.
150 | 
151 | - mmpdb is available from PyPI using "pip install mmpdb"
152 | 
153 | - A preprint of 
154 |   [our paper](https://chemrxiv.org/articles/mmpdb_An_Open_Source_Matched_Molecular_Pair_Platform_for_Large_Multi-Property_Datasets/5999375)
155 |   is available from ChemRxiv. We received word today that
156 |   [JCIM](https://pubs.acs.org/journal/jcisd8) has accepted it (after
157 |   minor changes).
158 | 
159 | 
160 | ## mmpdb 2.0 - 2017-08-15
161 | 
162 | First public release of mmpdb, an open-source matched molecular pair
163 | package designed to create and query MMP databases for big-pharma
164 | sized ADMET datasets.
165 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
  1 | mmpdb - matched molecular pair database generation and analysis
  2 | 
  3 | 
  4 | Copyright (c) 2015-2025 F. Hoffmann-La Roche Ltd., distributed under the
  5 | 3-clause BSD license, below.
  6 | 
  7 | Portions may be copyright (c) 2012-2013 by GlaxoSmithKline Research &
  8 | Development Ltd., distributed under the 3-clause BSD license.
  9 | 
 10 | Copyright (c) 2010-2017 Andrew Dalke Scientific, AB (Sweden). This is
 11 | the file fileio.py though modifications to this file are covered under
 12 | the mmpdb license.
 13 | 
 14 | Copyright (c) 2010 by Charles Leifer, distributed under the MIT
 15 | license. These are the files peewee.py and playhouse/*.py.
 16 | 
 17 | 
 18 | 
 19 | 3-clause BSD license for mmpdb and F. Hoffmann-La Roche Ltd.
 20 | ------------------------------------------------------------
 21 | 
 22 | 
 23 | Unless otherwise noted, all files in this directory and all
 24 | subdirectories are distributed under the following license:
 25 | 
 26 | Copyright (c) 2015-2025, F. Hoffmann-La Roche Ltd.
 27 | 
 28 | Redistribution and use in source and binary forms, with or without
 29 | modification, are permitted provided that the following conditions are
 30 | met:
 31 | 
 32 |     * Redistributions of source code must retain the above copyright
 33 |       notice, this list of conditions and the following disclaimer.
 34 |     * Redistributions in binary form must reproduce the above
 35 |       copyright notice, this list of conditions and the following
 36 |       disclaimer in the documentation and/or other materials provided
 37 |       with the distribution.
 38 |     * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
 39 |       its contributors may be used to endorse or promote products
 40 |       derived from this software without specific prior written
 41 |       permission.
 42 | 
 43 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 44 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 45 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 46 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 47 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 48 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 49 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 50 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 51 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 52 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 53 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 54 | 
 55 | 
 56 | 
 57 | 3-clause BSD license for GlaxoSmithKline Research & Development Ltd.
 58 | --------------------------------------------------------------------
 59 | 
 60 | 
 61 | Copyright (c) 2013, GlaxoSmithKline Research & Development Ltd.
 62 | All rights reserved.
 63 | 
 64 | Redistribution and use in source and binary forms, with or without
 65 | modification, are permitted provided that the following conditions are
 66 | met:
 67 | 
 68 |     * Redistributions of source code must retain the above copyright
 69 |       notice, this list of conditions and the following disclaimer.
 70 |     * Redistributions in binary form must reproduce the above
 71 |       copyright notice, this list of conditions and the following
 72 |       disclaimer in the documentation and/or other materials provided
 73 |       with the distribution.
 74 |     * Neither the name of GlaxoSmithKline Research & Development Ltd.
 75 |       nor the names of its contributors may be used to endorse or promote
 76 |       products derived from this software without specific prior written
 77 |       permission.
 78 | 
 79 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 80 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 81 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 82 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 83 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 84 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 85 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 86 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 87 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 88 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 89 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 90 | 
 91 | 
 92 | 
 93 | MIT license for Andrew Dalke Scientific AB
 94 | ------------------------------------------
 95 | 
 96 | 
 97 | Copyright (c) 2010-2017 Andrew Dalke Scientific, AB (Sweden)
 98 | 
 99 | Permission is hereby granted, free of charge, to any person
100 | obtaining a copy of this software and associated documentation files
101 | (the "Software"), to deal in the Software without restriction,
102 | including without limitation the rights to use, copy, modify, merge,
103 | publish, distribute, sublicense, and/or sell copies of the Software,
104 | and to permit persons to whom the Software is furnished to do so,
105 | subject to the following conditions:
106 | 
107 | The above copyright notice and this permission notice shall be included in
108 | all copies or substantial portions of the Software.
109 | 
110 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
111 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
112 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
113 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
114 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
115 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
116 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
117 | SOFTWARE.
118 | 
119 | 
120 | 
121 | MIT license for Charles Leifer
122 | ------------------------------
123 | 
124 | 
125 | Copyright (c) 2010 Charles Leifer
126 | 
127 | Permission is hereby granted, free of charge, to any person obtaining a copy
128 | of this software and associated documentation files (the "Software"), to deal
129 | in the Software without restriction, including without limitation the rights
130 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
131 | copies of the Software, and to permit persons to whom the Software is
132 | furnished to do so, subject to the following conditions:
133 | 
134 | The above copyright notice and this permission notice shall be included in
135 | all copies or substantial portions of the Software.
136 | 
137 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
138 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
139 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
140 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
141 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
142 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
143 | THE SOFTWARE.
144 | 


--------------------------------------------------------------------------------
/mmpdblib/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "3.1.3"
2 | 


--------------------------------------------------------------------------------
/mmpdblib/__main__.py:
--------------------------------------------------------------------------------
1 | if __name__ == "__main__":
2 |     from . import cli
3 | 
4 |     cli.main()
5 | 


--------------------------------------------------------------------------------
/mmpdblib/cli/__init__.py:
--------------------------------------------------------------------------------
  1 | "implement the mmpdb command-line API"
  2 | 
  3 | # mmpdb - matched molecular pair database generation and analysis
  4 | #
  5 | # Copyright (c) 2015-2017, F. Hoffmann-La Roche Ltd.
  6 | # Copyright (c) 2021, Andrew Dalke Scientific AB
  7 | #
  8 | # Redistribution and use in source and binary forms, with or without
  9 | # modification, are permitted provided that the following conditions are
 10 | # met:
 11 | #
 12 | #    * Redistributions of source code must retain the above copyright
 13 | #      notice, this list of conditions and the following disclaimer.
 14 | #    * Redistributions in binary form must reproduce the above
 15 | #      copyright notice, this list of conditions and the following
 16 | #      disclaimer in the documentation and/or other materials provided
 17 | #      with the distribution.
 18 | #    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
 19 | #      its contributors may be used to endorse or promote products
 20 | #      derived from this software without specific prior written
 21 | #      permission.
 22 | #
 23 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 24 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 25 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 26 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 27 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 28 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 29 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 30 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 31 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 32 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 33 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 34 | #
 35 | 
 36 | from .. import __version__
 37 | 
 38 | import sys
 39 | import click
 40 | 
 41 | from .click_utils import FormatEpilog
 42 | 
 43 | # Each group is (help section title, [(command name, "module.function"), ...]).
 44 | # The "module.function" path is used to load the appropriate command dynamically.
 45 | 
 46 | command_groups = [
 47 |     (
 48 |         "Matched molecular pair generation commands (see 'help-analysis')", [
 49 |             ("fragment", "fragment.fragment"),
 50 |             ("smifrag", "smifrag.smifrag"),
 51 |             ("index", "index.index"),
 52 |             ("predict", "predict.predict"),
 53 |             ("transform", "transform.transform"),
 54 |             ("rgroup2smarts", "rgroup2smarts.rgroup2smarts"),
 55 |             ("generate", "generate.generate"),
 56 |             ],
 57 |     ), (
 58 |         "Distributed generation commands (see 'help-distributed')", [
 59 |             ("smi_split", "smi_split.smi_split"),
 60 |             ("fragdb_constants", "fragdb_constants.fragdb_constants"),
 61 |             ("fragdb_partition", "fragdb_partition.fragdb_partition"),
 62 |             ("fragdb_merge", "fragdb_merge.fragdb_merge"),
 63 |             ("merge", "merge.merge"),
 64 |             ],
 65 |     ), (
 66 |         "Administration commands (see 'help-admin')", [
 67 |             ("fragdb_list", "fragdb_list.fragdb_list"),
 68 |             ("list", "list_.list_"),
 69 |             ("loadprops", "loadprops.loadprops"),
 70 |             ("smicat", "smicat.smicat"),
 71 |             ("rulecat", "rulecat.rulecat"),
 72 |             ("ruleenvcat", "ruleenvcat.ruleenvcat"),
 73 |             ("propcat", "propcat.propcat"),
 74 |             ("proprulecat", "proprulecat.proprulecat"),
 75 |             ("drop_index", "drop_index.drop_index"),
 76 |             ("create_index", "create_index.create_index"),
 77 |             ],
 78 |     ), (
 79 |         "Help commands", [
 80 |             ("help", "help_.help_"),
 81 |             ("help-analysis", "help_.help_analysis"),
 82 |             ("help-admin", "help_.help_admin"),
 83 |             ("help-distributed", "help_.help_distributed"),
 84 |             ("help-postgres", "help_.help_postgres"),
 85 |             ("help-smiles-format", "help_.help_smiles_format"),
 86 |             ("help-property-format", "help_.help_property_format"),
 87 |             ]
 88 |     )
 89 |     ]
 90 | _commands = {}  # flat lookup: command name -> "module.function" path
 91 | for (title, command_pairs) in command_groups:
 92 |     _commands.update(command_pair for command_pair in command_pairs)
 93 | del title, command_pairs  # do not leave the loop variables in the module namespace
 94 | 
 95 | 
 96 | epilog = """\
 97 | The 'mmpdb' program implements a set of subcommands to work with
 98 | a matched molecular pair database. The commands are roughly
 99 | categorized as "analysis", "distributed", and "admin" commands.
100 | 
101 | The analysis commands fragment a SMILES database, index the fragments
102 | into matched molecular pairs into a local SQLite database, import
103 | molecular properties into a database, and searches the database for
104 | possible transformations or predictions.
105 | 
106 | The distributed commands are used to parallelize matched molecular
107 | pair generation in a distributed cluster environment. They include
108 | ways to split a SMILES file into parts, for parallel fragmentation, to
109 | merge the fragmentations together for later re-use as a cache, to
110 | partition the fragmentations by constant for parallel indexing, and to
111 | merge the generated indices for a final database.
112 | 
113 | The admin commands are used to administer a database. They include
114 | include ways to list the available data sets, dump the data as
115 | a SMILES file or CSV file, update new properties, and
116 | re-aggregate the rule dataset should any property values change.
117 | 
118 | For a short description of how to generate and use a dataset:
119 | 
120 |   % mmpdb help-analysis
121 | 
122 | See "help-postgres" for examples using Postgres.
123 | 
124 | For a short description of how to parallize MMP generation:
125 | 
126 |   % mmpdb help-distributed
127 | 
128 | For a short description of how to adminster a database:
129 | 
130 |   % mmpdb help-admin
131 | 
132 | The "help-*-format" commands (like "help-property-format") give
133 | more details about the given format.
134 | 
135 | In addition, pass the "--help" option to a given command to see
136 | the full list of options for the command.
137 | """
138 | 
139 | 
140 | def explain(msg, *args):  # default explain handler: writes the message to stderr
141 |     full_msg = (msg % args) + "\n"  # printf-style interpolation of args into msg
142 |     sys.stderr.write(full_msg)
143 |     sys.stderr.flush()  # flush so the text is not held back in a buffer
144 | 
145 | 
146 | def get_explain(use_explain, reporter=None):  # choose the explain callable to use
147 |     if use_explain:
148 |         if reporter is None:
149 |             return explain  # no reporter given: fall back to the stderr writer in this module
150 |         else:
151 |             return reporter.explain
152 | 
153 |     from ..reporters import no_explain  # imported only when explanations are disabled
154 | 
155 |     return no_explain
156 | 
157 | 
158 | class CmdConfig:  # shared settings object; main() stores an instance as ctx.obj
159 |     def __init__(self, quiet):
160 |         self.quiet = quiet
161 |         from .. import reporters  # local import, done when a CmdConfig is constructed
162 | 
163 |         if quiet:
164 |             reporter = reporters.get_reporter("quiet")
165 |         else:
166 |             reporter = reporters.get_reporter("verbose")
167 |         self.reporter = reporter
168 | 
169 |         self.report = reporter.report  # re-export the reporter's callables on the config
170 |         self.warning = reporter.warning
171 |         self.progress = reporter.progress
172 |         self.update = reporter.update
173 |         self.explain = get_explain(False, self.reporter)  # explanations start disabled
174 | 
175 |     def set_explain(self, use_explain):  # switch self.explain between reporter.explain and no_explain
176 |         self.explain = get_explain(use_explain, self.reporter)
177 |         
178 | 
179 | # The 'main' command uses the MultiCommand click group.
180 | # This lets me use my own methods to resolve the available commands.
181 | class MultiCommand(FormatEpilog, click.MultiCommand):
182 |     def list_commands(self, ctx):
183 |         return list(_commands)
184 | 
185 |     def get_command(self, ctx, name):
186 |         # secret alias
187 |         if name == "transmogrify":
188 |             name = "generate"
189 |         path = _commands.get(name, None)
190 |         if path is None:
191 |             # Return None to indicate the command is not available.
192 |             return None
193 |         # Import the submodule then get and return the correct function.
194 |         import importlib
195 |         module_name, func_name = path.split(".")
196 |         mod = importlib.import_module("." + module_name, __name__)
197 |         return getattr(mod, func_name)
198 |     
199 |     def format_commands(self, ctx, formatter):
200 |         # Use my own command formatter so I can group them by theme.
201 |         limit = formatter.width - 6 - max(len(cmd[0]) for cmd in _commands)
202 | 
203 |         old_value = formatter.indent_increment
204 |         formatter.indent_increment = 1
205 |         try:
206 |             with formatter.section("Commands"):
207 |                 for title, command_pairs in command_groups:
208 |                     rows = []
209 |                     for subcommand, _ in command_pairs:
210 |                         cmd = self.get_command(ctx, subcommand)
211 |                         help = cmd.get_short_help_str(limit)
212 |                         rows.append((subcommand, help))
213 | 
214 |                     assert rows
215 |                     with formatter.section(title):
216 |                         formatter.write_dl(rows)
217 |         finally:
218 |             formatter.indent_increment = old_value
219 |         
220 | @click.group(epilog=epilog, cls=MultiCommand)
221 | @click.option("--quiet", "-q", is_flag=True, help="Do not show progress or status information")
222 | @click.version_option(version=__version__)
223 | @click.pass_context
224 | def main(ctx, quiet):  # top-level click group callback for the 'mmpdb' program
225 |     "Matched-molecular pair database loader"
226 |     if ctx.obj is None:  # only build a CmdConfig when ctx.obj has not been set already
227 |         ctx.obj = CmdConfig(quiet)
228 | 
229 | if __name__ == "__main__":
230 |     main()
231 | 


--------------------------------------------------------------------------------
/mmpdblib/cli/create_index.py:
--------------------------------------------------------------------------------
 1 | "Implement the 'create_index' command"
 2 | 
 3 | # mmpdb - matched molecular pair database generation and analysis
 4 | #
 5 | # Copyright (c) 2015-2017, F. Hoffmann-La Roche Ltd.
 6 | # Copyright (c) 2021, Andrew Dalke Scientific, AB
 7 | #
 8 | # Redistribution and use in source and binary forms, with or without
 9 | # modification, are permitted provided that the following conditions are
10 | # met:
11 | #
12 | #    * Redistributions of source code must retain the above copyright
13 | #      notice, this list of conditions and the following disclaimer.
14 | #    * Redistributions in binary form must reproduce the above
15 | #      copyright notice, this list of conditions and the following
16 | #      disclaimer in the documentation and/or other materials provided
17 | #      with the distribution.
18 | #    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
19 | #      its contributors may be used to endorse or promote products
20 | #      derived from this software without specific prior written
21 | #      permission.
22 | #
23 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 | #
35 | 
36 | from .click_utils import (
37 |     command,
38 |     die,
39 |     open_database_from_options_or_exit,
40 |     add_single_database_parameters,
41 | )
42 | 
43 | create_index_epilog = """
44 | 
45 | Create the database indices for DATABASE. This is mostly used during
46 | development.
47 | """
48 | 
49 | 
50 | @command(
51 |     name="create_index",
52 |     epilog=create_index_epilog,
53 | )
54 | @add_single_database_parameters()
55 | def create_index(database_options):
56 |     """Create the database indices
57 | 
58 |     DATABASE: an mmpdb database
59 |     """
60 |     import sqlite3  # deferred imports: only loaded once the command actually runs
61 |     from .. import schema
62 | 
63 |     mmpdb = open_database_from_options_or_exit(database_options)
64 | 
65 |     with mmpdb.atomic():  # NOTE(review): presumably a transaction wrapper; confirm in dbutils
66 |         try:
67 |             schema.create_index(mmpdb)
68 |         except sqlite3.Error as err:  # NOTE(review): only sqlite3 errors are turned into die()
69 |             die(f"Cannot create index: {err}")
70 | 


--------------------------------------------------------------------------------
/mmpdblib/cli/drop_index.py:
--------------------------------------------------------------------------------
 1 | "Implement the 'drop_index' command"
 2 | 
 3 | # mmpdb - matched molecular pair database generation and analysis
 4 | #
 5 | # Copyright (c) 2015-2017, F. Hoffmann-La Roche Ltd.
 6 | # Copyright (c) 2021, Andrew Dalke Scientific, AB
 7 | #
 8 | # Redistribution and use in source and binary forms, with or without
 9 | # modification, are permitted provided that the following conditions are
10 | # met:
11 | #
12 | #    * Redistributions of source code must retain the above copyright
13 | #      notice, this list of conditions and the following disclaimer.
14 | #    * Redistributions in binary form must reproduce the above
15 | #      copyright notice, this list of conditions and the following
16 | #      disclaimer in the documentation and/or other materials provided
17 | #      with the distribution.
18 | #    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
19 | #      its contributors may be used to endorse or promote products
20 | #      derived from this software without specific prior written
21 | #      permission.
22 | #
23 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 | #
35 | 
36 | from .click_utils import (
37 |     command,
38 |     add_single_database_parameters,
39 |     open_database_from_options_or_exit,
40 | )
41 | 
# Extra help text for the 'drop_index' command; it is passed to
# @command() as the epilog.
drop_index_epilog = """

Drop the database indices from DATABASE. This is mostly used during
development. The index takes about 1/2 of the size of the database, so
if you need to save space for data exchange or archival purposes then
you might drop the indices, and re-create them later when needed.
"""
49 | 
50 | 
@command(
    name="drop_index",
    epilog=drop_index_epilog,
)
@add_single_database_parameters()
def drop_index(database_options):
    """Drop the database indices

    DATABASE: the mmpdb database
    """
    # Imported here so the module is only loaded when the command runs.
    from .. import schema

    db = open_database_from_options_or_exit(database_options)

    # Remove the indices inside one atomic block ...
    with db.atomic():
        schema.drop_index(db)

    # ... and only issue the VACUUM once that block has been left.
    db.execute("VACUUM")
67 | 


--------------------------------------------------------------------------------
/mmpdblib/cli/fragdb_constants.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import click
  3 | import collections
  4 | import contextlib
  5 | 
  6 | from .click_utils import (
  7 |     command,
  8 |     die,
  9 |     GzipFile,
 10 |     add_multiple_databases_parameters,
 11 |     open_fragdb_from_options_or_exit,
 12 |     positive_int,
 13 |     nonnegative_int,
 14 |     frequency_type,
 15 | )
 16 | 
 17 | COUNT_FRAGMENTATIONS_SQL = "SELECT COUNT(*) FROM fragmentation"
 18 | def _get_num_fragmentations(c):
 19 |     (num_fragmentations,) = next(c.execute(COUNT_FRAGMENTATIONS_SQL))
 20 |     return num_fragmentations
 21 |     
 22 | 
 23 | class SingleDatabase:
 24 |     def __init__(self, database):
 25 |         self.database = database
 26 |         self.db = self.c = None
 27 | 
 28 |     def __enter__(self):
 29 |         self.db = open_fragdb_from_options_or_exit(self.database)
 30 |         self.c = self.db.cursor()
 31 |         return self
 32 | 
 33 |     def __exit__(self, *args):
 34 |         self.c.close()
 35 |         self.db.close()
 36 |         self.db = self.c = None
 37 |         
 38 |     def get_num_fragmentations(self, reporter):
 39 |         return _get_num_fragmentations(self.c)
 40 | 
 41 |     def iter_constants(self, min_constant_num_heavies, min_count, max_count, reporter):
 42 |         query = """
 43 |   SELECT constant_smiles, n
 44 |     FROM (
 45 |       SELECT constant_smiles, count(*) AS n
 46 |         FROM fragmentation
 47 |        WHERE constant_num_heavies >= ?
 48 |     GROUP BY constant_smiles
 49 |     )
 50 |    WHERE ? <= n AND n <= ?
 51 | ORDER BY n DESC, constant_smiles
 52 | """
 53 |         args = (min_constant_num_heavies, min_count, max_count)
 54 | 
 55 |         self.c.execute(query, args)
 56 |         return self.c
 57 |     
 58 | 
 59 | class MultipleDatabases:
 60 |     def __init__(self, databases):
 61 |         self.databases = databases
 62 | 
 63 |     def __enter__(self):
 64 |         return self
 65 |     
 66 |     def __exit__(self, *args):
 67 |         pass
 68 |     
 69 |     def get_num_fragmentations(self, reporter):
 70 |         n = 0
 71 |         num_databases = len(self.databases)
 72 |         for database_i, database in enumerate(self.databases, 1):
 73 |             reporter.update(f"Analyzing {database!r} (#{database_i}/{num_databases})")
 74 |             try:
 75 |                 with contextlib.closing(open_fragdb_from_options_or_exit(database)) as db:
 76 |                     with contextlib.closing(db.cursor()) as c:
 77 |                         n += _get_num_fragmentations(c)
 78 |             finally:
 79 |                 reporter.update("")
 80 |         return n
 81 | 
 82 |     def iter_constants(self, min_constant_num_heavies, min_count, max_count,
 83 |                            reporter):
 84 |         # We need to load all of the constant SMILES counts
 85 |         query = """
 86 |    SELECT constant_smiles, count(*)
 87 |      FROM fragmentation
 88 |     WHERE constant_num_heavies >= ?
 89 |  GROUP BY constant_smiles
 90 | """
 91 |         constant_counts = collections.Counter()
 92 |         num_databases = len(self.databases)
 93 |         for database_i, database in enumerate(self.databases, 1):
 94 |             reporter.update(f"Selecting constants from {database!r} (#{database_i}/{num_databases})")
 95 |             try:
 96 |                 with contextlib.closing(open_fragdb_from_options_or_exit(database)) as db:
 97 |                     with contextlib.closing(db.cursor()) as c:
 98 |                         c.execute(query, (min_constant_num_heavies,))
 99 |                         for constant_smiles, n in c:
100 |                             constant_counts[constant_smiles] += n
101 |             finally:
102 |                 reporter.update("")
103 |                         
104 |         for constant_smiles, n in constant_counts.most_common():
105 |             if min_count <= n <= max_count:
106 |                 yield constant_smiles, n
107 | 
def open_frag_dbs(databases_options):
    """Return the database wrapper matching the number of fragdb files.

    One database gives a SingleDatabase, several give a
    MultipleDatabases. Raises click.BadArgumentUsage if none is given.
    """
    databases = databases_options.databases
    if not databases:
        raise click.BadArgumentUsage("must specify at least one fragment database")
    if len(databases) != 1:
        return MultipleDatabases(databases)
    return SingleDatabase(databases[0])
116 | 
117 |                 
# Extra help text for the 'fragdb_constants' command; it is passed to
# @command() as the epilog. The tabs in the example are written as \t
# escapes so they stay visible in the source.
fragdb_constants_epilog = """

By default this lists the constants in one or more fragdb files,
ordered from most common to least. It is meant as a way to reduce the
number of constants used during indexing, and to partition the
data set for parallel indexing.

Use `--min-count` and `--max-count` to set the minimum or maximum
number of occurrences. (If a constant appears twice in the same record
then its occurrence count is 2.)

Use `--min-frequency` and `--max-frequency` to express the minimum and
maximum occurrences as a fraction of the total number of
occurrences. (NOTE: this does not seem useful and will likely be
removed unless people say it's important.)

Use `--min-heavies-total-const-frag` to set a lower bound on the number
of heavies in each constant.

Use `--min-heavies-per-const-frag` to set a lower bound on the number
of heavies in the smallest fragment in each constant.

Use `--limit` to limit the output to the first `K` constants.

By default the constants are written to stdout. Use `--output` to
write the constants to a named file. If the filename ends with `.gz`
then the output is gzip compressed.

The output is formatted in two tab-separated columns as in the
following example:

\b
```
  % mmpdb fragdb_constants example.fragdb --limit 3
  constant\tN
  *C\t1010
  *C.*C\t849
  *C.*O\t662
```

The first column contains the fragment SMILES and the second contains
the count. The first line is a header with columns named "constant" and
"N". Use `--no-header` to omit the header in the output.

"""
163 | 
164 | 
@command(
    epilog=fragdb_constants_epilog,
    name="fragdb_constants",
)
@click.option(
    "--min-count",
    type=nonnegative_int(),
)
@click.option(
    "--max-count",
    type=nonnegative_int(),
)
@click.option(
    "--min-frequency",
    "--min-freq",
    type=frequency_type(),
)
@click.option(
    "--max-frequency",
    "--max-freq",
    type=frequency_type(),
)
@click.option(
    "--min-heavies-per-const-frag",
    type=nonnegative_int(),
    help="Lower bound on the number of heavies in the smallest fragment in the constant part",
)
@click.option(
    "--min-heavies-total-const-frag",
    type=nonnegative_int(),
    default=0,
    help="Lower bound on the number of heavies in the constant part",
)
@click.option(
    "--limit",
    metavar="K",
    type=positive_int(),
    help="Limit the output to the 'K' most common constants",
)
@click.option(
    "--output",
    "-o",
    "output_file",
    default="-",
    type=GzipFile("w"),
    help="Write the result to the named file (default: stdout)",
)
@click.option(
    "--header / --no-header",
    default=True,
    help="The default, --header, includes the header in output",
)
@add_multiple_databases_parameters()
@click.pass_obj
def fragdb_constants(
    reporter,
    databases_options,
    min_count,
    max_count,
    min_frequency,
    max_frequency,
    min_heavies_per_const_frag,
    min_heavies_total_const_frag,
    limit,
    output_file,
    header,
):
    """List constants fragdb DATABASEs and their frequencies"""
    from ..index_algorithm import get_num_heavies

    # Close the output file on every path, including errors raised
    # while opening or reading the databases.
    try:
        with open_frag_dbs(databases_options) as frag_dbs:
            num_fragmentations = frag_dbs.get_num_fragmentations(reporter=reporter)

            # The per-fragment filter only applies for a positive bound.
            const_frag_filter = min_heavies_per_const_frag is not None and min_heavies_per_const_frag > 0

            if min_count is None:
                min_count = 0

            if max_count is None:
                max_count = num_fragmentations

            # A minimum frequency can only raise the minimum count.
            if min_frequency is not None:
                min_freq_count = int(math.ceil(min_frequency * num_fragmentations))
                min_count = max(min_count, min_freq_count)

            # A maximum frequency can only lower the maximum count.
            if max_frequency is not None:
                max_freq_count = int(math.floor(max_frequency * num_fragmentations))
                max_count = min(max_count, max_freq_count)

            assert isinstance(min_count, int)
            assert isinstance(max_count, int)

            c = frag_dbs.iter_constants(
                min_heavies_total_const_frag,
                min_count,
                max_count,
                reporter=reporter,
            )

            if limit is None:
                # 2**63 (9223372036854775808) constants ought to be
                # good enough for anyone
                limit = 2**63

            i = 0
            for constant_smiles, n in c:
                # Don't write the header until we have the first output line.
                # This makes the status reports easier to read as they are
                # not placed between the header and the constant lines.
                if header:
                    output_file.write("constant\tN\n")
                    # Only write the header once.
                    header = False

                # Can't put the --limit in the SQL because of
                # possible additional filtering by
                # --min-heavies-per-const-frag
                # after the SQL query
                if i >= limit:
                    break

                if const_frag_filter:
                    # Skip the constant if any of its dot-separated
                    # fragments is too small.
                    terms = constant_smiles.split(".")
                    if any(get_num_heavies(term) < min_heavies_per_const_frag for term in terms):
                        continue

                i += 1
                output_file.write(f"{constant_smiles}\t{n}\n")
    finally:
        output_file.close()
292 | 


--------------------------------------------------------------------------------
/mmpdblib/cli/fragdb_list.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import dataclasses
  3 | import click
  4 | 
  5 | from .click_utils import (
  6 |     command,
  7 |     add_multiple_databases_parameters,
  8 |     )
  9 | 
 10 | @dataclasses.dataclass
 11 | class FragDBInfo:
 12 |     filename: str
 13 |     num_compounds: int
 14 |     num_error_compounds: int
 15 |     num_fragmentations: int
 16 |     num_constants: int
 17 |     num_variables: int
 18 |     max_num_pairs: int
 19 |     options: object
 20 | 
 21 |     def get_cols(self):
 22 |         return [
 23 |             self.filename,
 24 |             str(self.num_compounds),
 25 |             str(self.num_error_compounds),
 26 |             str(self.num_fragmentations),
 27 |             str(self.num_constants),
 28 |             str(self.num_variables),
 29 |             str(self.max_num_pairs),
 30 |             ]
 31 | 
 32 | def write(terms):
 33 |     sys.stdout.write(" ".join(terms) + "\n")
 34 | 
 35 | def get_info(filename, reporter):
 36 |     from .. import fragment_db
 37 |     try:
 38 |         db = fragment_db.open_fragdb(filename)
 39 |     except IOError as err:
 40 |         reporter.warning(f"Cannot open database: {err} -- Skipping.")
 41 |         return None
 42 |     except ValueError as err:
 43 |         reporter.warning(f"Cannot use database: {err} -- Skipping.")
 44 |         return None
 45 | 
 46 |     def _get_one(sql):
 47 |         c.execute(sql)
 48 |         for (n,) in c:
 49 |             return n
 50 |         raise AssertionError("cannot get one", sql)
 51 |     
 52 |     with db:
 53 |         c = db.cursor()
 54 |         return FragDBInfo(
 55 |             filename = filename,
 56 |             num_compounds = _get_one(
 57 |                 "SELECT COUNT(*) FROM record"
 58 |                 ),
 59 |             num_error_compounds = _get_one(
 60 |                 "SELECT COUNT(*) FROM error_record",
 61 |                 ),
 62 |             num_fragmentations = _get_one(
 63 |                 "SELECT COUNT(*) FROM fragmentation",
 64 |                 ),
 65 |             num_constants = _get_one(
 66 |                 "SELECT COUNT(DISTINCT constant_smiles) FROM fragmentation",
 67 |                 ),
 68 |             num_variables = _get_one(
 69 |                 "SELECT COUNT(DISTINCT variable_smiles) FROM fragmentation",
 70 |                 ),
 71 |             max_num_pairs = _get_one(
 72 |                 # XXX Should I have a +1 for single-cut constants?
 73 |                 "SELECT SUM(i*(i-1)/2) FROM (SELECT COUNT(*) AS i FROM fragmentation GROUP BY constant_smiles)",
 74 |                 ),
 75 |             options = db.options,
 76 |             )
 77 |             
 78 |     
 79 | @command(
 80 |     name = "fragdb_list",
 81 |     )
 82 | 
 83 | @click.option(
 84 |     "--all",
 85 |     "-a",
 86 |     "show_all",
 87 |     is_flag = True,
 88 |     default = False,
 89 |     help = "Include option information",
 90 |     )
 91 | 
 92 | @add_multiple_databases_parameters()
 93 | @click.pass_obj
 94 | def fragdb_list(
 95 |         reporter,
 96 |         databases_options,
 97 |         show_all,
 98 |         ):
 99 |     """Summarize zero or more fragdb databases
100 | 
101 |     If no DATABASE is given then look for '*.fragdb' in the current directory.
102 |     """
103 | 
104 |     databases = databases_options.databases
105 |     if not databases:
106 |         import glob
107 |         databases = glob.glob("*.fragdb")
108 |         databases.sort()
109 | 
110 |     col_headers = [        
111 |         "Name",
112 |         "#recs",
113 |         "#errs",
114 |         "#frags",
115 |         "#consts",
116 |         "#vars",
117 |         "max.#pairs",
118 |         ]
119 |     col_sizes = [len(s) for s in col_headers]
120 |         
121 |     info_list = []
122 |     rows = []
123 | 
124 |     for database in databases:
125 |         info = get_info(database, reporter)
126 |         if info is None:
127 |             continue
128 |         info_list.append(info)
129 |         
130 |         cols = info.get_cols()
131 |         rows.append(cols)
132 | 
133 |         # Figure out the columns sizes
134 |         for i, col in enumerate(cols):
135 |             n = len(col)
136 |             if n > col_sizes[i]:
137 |                 col_sizes[i] = n
138 | 
139 |     write(header.center(col_size)
140 |               for header, col_size in zip(col_headers, col_sizes))
141 | 
142 |     for info, row in zip(info_list, rows):
143 |         write(col.rjust(col_size)
144 |                   for col, col_size in zip(row, col_sizes))
145 |         if show_all:
146 |             sys.stdout.write("        Fragment options:\n")
147 |             d = info.options.to_dict()
148 |             for k, v in d.items():
149 |                 sys.stdout.write(f"          {k}: {v}\n")
150 | 


--------------------------------------------------------------------------------
/mmpdblib/cli/fragdb_merge.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import click
  3 | 
  4 | from .click_utils import (
  5 |     command,
  6 |     die,
  7 |     )
  8 | 
  9 | 
 10 | fragdb_merge_epilog = """
 11 | 
 12 | The fragdb_merge command is designed to merge the fragdb files
 13 | generated by multiple `mmpdb fragment` outputs into a single
 14 | `--output` fragdb files. It CANNNOT be used to merge partitioned
 15 | fragdb files!
 16 | 
 17 | NOTE: This command is *NOT* needed if you want to partition on the
 18 | constants using distributed computing because `mmpdb fragdb_constants`
 19 | and `mmpdb fragdb_partition` both work on a one or more fragdb files
 20 | to generate a partitioned set of fragdb file.
 21 | 
 22 | Instead, it should be used for single-threaded indexing, and to create
 23 | a cache file for future re-fragmentations.
 24 | 
 25 | """
 26 |         
 27 | 
# SQL script run by fragdb_merge() once for each input file, after
# that file has been attached to the output database as 'old'.
fragdb_merge_sql = """
-- This expects the database to import to be attached as 'old'
-- using something like:
--   attach database "subset.000.fragdb" as old

-- Step 1: Copy over the error_record table


INSERT INTO error_record (title, input_smiles, errmsg)
 SELECT title, input_smiles, errmsg
   FROM old.error_record
        ;

-- Step 2: Copy over the record table


INSERT INTO record (title, input_smiles, num_normalized_heavies, normalized_smiles)
 SELECT title, input_smiles, num_normalized_heavies, normalized_smiles
   FROM old.record
        ;


-- Step 3: Copy over the fragmentation table

-- The fragmentation is self-contained. All we need to do is get the
-- correct fragment id, which we can do with a simple title lookup
-- because the title was added in step 2.

INSERT INTO fragmentation (
	record_id,
	num_cuts,
	enumeration_label,
	variable_num_heavies,
	variable_symmetry_class,
	variable_smiles,
	attachment_order,
	constant_num_heavies,
	constant_symmetry_class,
	constant_smiles,
	constant_with_H_smiles)
 SELECT new_record.id,
	old_fragmentation.num_cuts,
	old_fragmentation.enumeration_label,
	old_fragmentation.variable_num_heavies,
	old_fragmentation.variable_symmetry_class,
	old_fragmentation.variable_smiles,
	old_fragmentation.attachment_order,
	old_fragmentation.constant_num_heavies,
	old_fragmentation.constant_symmetry_class,
	old_fragmentation.constant_smiles,
	old_fragmentation.constant_with_H_smiles
   FROM record as new_record,
        old.record as old_record,
        old.fragmentation as old_fragmentation
  WHERE old_record.title = new_record.title AND
        old_fragmentation.record_id = old_record.id
        ;
"""
 86 | 
 87 | def open_output_fragdb(filename, options):
 88 |     import sqlite3
 89 |     from .. import fragment_db
 90 |     from .. import schema
 91 |     
 92 |     # Remove any existing file.
 93 |     try:
 94 |         os.unlink(filename)
 95 |     except FileNotFoundError:
 96 |         pass
 97 |     db = sqlite3.connect(filename)
 98 |     c = db.cursor()
 99 |     fragment_db.init_fragdb(c, options)
100 |     schema._execute_sql(c, fragment_db.get_fragment_create_index_sql())
101 |     return db, c
102 | 
def check_options_mismatch(filename, options, first_filename, first_options):
    """Call die() if `options` differs from `first_options`.

    Both arguments must have a to_dict() method. Returns None when the
    dictionaries are equal. Otherwise die() is called with a heading
    and one line per differing option. An option present in only one
    of the two files is shown as "<missing>" on the other side.
    """
    d = options.to_dict()
    first_d = first_options.to_dict()
    if d == first_d:
        return

    # Sentinel for a key that only one of the two dictionaries has.
    missing = object()

    def show(value):
        return "<missing>" if value is missing else repr(value)

    # Figure out which values are different. Use the keys of both
    # dictionaries so that differing key sets are reported instead of
    # raising KeyError or being skipped.
    lines = [f"Cannot merge. The options in {filename!r} differ from {first_filename!r}."]
    all_keys = list(d) + [k for k in first_d if k not in d]
    for k in all_keys:
        value = d.get(k, missing)
        first_value = first_d.get(k, missing)
        if value != first_value:
            lines.append(f"  {k}: {show(value)} != {show(first_value)}")
    die(*lines)
115 | 
@command(
    name="fragdb_merge",
    epilog=fragdb_merge_epilog,
)
@click.option(
    "--output",
    "-o",
    "output_filename",
    help='Name of the merged database (default: "merged.fragdb")',
    default=None,
)
@click.argument(
    "filenames",
    metavar="FILENAME",
    nargs=-1,
    required=True,
)
@click.pass_obj
def fragdb_merge(
    reporter,
    filenames,
    output_filename,
):
    """Merge multiple fragdb files into one"""
    assert filenames, "should have been handled by click"
    from .. import fragment_db, schema
    import sqlite3

    if output_filename is None:
        output_filename = "merged.fragdb"
        reporter.report(f"No --output file name specified. Using {output_filename!r}.")

    # The first input file supplies the fragment options used to create
    # the output database; every later file must have the same options.
    first_filename = None
    first_options = None
    output_db = None
    output_c = None

    num_records = num_error_records = None
    try:
        for filename in filenames:
            # Ensure it's a valid SQLite database
            try:
                old_db = fragment_db.open_fragdb(filename)
            except IOError as err:
                # Same handling as get_info() in fragdb_list: report an
                # unreadable file instead of showing a traceback.
                die(f"Cannot open database: {err}")
            except ValueError as err:
                die(str(err))
            old_options = old_db.options
            old_db.close()

            if first_options is None:
                first_options = old_options
                first_filename = filename
                try:
                    output_db, output_c = open_output_fragdb(
                        output_filename,
                        first_options,
                    )
                except sqlite3.OperationalError as err:
                    die(f"Error trying to open {output_filename!r} for writing: {err}")
            else:
                check_options_mismatch(filename, old_options, first_filename, first_options)

            # fragdb_merge_sql expects the input to be attached as 'old'.
            try:
                output_c.execute("ATTACH DATABASE ? AS old", (filename,))
            except sqlite3.OperationalError as err:
                die(f"Cannot attach {filename!r} using sqlite3: {err}")

            try:
                # Check for any duplicate record ids
                output_c.execute("""
SELECT old_record.title
  FROM old.record as old_record, record as new_record
 WHERE old_record.title = new_record.title
""")
                for (title,) in output_c:
                    die(
                        f"Cannot merge {filename!r}: Duplicate record id {title!r}.",
                        "  (Use 'fragdb_merge' to merge fragdb files from fragmenting different SMILES files,",
                        "   not to merge the fragdb files generated by 'fragdb_partition'.)"
                    )

                # We're free to merge!
                schema._execute_sql(output_c, fragdb_merge_sql)

            finally:
                # Commit, detach the input, then open the transaction
                # that the next iteration (or the final COMMIT below)
                # will close.
                output_c.execute("COMMIT")
                output_c.execute("DETACH DATABASE old")
                output_c.execute("BEGIN TRANSACTION")

    finally:
        if output_c is not None:
            output_c.execute("COMMIT")
            # Get the totals for the summary before closing.
            num_records, = next(output_c.execute("SELECT count(*) from record"))
            num_error_records, = next(output_c.execute("SELECT count(*) from error_record"))
            output_c.close()
            output_db.close()

    if num_records is not None and num_error_records is not None:
        reporter.report(
            "Merge complete. "
            f"#files: {len(filenames)} "
            f"#records: {num_records} "
            f"#error records: {num_error_records}"
        )
221 | 


--------------------------------------------------------------------------------
/mmpdblib/cli/fragment.py:
--------------------------------------------------------------------------------
  1 | "Implement the 'fragment' command"
  2 | 
  3 | # mmpdb - matched molecular pair database generation and analysis
  4 | #
  5 | # Copyright (c) 2015-2017, F. Hoffmann-La Roche Ltd.
  6 | # Copyright (c) 2019-2021, Andrew Dalke Scientific, AB
  7 | #
  8 | # Redistribution and use in source and binary forms, with or without
  9 | # modification, are permitted provided that the following conditions are
 10 | # met:
 11 | #
 12 | #    * Redistributions of source code must retain the above copyright
 13 | #      notice, this list of conditions and the following disclaimer.
 14 | #    * Redistributions in binary form must reproduce the above
 15 | #      copyright notice, this list of conditions and the following
 16 | #      disclaimer in the documentation and/or other materials provided
 17 | #      with the distribution.
 18 | #    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
 19 | #      its contributors may be used to endorse or promote products
 20 | #      derived from this software without specific prior written
 21 | #      permission.
 22 | #
 23 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 24 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 25 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 26 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 27 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 28 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 29 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 30 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 31 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 32 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 33 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 34 | #
 35 | 
 36 | import click
 37 | 
 38 | 
 39 | from .. import smarts_aliases
 40 | from .click_utils import (
 41 |     command,
 42 |     die,
 43 |     name_to_command_line,
 44 |     ordered_group,
 45 |     positive_int,
 46 | )
 47 | from . import fragment_click
 48 | from . import smi_utils
 49 | 
 50 | 
 51 | # Make it so that ^C works in the main thread
 52 | def init_worker():
 53 |     import signal
 54 | 
 55 |     signal.signal(signal.SIGINT, signal.SIG_IGN)
 56 | 
 57 | 
 58 | def create_pool(num_jobs):
 59 |     import multiprocessing
 60 |     from .. import fragment_records
 61 | 
 62 |     if num_jobs > 1:
 63 |         pool = multiprocessing.Pool(num_jobs, init_worker)
 64 |     else:
 65 |         pool = fragment_records.SingleProcessPool()
 66 |     return pool
 67 | 
 68 | 
 69 | ############ The fragment cli
 70 | 
 71 | 
 72 | fragment_epilog = """
 73 | Fragment molecules in a SMILES file by breaking on 'cut bonds', as
 74 | matched by --cut-smarts or the R-group SMILES of --cut-rgroup or
 75 | --cut-rgroup-file. Cut up to --num-cuts bonds. Don't fragment
 76 | molecules with more than --max-rotatable-bonds bonds or --max-heavies
 77 | heavy atoms. Don't create multiple cuts if the fragments in the
 78 | constant part have less than --min-heavies-per-const-frag atoms.  See
 79 | 'mmpdb rgroup2smarts' for details about cutting with R-group SMILES.
 80 | 
 81 | The input structures come from a SMILES file. By default the fields
 82 | are whitespace delimited, where the first column contains the SMILES
 83 | and the second contains the identifier. Use --delimiter to change
 84 | the delimiter type, and --has-header to skip the first line. See
 85 | "mmpdb help-smiles-format" for more details.
 86 | 
 87 | The input SMILES strings are pre-processed to remove salts before
 88 | fragmenting. The default uses the default RDKit SmilesRemover. Use
 89 | --salt-remover to specify alternative rules, or use the special
 90 | name '<none>' to not remove any salts.
 91 | 
 92 | By default the fragmentation method uses 4 threads, which gives a
 93 | nearly 4-fold speedup. Use --num-jobs change the number of threads.
 94 | 
 95 | It can take a while to generate fragments. Suppose you want to update
 96 | the compound set every week, where only a few records are added or
 97 | changed. Most of the fragments will be the same between those two data
 98 | sets. What you can do is specify the old fragdb file as a --cache so
 99 | the fragmentation method can re-use the old fragmentation, assuming
100 | the structure hasn't changed for a given record.
101 | 
102 | If you do not specify the `--output` filename then the default is
103 | based on the SMILES filename with the "smi" or "smi.gz" extension
104 | replace with "fragdb". If there is no input SMILES filename because
105 | the data is from stdin then the default filename is `input.mmpdb`.
106 | 
107 | Examples:
108 | 
109 | 1) Fragment the SMILES file to produce the fragdb file
110 | `CHEMBL_thrombin_Ki_IC50.fragdb` (the output name is based on the
111 | input SMILES filename):
112 | 
113 | \b
114 |   % mmpdb fragment CHEMBL_thrombin_Ki_IC50.smi
115 | 
116 | 2) Do the same, but with an explicit output filename:
117 | 
118 | \b
119 |   % mmpdb fragment CHEMBL_thrombin_Ki_IC50.smi \\
120 |       -o CHEMBL_thrombin_Ki_IC50.fragdb
121 | 
122 | 3) Read from a gzip-compressed tab-delimited SMILES file. Use 8
123 | threads to fragment the structures. Save the results to
124 | dataset.fragdb:
125 | 
126 | \b
127 |   % mmpdb fragment --delimiter tab dataset.smi.gz --num-jobs 8 \\
128 |       -o dataset.fragdb
129 | 
130 | 4) Fragment the SMILES in 'dataset.smi.gz'. Reuse fragment information
131 | from the cache file 'old_dataset.fragdb' if possible, instead of
132 | computing the fragments from scratch each time. Save the results to
133 | 'new_dataset.fragdb'.
134 | 
135 | \b
136 |   % mmpdb fragment --cache old_dataset.fragdb dataset.smi.gz \\
137 |       -o new_dataset.fragdb
138 | 
139 | \b
140 | """ + smarts_aliases.get_epilog(
141 |     "--cut-smarts", smarts_aliases.cut_smarts_aliases
142 | )
143 | 
144 | 
def cannot_combine_with_fragment_options(ctx, cache):
    """Refuse command lines which mix --cache with fragmentation options.

    Does nothing when `cache` is None, or when no fragmentation option
    names were recorded in `ctx.meta`. Otherwise raises click.UsageError
    listing the offending options, sorted, after each recorded name has
    been passed through name_to_command_line().
    """
    if cache is not None:
        recorded = ctx.meta[fragment_click.FRAGMENTATION_OPTION_NAMES]
        if recorded:
            flags = sorted(map(name_to_command_line, recorded))
            *leading, final = flags
            if not leading:
                # Only one conflicting option
                raise click.UsageError(f"Cannot combine {final} with --cache")
            joined = ", ".join(leading)
            raise click.UsageError(f"Cannot combine {joined} or {final} with --cache")
158 | 
159 | 
@command(epilog=fragment_epilog)
@fragment_click.add_fragment_options
@click.option(
    "--cache",
    metavar="FRAGDB",
    help="Get fragment parameters and previous fragment information the FRAGDB file",
)
@click.option(
    "--num-jobs",
    "-j",
    metavar="N",
    type=positive_int(),
    default=4,
    help="Number of jobs to process in parallel (default: 4)",
)
@smi_utils.add_input_options
@click.option(
    "--output",
    "-o",
    metavar="FILENAME",
    help="Save the fragment data to FILENAME (default: based on the structure filename)",
)
@click.argument(
    "structure_filename",
    default=None,
    required=False,
    metavar="FILENAME",
    # help = "SMILES filename (default: read from stdin)",
)
@click.pass_context
def fragment(
    ctx,
    # from add_fragment_options
    fragment_options,
    # 'fragment'-specific arguments
    cache,
    num_jobs,
    # SMILES input options
    input_options,
    # output options
    output,
    # input
    structure_filename,
):
    """Fragment SMILES file structures on rotatable bonds

    FILENAME: SMILES file (default: read from stdin)

    The output is a 'fragdb' file containing fragmentations which can
    be used by `mmpdb index` or as cache for another `mmpdb fragment`.

    """
    # NOTE: the docstring above is used as the command's help text, so
    # implementation notes are kept in comments instead.
    from .. import (
        fragment_db,
        fragment_types,
        fragment_records,
        fileio,
    )

    config = ctx.obj

    # The fragmentation parameters come from the cache when --cache is used,
    # so explicitly specified fragmentation options are a usage error.
    cannot_combine_with_fragment_options(ctx, cache)

    # Figure out the output filename, deriving it from the input name if needed.
    output_filename = output
    if output_filename is None:
        if structure_filename is None:
            # Reading from stdin; there is no input filename to build on.
            output_filename = "input.fragdb"
        else:
            output_filename = fileio.remove_suffixes(structure_filename) + ".fragdb"
        config.report(f"Using {output_filename!r} as the default --output file.")

    # Use a cache?
    cache_db = None
    if cache is not None:
        try:
            cache_db = fragment_db.open_fragdb(cache)
        except IOError as err:
            die(f"Cannot open cache: {err}")
        except ValueError as err:
            die(f"Problem loading cache: {err}")

        # The filter is built from the options stored in the cache file.
        try:
            fragment_filter = cache_db.options.get_fragment_filter()
        except fragment_types.FragmentValueError as err:
            # Fixed: the '!r' conversion belongs inside the braces; it was
            # previously emitted as the literal text "!r" after the value.
            die(f"Error in cache option {err.name!r} ({err.value!r}) from {cache!r}: {err.reason}")
    else:
        # The filter is built from the command-line options (with defaults).
        try:
            fragment_filter = fragment_options.get_fragment_filter()
        except fragment_types.FragmentValueError as err:
            die("Error in command-line option %r (%r): %s" % (name_to_command_line(err.name), err.value, err.reason))

    pool = create_pool(num_jobs)

    try:
        try:
            with input_options.read_smiles_file(structure_filename) as reader:

                with fragment_db.open_fragment_writer(
                    output_filename,
                    options=fragment_filter.options,
                ) as writer:

                    # Fragment the input records (the cache database and the
                    # pool are handed to make_fragment_records) and save them.
                    records = fragment_records.make_fragment_records(
                        reader,
                        fragment_filter,
                        cache_db,
                        pool=pool,
                        reporter=config,
                    )
                    writer.write_records(records)

        except fileio.FileFormatError as err:
            die(f"Cannot parse input file: {err}")

        except UnicodeDecodeError as err:
            die(f"Error processing input file: {err} at {reader.location.where()}")

    except KeyboardInterrupt:
        # Interrupted by the user: terminate the pool rather than closing it.
        config.update("Shutting down process pool")
        pool.terminate()
        pool.join()
        config.update("")
        raise SystemExit(-1)
    else:
        # Normal completion: close the pool and wait for it to finish.
        config.update("Closing process pool")
        pool.close()
        pool.join()
        config.update("")
287 | 
288 | 
289 | ####### "fragdb_utils" group
290 | 
291 | 
# Command group created with ordered_group(). The function body is empty;
# the function object is used to register commands through the
# @fragdb_utils.command() decorators which follow.
@ordered_group()
def fragdb_utils():
    pass
295 | 
296 | 
# Registered as a command of the "fragdb_utils" group. The body is empty.
# NOTE(review): looks like a placeholder - confirm whether the real
# implementation is expected to live elsewhere.
@fragdb_utils.command()
def fragdb_ls():
    pass
300 | 
301 | 
# Registered as a command of the "fragdb_utils" group. The body is empty.
# NOTE(review): looks like a placeholder - confirm whether the real
# implementation is expected to live elsewhere.
@fragdb_utils.command()
def fragdb_constant_stats():
    pass
305 | 


--------------------------------------------------------------------------------
/mmpdblib/cli/fragment_click.py:
--------------------------------------------------------------------------------
  1 | """Helper functions related to fragmentation"""
  2 | 
  3 | # mmpdb - matched molecular pair database generation and analysis
  4 | #
  5 | # Copyright (c) 2015-2017, F. Hoffmann-La Roche Ltd.
  6 | # Copyright (c) 2021, Andrew Dalke Scientific AB
  7 | #
  8 | # Redistribution and use in source and binary forms, with or without
  9 | # modification, are permitted provided that the following conditions are
 10 | # met:
 11 | #
 12 | #    * Redistributions of source code must retain the above copyright
 13 | #      notice, this list of conditions and the following disclaimer.
 14 | #    * Redistributions in binary form must reproduce the above
 15 | #      copyright notice, this list of conditions and the following
 16 | #      disclaimer in the documentation and/or other materials provided
 17 | #      with the distribution.
 18 | #    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
 19 | #      its contributors may be used to endorse or promote products
 20 | #      derived from this software without specific prior written
 21 | #      permission.
 22 | #
 23 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 24 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 25 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 26 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 27 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 28 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 29 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 30 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 31 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 32 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 33 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 34 | #
 35 | 
 36 | import click
 37 | 
 38 | from .click_utils import (
 39 |     IntChoice,
 40 |     die,
 41 |     pop_known_args,
 42 |     positive_int_or_none,
 43 |     nonnegative_int,
 44 |     set_click_attrs,
 45 | )
 46 | 
 47 | from .. import config
 48 | from .. import smarts_aliases
 49 | 
 50 | 
 51 | def callback_chain(*funcs):
 52 |     def caller(ctx, param, value):
 53 |         for func in funcs:
 54 |             value = func(ctx, param, value)
 55 |         return value
 56 | 
 57 |     return caller
 58 | 
 59 | 
 60 | EXCLUSION_CUTS = "mmpdblib.exclusion.cuts"
 61 | 
 62 | 
 63 | def mutual_exclusion_cuts(ctx, param, value):
 64 |     if not value or ctx.resilient_parsing:
 65 |         return value
 66 |     name = param.name
 67 |     prev_name = ctx.meta.get(EXCLUSION_CUTS, None)
 68 |     if prev_name is None:
 69 |         ctx.meta[EXCLUSION_CUTS] = name
 70 |         return value
 71 |     if prev_name == name:
 72 |         return value
 73 |     raise click.UsageError("Cannot specify more than one of --cut-smarts, --cut-rgroup, or --cut-rgroup-file")
 74 | 
 75 | 
 76 | # Record information about which fragmentation options are used so
 77 | # "fragment" can detect conflicts in mixing these with --cache
 78 | FRAGMENTATION_OPTION_NAMES = "mmpdblib.fragment_option_names"
 79 | 
 80 | 
 81 | def record_used_options(ctx, param, value):
 82 |     used_names = ctx.meta.setdefault(FRAGMENTATION_OPTION_NAMES, set())
 83 |     if not value:
 84 |         return value
 85 |     used_names.add(param.name)
 86 |     return value
 87 | 
 88 | 
 89 | #### @add_fragment_options
 90 | 
 91 | 
 92 | def add_fragment_options(command):
 93 |     OPTS = config.DEFAULT_FRAGMENT_OPTIONS
 94 | 
 95 |     param_names = []
 96 | 
 97 |     def add_option(*args, **kwargs):
 98 |         # Keep track of the parameter names used
 99 |         param_names.append(args[0].lstrip("-").replace("-", "_"))
100 | 
101 |         # Want to track which fragment options were used, in case one is
102 |         # specified with --cache.  This only works if the default is None,
103 |         # so we can't use
104 |         #    default = OPT.option_name
105 |         # These defaults are instead set at the very end.
106 |         callback = record_used_options
107 |         if "callback" in kwargs:
108 |             callback = callback_chain(callback, kwargs["callback"])
109 |         kwargs["callback"] = callback
110 | 
111 |         click.option(*args, **kwargs)(command)
112 | 
113 |     add_option(
114 |         "--max-heavies",
115 |         type=positive_int_or_none(),
116 |         help=f"Maximum number of non-hydrogen atoms, or 'none' (default: {OPTS.max_heavies})",
117 |     )
118 | 
119 |     add_option(
120 |         "--max-rotatable-bonds",
121 |         type=positive_int_or_none(),
122 |         help=f"Maximum number of rotatable bonds (default: {OPTS.max_rotatable_bonds})",
123 |     )
124 | 
125 |     add_option(
126 |         "--rotatable-smarts",
127 |         metavar="SMARTS",
128 |         help=f"SMARTS pattern to detect rotatable bonds (default: {OPTS.rotatable_smarts!r})",
129 |     )
130 | 
131 |     add_option(
132 |         "--salt-remover",
133 |         metavar="FILENAME",
134 |         help=(
135 |             "File containing RDKit SaltRemover definitions. The default ('<default>') "
136 |             "uses RDKit's standard salt remover. Use '<none>' to not remove salts."
137 |         ),
138 |     )
139 | 
140 |     ## These are mutually exclusive group
141 |     alias_names = ", ".join(repr(alias.name) for alias in smarts_aliases.cut_smarts_aliases)
142 | 
143 |     add_option(
144 |         "--cut-smarts",
145 |         metavar="SMARTS",
146 |         help=(
147 |             f"Alternate SMARTS pattern to use for cutting (default: {OPTS.cut_smarts!r}), "
148 |             f"or use one of: {alias_names}"
149 |         ),
150 |         callback=mutual_exclusion_cuts,
151 |     )
152 | 
153 |     add_option(
154 |         "--cut-rgroup",
155 |         metavar="SMILES",
156 |         multiple=True,
157 |         help="Cut on the attachment point for the given R-group SMILES",
158 |         callback=mutual_exclusion_cuts,
159 |     )
160 | 
161 |     add_option(
162 |         "--cut-rgroup-file",
163 |         metavar="FILENAME",
164 |         help="Read R-group SMILES from the named file",
165 |         callback=mutual_exclusion_cuts,
166 |     )
167 | 
168 |     ##
169 |     add_option(
170 |         "--num-cuts",
171 |         type=IntChoice(["1", "2", "3"]),
172 |         help=f"Number of cuts to use (default: {OPTS.num_cuts})",
173 |     )
174 | 
175 |     add_option(
176 |         "--min-heavies-per-const-frag",
177 |         type=nonnegative_int(),
178 |         metavar="N",
179 |         help=(
180 |             "Ignore fragmentations where one or more constant fragments have "
181 |             f"fewer than N heavy atoms (default: {OPTS.min_heavies_per_const_frag})"
182 |         ),
183 |     )
184 |     
185 |     add_option(
186 |         "--min-heavies-total-const-frag",
187 |         type=nonnegative_int(),
188 |         metavar="N",
189 |         help=(
190 |             "Ignore fragmentations where there are fewer than N heavy atoms in the "
191 |             "total constant fragment  (default: {OPTS.min_heavies_total_const_frag})"
192 |         ),
193 |     )
194 |     
195 |     add_option(
196 |         "--max-up-enumerations",
197 |         type=nonnegative_int(),
198 |         metavar="N",
199 |         help=(
200 |             "Maximum number of up-enumerations "
201 |             f"(default: {OPTS.max_up_enumerations})"
202 |         ),
203 |     )
204 | 
205 |     # Wrap the command to convert the fragment option parameters
206 |     # into a single object
207 |     def make_fragment_options_wrapper(**kwargs):
208 |         # Fill in the defaults, or use None if there aren't defaults (eg, for
209 |         # --cut-rgroups and --cut-rgroup-file).
210 |         popped_kwargs = pop_known_args(param_names, kwargs, OPTS)
211 | 
212 |         kwargs["fragment_options"] = make_fragment_options(**popped_kwargs)
213 | 
214 |         # Forward to the command
215 |         return command(**kwargs)
216 | 
217 |     set_click_attrs(make_fragment_options_wrapper, command)
218 | 
219 |     return make_fragment_options_wrapper
220 | 
221 | 
222 | ########
223 | 
224 | 
def make_fragment_options(
    *,
    max_heavies,
    max_rotatable_bonds,
    rotatable_smarts,
    cut_rgroup_file,
    cut_rgroup,
    cut_smarts,
    num_cuts,
    salt_remover,
    min_heavies_per_const_frag,
    min_heavies_total_const_frag,
    max_up_enumerations,
):
    """Convert the fragmentation option values into a FragmentOptions.

    The cut SMARTS is taken from, in priority order: the R-groups in
    `cut_rgroup_file`, the `cut_rgroup` SMILES, or `cut_smarts` (replaced
    by the alias definition if it names a known alias). Problems with the
    R-group inputs are reported through die().

    The string "none" for `max_heavies` or `max_rotatable_bonds` is
    passed on as None. The method is always "chiral".
    """
    from .. import fragment_types, rgroup2smarts

    if cut_rgroup_file is not None:
        # The R-group definitions come from a file
        try:
            cut_smarts = rgroup2smarts.get_recursive_smarts_from_cut_filename(cut_rgroup_file)
        except OSError as exc:
            die(f"Cannot use --cut-rgroup-file: {cut_rgroup_file!r}: {exc}")
        except rgroup2smarts.ParseError as exc:
            die(f"Cannot parse --cut-rgroup-file: {cut_rgroup_file!r}: {exc}")
        except rgroup2smarts.ConversionError as exc:
            die(f"Error in --cut-rgroup-file: {cut_rgroup_file!r}: {exc}")
    elif cut_rgroup:
        # The R-group definitions were given directly on the command-line
        try:
            cut_smarts = rgroup2smarts.get_recursive_smarts_from_cut_rgroups(
                cut_rgroup,
                source="--cut-rgroup",
                offset=1,
            )
        except rgroup2smarts.ConversionError as exc:
            die(str(exc))
    elif cut_smarts in smarts_aliases.cut_smarts_aliases_by_name:
        # An alias name was given instead of a SMARTS; use its definition
        cut_smarts = smarts_aliases.cut_smarts_aliases_by_name[cut_smarts].smarts

    return fragment_types.FragmentOptions(
        max_heavies=None if max_heavies == "none" else max_heavies,
        max_rotatable_bonds=None if max_rotatable_bonds == "none" else max_rotatable_bonds,
        rotatable_smarts=rotatable_smarts,
        cut_smarts=cut_smarts,
        num_cuts=num_cuts,
        salt_remover=salt_remover,
        method="chiral",
        min_heavies_per_const_frag=min_heavies_per_const_frag,
        min_heavies_total_const_frag=min_heavies_total_const_frag,
        max_up_enumerations=max_up_enumerations,
    )
289 | 


--------------------------------------------------------------------------------
/mmpdblib/cli/list_.py:
--------------------------------------------------------------------------------
  1 | "Implement the 'list' command"
  2 | 
  3 | # mmpdb - matched molecular pair database generation and analysis
  4 | #
  5 | # Copyright (c) 2015-2017, F. Hoffmann-La Roche Ltd.
  6 | # Copyright (c) 2019-2021, Andrew Dalke Scientific, AB
  7 | #
  8 | # Redistribution and use in source and binary forms, with or without
  9 | # modification, are permitted provided that the following conditions are
 10 | # met:
 11 | #
 12 | #    * Redistributions of source code must retain the above copyright
 13 | #      notice, this list of conditions and the following disclaimer.
 14 | #    * Redistributions in binary form must reproduce the above
 15 | #      copyright notice, this list of conditions and the following
 16 | #      disclaimer in the documentation and/or other materials provided
 17 | #      with the distribution.
 18 | #    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
 19 | #      its contributors may be used to endorse or promote products
 20 | #      derived from this software without specific prior written
 21 | #      permission.
 22 | #
 23 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 24 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 25 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 26 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 27 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 28 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 29 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 30 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 31 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 32 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 33 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 34 | #
 35 | 
 36 | import click
 37 | import sys
 38 | 
 39 | from .click_utils import (
 40 |     command,
 41 |     add_multiple_databases_parameters,
 42 | )
 43 | 
 44 | 
# Epilog text for the "list" command; it is passed as `epilog=` to the
# @command() decorator of list_(). Paragraphs preceded by a "\b" line are
# pre-formatted example output.
list_epilog = """

In the simplest case, look in the current directory for files matching
'*.mmpdb', open each one, and print a terse summary of the information
in the database.

\b
  % mmpdb list
               Name             #cmpds  #rules  #pairs   #envs    #stats   |---------------- Title -----------------| Properties
  CHEMBL_thrombin_Ki_IC50.mmpdb   2985   29513   258294   199657        0  MMPs from 'CHEMBL_thrombin_Ki_IC50.fragdb' <none>
                     csdP.mmpdb   8473 2018581 12372084 12145254 12145254  CSD MP                                     MP

The output is a set of columns. The first line is the header. The first
column contains the database name. The next columns contain the number
of compounds, number of rules, number of pairs (a rule may have many
matched molecular pairs), number of rule environments (a rule may have
many environments), and number of property statistics for the rule
environments. After that is the user-defined title field, followed by
a list of the property or activity names stored.

The first entry, for thrombin, has no properties, which is why it also
has no property statistics. The second entry has a 'MP' property,
which in this case means 'melting point'.

The specific database location(s) can be given on the
command-line. The '--all' option shows more detailed information about
the dataset. The following gives more detailed information about the
database 'csd.mmpdb':

\b
  % mmpdb list --all csd.mmpdb
     Name   #cmpds  #rules  #pairs   #envs    #stats   |Title| Properties
  csd.mmpdb   8473 2018581 12372084 12145254 12145254  CSD MP  MP
        Created: 2022-10-08 14:50:16.595104
          #compounds/property:  8473/MP
          #smiles for rules: 56778  for constants: 10759
          Fragment options:
            cut_smarts: [#6+0;!$(*=,#[!#6])]!@!=!#[!#0;!#1;!$([CH2]);!$([CH3][CH2])]
            max_heavies: 100
            max_rotatable_bonds: 10
            max_up_enumerations: 1000
            method: chiral
            min_heavies_per_const_frag: 0
            min_heavies_total_const_frag: 0
            num_cuts: 3
            rotatable_smarts: [!$([NH]!@C(=O))&!D1&!$(*#*)]-&!@[!$([NH]!@C(=O))&!D1&!$(*#*)]
            salt_remover: <default>
          Index options:
            max_radius: 5
            max_variable_heavies: 10
            min_radius: 0
            smallest_transformation_only: False
            symmetric: False

'Created' shows the creation time. '#compounds/property' shows how
many compounds have a given property, for each of the available
properties. The '#smiles' line says how many distinct SMILES strings
are used for the rules and the constants tables. 'Fragment options'
and 'Index options' are, I think, self-explanatory.

The count fields (like the number of compounds and rules) are
pre-computed and stored in the database. If the database is updated
incorrectly, it is possible for the cached information to be
invalid. Use '--recount' to have SQLite compute the values directly
from the database contents.

"""
112 | 
113 | 
@command(name="list", epilog=list_epilog)
@click.option(
    "--all",
    "-a",
    "all_option",
    is_flag=True,
    default=False,
    help="List all information about the dataset",
)
@click.option(
    "--recount",
    is_flag=True,
    default=False,
    help="Count the table sizes directly, instead of using cached data",
)
@add_multiple_databases_parameters()
@click.pass_obj
def list_(
    reporter,
    databases_options,
    all_option,
    recount,
):
    """Summarize the contents of zero or more databases

    DATABASES: the mmpdb database files to list (default looks files named '*.mmpdb')
    """
    # NOTE: the docstring above is used as the command's help text, so
    # implementation notes are kept in comments instead.
    #
    # The work is done in two passes. The first pass reads everything
    # needed from each database and tracks the column widths. The second
    # pass prints the header and one row per database, followed by the
    # detailed information if --all was specified.

    import json
    from .. import dbutils

    # The initial widths are the minimum widths used for each column.
    name_width = 5
    num_compounds_width = 6
    num_rules_width = 6
    num_pairs_width = 6
    num_envs_width = 6
    num_stats_width = 6
    title_width = 7

    # One tuple per database, in the order the databases were processed.
    rows = []
    for dbinfo, dataset in dbutils.iter_dbinfo_and_dataset(
        databases_options.databases,
        reporter,
        apsw_warning=False,
    ):
        name = dbinfo.name
        name_width = max(name_width, len(name))

        table_sizes = dataset.get_table_sizes(recount)
        if not table_sizes.all_defined():
            reporter.warning(
                f"Pre-computed table counts not available in {dbinfo.get_human_name()}. "
                "Forcing --recount."
            )
            table_sizes = dataset.get_table_sizes(recount=True)

        num_compounds = table_sizes.num_compounds
        num_compounds_width = max(num_compounds_width, len(str(num_compounds)))

        num_rules = table_sizes.num_rules
        num_rules_width = max(num_rules_width, len(str(num_rules)))

        num_pairs = table_sizes.num_pairs
        num_pairs_width = max(num_pairs_width, len(str(num_pairs)))

        num_envs = table_sizes.num_rule_environments
        num_envs_width = max(num_envs_width, len(str(num_envs)))

        num_stats = table_sizes.num_rule_environment_stats
        num_stats_width = max(num_stats_width, len(str(num_stats)))

        title = dataset.title
        title_width = max(title_width, len(title))

        prop_names = dataset.get_property_names()
        if prop_names:
            property_names = " ".join(prop_names)
        else:
            property_names = "<none>"

        fragment_options = dataset.fragment_options_str
        index_options = dataset.index_options_str

        # Fixed: the --all details must be read from *this* dataset. They
        # were previously read in the output loop from the `dataset`
        # variable left over from this loop, so with several databases
        # every row showed the details of the last database.
        details = None
        if all_option:
            details = (
                dataset.creation_date,
                list(dataset.get_property_names_and_counts()),
                dataset.get_num_rule_smiles(),
                dataset.get_num_constant_smiles(),
            )

        rows.append(
            (
                name,
                num_compounds,
                num_rules,
                num_pairs,
                num_envs,
                num_stats,
                title,
                property_names,
                fragment_options,
                index_options,
                details,
            )
        )

    # Header line; every label is centered in its column.
    fmt = "%-{}s %-{}s %-{}s %-{}s %-{}s %-{}s  %-{}s Properties".format(
        name_width,
        num_compounds_width,
        num_rules_width,
        num_pairs_width,
        num_envs_width,
        num_stats_width,
        title_width,
    )
    fancy_title = " Title ".center(title_width, "-")
    fancy_title = "|" + fancy_title[1:-1] + "|"
    print(
        fmt
        % (
            "Name".center(name_width),
            "#cmpds".center(num_compounds_width),
            "#rules".center(num_rules_width),
            "#pairs".center(num_pairs_width),
            "#envs".center(num_envs_width),
            "#stats".center(num_stats_width),
            fancy_title,
        )
    )

    # Data lines; the name and counts are right-aligned, the title left-aligned.
    fmt = "%{}s %{}d %{}d %{}d %{}d %{}d  %-{}s %s".format(
        name_width,
        num_compounds_width,
        num_rules_width,
        num_pairs_width,
        num_envs_width,
        num_stats_width,
        title_width,
    )

    # Indentation for the --all detail lines
    prefix = " " * num_compounds_width
    for (
        name,
        num_compounds,
        num_rules,
        num_pairs,
        num_envs,
        num_stats,
        title,
        property_names,
        fragment_options,
        index_options,
        details,
    ) in rows:
        print(
            fmt
            % (
                name,
                num_compounds,
                num_rules,
                num_pairs,
                num_envs,
                num_stats,
                title,
                property_names,
            )
        )
        if details is None:
            # --all was not specified
            continue

        creation_date, property_counts, num_rule_smiles, num_constant_smiles = details

        creation_date_str = creation_date.isoformat(" ")
        print(prefix + "Created:", creation_date_str)

        if property_counts:
            # The leading space gives two spaces after the label,
            # because print() adds its own separator.
            s = " " + " ".join(
                "%s/%s" % (count, property_name)
                for property_name, count in property_counts
            )
        else:
            s = "(no properties)"
        print(prefix + "  #compounds/property:", s)

        print(
            prefix
            + "  #smiles for rules: %d  for constants: %d"
            % (num_rule_smiles, num_constant_smiles)
        )

        # The options are stored as JSON-encoded dictionaries
        options = json.loads(fragment_options)
        print(prefix + "  Fragment options:")
        for k, v in sorted(options.items()):
            print(prefix + "    %s: %s" % (k, v))

        options = json.loads(index_options)
        print(prefix + "  Index options:")
        for k, v in sorted(options.items()):
            print(prefix + "    %s: %s" % (k, v))
315 | 


--------------------------------------------------------------------------------
/mmpdblib/cli/loadprops.py:
--------------------------------------------------------------------------------
  1 | "Implement the 'loadprops' command"
  2 | 
  3 | # mmpdb - matched molecular pair database generation and analysis
  4 | #
  5 | # Copyright (c) 2015-2017, F. Hoffmann-La Roche Ltd.
  6 | # Copyright (c) 2019-2021, Andrew Dalke Scientific, AB
  7 | #
  8 | # Redistribution and use in source and binary forms, with or without
  9 | # modification, are permitted provided that the following conditions are
 10 | # met:
 11 | #
 12 | #    * Redistributions of source code must retain the above copyright
 13 | #      notice, this list of conditions and the following disclaimer.
 14 | #    * Redistributions in binary form must reproduce the above
 15 | #      copyright notice, this list of conditions and the following
 16 | #      disclaimer in the documentation and/or other materials provided
 17 | #      with the distribution.
 18 | #    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
 19 | #      its contributors may be used to endorse or promote products
 20 | #      derived from this software without specific prior written
 21 | #      permission.
 22 | #
 23 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 24 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 25 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 26 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 27 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 28 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 29 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 30 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 31 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 32 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 33 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 34 | #
 35 | 
 36 | import sys
 37 | import click
 38 | 
 39 | from .click_utils import (
 40 |     command,
 41 |     die,
 42 |     open_database_from_options_or_exit,
 43 |     add_single_database_parameters,
 44 | )
 45 | 
 46 | 
# Long help text for the 'loadprops' command. It is passed as the
# ``epilog`` argument of the ``command`` decorator applied to loadprops().
# NOTE(review): the "\b" lines look like click's marker for a paragraph
# which must not be re-wrapped -- confirm against click_utils.command.
loadprops_epilog = """
Load structure property values from a CSV into a data set.

The property file contains a data table. Here is an example:

\b
  ID MP CHR1 CHR2
  GEJYOJ 3 71 31.3
  ACIDUL 5 65 67.2
  KIXRIS 5 * *
  SOFWIV01 5 83 79.3

The fields should be tab separated. For full details use
"mmpdb help-property-format".

If the given property value already exists in the database then the
existing database value will be updated. Otherwise loadprops will
create a new property record. If an identifier isn't in the database
then its values will be ignored.

After importing the data, the corresponding aggregate values for the rules
will be recalculated.

Example:

\b
  % mmpdb loadprops --properties MP.csv mmpdb.db
  
"""
 76 | 
 77 | 
 78 | @command(epilog=loadprops_epilog)
 79 | @click.option(
 80 |     "--properties",
 81 |     "-p",
 82 |     "properties_filename",
 83 |     metavar="FILENAME",
 84 |     help="File containing the identifiers to use and optional physical properties",
 85 | )
 86 | @add_single_database_parameters()
 87 | @click.pass_obj
 88 | def loadprops(
 89 |     reporter,
 90 |     properties_filename,
 91 |     database_options,
 92 | ):
 93 |     """Load properties for existing structures
 94 | 
 95 |     DATABASE: the mmpdb database to update with the new properties
 96 |     """
 97 |     from .. import properties_io, dbutils, schema
 98 | 
 99 |     db = open_database_from_options_or_exit(database_options)
100 |     c = db.get_cursor()
101 |     dataset = db.get_dataset()
102 |     reporter.report(f"Using dataset: {dataset.title}")
103 | 
104 |     if properties_filename is None:
105 |         reporter.report("Reading properties from stdin")
106 |         properties_file = sys.stdin
107 |         close = None
108 |         source = "<stdin>"
109 |     else:
110 |         reporter.report(f"Reading properties from {properties_filename!r}")
111 |         try:
112 |             properties_file = open(properties_filename)
113 |         except IOError as err:
114 |             die(f"Cannot open properties file: {err}")
115 |         close = properties_file.close
116 |         source = properties_filename
117 | 
118 |     try:
119 |         try:
120 |             with properties_file:
121 |                 properties = properties_io.load_properties(properties_file, reporter)
122 |         except ValueError as err:
123 |             die(f"Problem reading --properties file {properties_filename}: {err}")
124 |     finally:
125 |         if close is not None:
126 |             close()
127 | 
128 |     reporter.report(
129 |         f"Read {len(properties.property_columns)} properties for "
130 |         f"{len(properties.id_column)} compounds "
131 |         f"from {source!r}"
132 |     )
133 |     public_id_to_id = dataset.get_public_id_to_id_table(c)
134 | 
135 |     compound_ids = [public_id_to_id.get(public_id, None) for public_id in properties.id_column]
136 |     num_missing = compound_ids.count(None)
137 |     if num_missing:
138 |         reporter.report(
139 |             f"{num_missing} compounds from {source!r} are not in the dataset at {database_options.database!r}"
140 |         )
141 |         ## missing = [public_id for public_id in properties.id_column if public_id not in public_id_to_id]
142 |         ## del missing[6:]
143 |         ## if len(missing) > 5:
144 |         ##     reporter.warning("First five missing records: %r" % (missing[:5],))
145 |         ## else:
146 |         ##     reporter.warning("Missing records: %r" % (missing,))
147 | 
148 |     UPDATE_COMPOUND_PROPERTY_SQL = db.SQL(
149 |         "UPDATE compound_property "
150 |         "      SET value = ?"
151 |         "       WHERE compound_id = ?"
152 |         "         AND property_name_id = ?"
153 |     )
154 |     INSERT_COMPOUND_PROPERTY_SQL = db.SQL(
155 |         "INSERT INTO compound_property (compound_id, property_name_id, value) " " VALUES (?, ?, ?)"
156 |     )
157 | 
158 |     with db.atomic():
159 |         # Remember which compound properties exist, so I can tell if a
160 |         # value should replace an existing value or create a new value.
161 |         c.execute(db.SQL("SELECT compound_id, property_name_id from compound_property"))
162 |         seen_properties = dict((key, False) for key in c)
163 | 
164 |         compound_values_for_property_name_id = {}
165 |         property_name_ids = []
166 | 
167 |         for property_name, property_values in properties.iter_properties():
168 |             property_name_id = dataset.get_or_add_property_name(property_name)
169 |             property_name_ids.append(property_name_id)
170 |             # reporter.report("Loading property %r (id %d)" % (property_name.name, property_name.id))
171 | 
172 |             num_created = num_updated = 0
173 |             compound_values_for_property_name_id[property_name_id] = compound_values = {}
174 | 
175 |             for compound_id, value in zip(compound_ids, property_values):
176 |                 if compound_id is None:
177 |                     # Property specified but not in database
178 |                     continue
179 |                 if value is None:
180 |                     # Property value is "*", meaning it isn't available
181 |                     continue
182 |                 key = (compound_id, property_name_id)
183 |                 if key in seen_properties:
184 |                     seen_properties[key] = True
185 |                     num_updated += 1
186 |                     c.execute(
187 |                         UPDATE_COMPOUND_PROPERTY_SQL,
188 |                         (value, compound_id, property_name_id),
189 |                     )
190 |                 else:
191 |                     num_created += 1
192 |                     c.execute(
193 |                         INSERT_COMPOUND_PROPERTY_SQL,
194 |                         (compound_id, property_name_id, value),
195 |                     )
196 |                 compound_values[compound_id] = value
197 |             reporter.report(
198 |                 f"Imported {num_updated + num_created} {property_name!r} records "
199 |                 f"({num_created} new, {num_updated} updated)."
200 |             )
201 | 
202 |         # Remove existing compound properties where the property name was in the
203 |         # properties file but the where the file did not specify a value.
204 |         properties_to_delete = [
205 |             key for key, was_updated in seen_properties.items() if not was_updated and key[1] in property_name_ids
206 |         ]
207 |         if properties_to_delete:
208 |             dataset.delete_compound_properties(properties_to_delete)
209 | 
210 |         dbutils.reaggregate_properties(
211 |             dataset,
212 |             property_name_ids,
213 |             compound_values_for_property_name_id,
214 |             cursor=c,
215 |             reporter=reporter,
216 |         )
217 | 
218 |         # Check if any of the properties are completely gone
219 |         if properties_to_delete:
220 |             for property_name_id in property_name_ids:
221 |                 n = dataset.get_num_compound_properties(property_name_id, cursor=c)
222 |                 if n == 0:
223 |                     dataset.delete_property_name_id(property_name_id, cursor=c)
224 | 
225 |         # Update the environment statistics
226 |         reporter.update("Updating environment statistics count ...")
227 |         c.execute("SELECT count(*) from rule_environment_statistics")
228 |         num = schema._get_one(c)
229 |         c.execute(db.SQL("UPDATE dataset SET num_rule_environment_stats=?"), (num,))
230 | 
231 |         reporter.update("Commiting changed ...")
232 | 
233 |     reporter.report("Loaded all properties and re-computed all rule statistics.")
234 | 


--------------------------------------------------------------------------------
/mmpdblib/cli/propcat.py:
--------------------------------------------------------------------------------
  1 | "Implement the 'propcat' command"
  2 | 
  3 | # mmpdb - matched molecular pair database generation and analysis
  4 | #
  5 | # Copyright (c) 2015-2017, F. Hoffmann-La Roche Ltd.
  6 | # Copyright (c) 2021, Andrew Dalke Scientific, AB
  7 | #
  8 | # Redistribution and use in source and binary forms, with or without
  9 | # modification, are permitted provided that the following conditions are
 10 | # met:
 11 | #
 12 | #    * Redistributions of source code must retain the above copyright
 13 | #      notice, this list of conditions and the following disclaimer.
 14 | #    * Redistributions in binary form must reproduce the above
 15 | #      copyright notice, this list of conditions and the following
 16 | #      disclaimer in the documentation and/or other materials provided
 17 | #      with the distribution.
 18 | #    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
 19 | #      its contributors may be used to endorse or promote products
 20 | #      derived from this software without specific prior written
 21 | #      permission.
 22 | #
 23 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 24 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 25 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 26 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 27 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 28 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 29 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 30 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 31 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 32 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 33 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 34 | #
 35 | 
 36 | import click
 37 | 
 38 | from .click_utils import (
 39 |     command,
 40 |     die,
 41 |     add_single_database_parameters,
 42 |     add_multiple_properties,
 43 |     open_database_from_options_or_exit,
 44 |     get_property_names_or_error,
 45 | )
 46 | 
 47 | propcat_epilog = """
 48 | 
 49 | Write information about the properties for the compounds in DATABASE,
 50 | formatted as a property file. Use `mmpdb help-property-file` for
 51 | details about the property file format.
 52 | 
 53 | The output from this command is a tab-delimited CSV file where the
 54 | first column has the head "ID" and contains the compound identifier.
 55 | The other columns contain property information for each compound. The
 56 | column title is the property name.
 57 | 
 58 | By default there is one column for each property in the databases, and
 59 | the one row for each compound with at least one property. Use
 60 | '--property' to limit the output to a specific property, or use it
 61 | multiple times to specify multiple property names to output. Use
 62 | '--all' to list all of the compounds, even if the compound has none of
 63 | the specified properties.
 64 | 
 65 | The character "*" will be use if a listed compound is missing a given
 66 | property.
 67 | 
 68 | Examples:
 69 | 
 70 | 1) Write all of the properties to stdout:
 71 | 
 72 | \b
 73 |   % mmpdb propcat CHEMBL_thrombin_Ki_IC50.mmpdb
 74 | 
 75 | 2) Write the "MP" property to "MP.properties":
 76 | 
 77 | \b
 78 |   % mmpdb propcat csd.mmpdb --property MP -o MP.properties
 79 | 
 80 | 3) Write the compound identifiers only to stdout:
 81 | 
 82 | \b
 83 |   % mmpdb propcat csd.mmpdb --no-properties --all
 84 | 
 85 | """
 86 | 
 87 | 
 88 | @command(epilog=propcat_epilog)
 89 | @add_single_database_parameters()
 90 | @add_multiple_properties
 91 | @click.option(
 92 |     "--all",
 93 |     "show_all",
 94 |     is_flag=True,
 95 |     default=False,
 96 |     help="Include compounds which have no properties",
 97 | )
 98 | @click.option(
 99 |     "--output",
100 |     "-o",
101 |     "output_filename",
102 |     metavar="FILENAME",
103 |     help="Output filename (default is stdout)",
104 | )
105 | @click.pass_obj
106 | def propcat(
107 |     reporter,
108 |     database_options,
109 |     show_all,
110 |     property_names,
111 |     no_properties,
112 |     output_filename,
113 | ):
114 |     """Write the database properties to a properties file
115 | 
116 |     DATABASE: an mmpdb file
117 |     """
118 |     from .. import fileio
119 | 
120 |     db = open_database_from_options_or_exit(database_options)
121 |     c = db.get_cursor()
122 |     dataset = db.get_dataset()
123 | 
124 |     property_names = get_property_names_or_error(
125 |         dataset,
126 |         property_names=property_names,
127 |         no_properties=no_properties,
128 |     )
129 | 
130 |     property_values_list = []
131 |     for property_name in property_names:
132 |         property_name_id = dataset.get_property_name_id(property_name)
133 |         property_values_list.append(dataset.get_property_values(property_name_id))
134 | 
135 |     try:
136 |         outfile = fileio.open_output(output_filename, output_filename)
137 |     except IOError as err:
138 |         die(f"Cannot open --output: {err}")
139 | 
140 |     with outfile:
141 |         print("ID", *property_names, sep="\t", file=outfile)
142 |         for compound_row in dataset.iter_compounds(cursor=c):
143 |             columns = [compound_row.public_id]
144 |             is_empty = True
145 |             for property_values in property_values_list:
146 |                 value = property_values.get(compound_row.id, None)
147 |                 if value is None:
148 |                     columns.append("*")
149 |                 else:
150 |                     columns.append(value)
151 |                     is_empty = False
152 |             if show_all or not is_empty:
153 |                 print(*columns, sep="\t", file=outfile)
154 | 


--------------------------------------------------------------------------------
/mmpdblib/cli/rgroup2smarts.py:
--------------------------------------------------------------------------------
  1 | "Implement the 'rgroup2smarts' command"
  2 | 
  3 | # mmpdb - matched molecular pair database generation and analysis
  4 | #
  5 | # Copyright (c) 2019, Andrew Dalke Scientific, AB
  6 | #
  7 | # Redistribution and use in source and binary forms, with or without
  8 | # modification, are permitted provided that the following conditions are
  9 | # met:
 10 | #
 11 | #    * Redistributions of source code must retain the above copyright
 12 | #      notice, this list of conditions and the following disclaimer.
 13 | #    * Redistributions in binary form must reproduce the above
 14 | #      copyright notice, this list of conditions and the following
 15 | #      disclaimer in the documentation and/or other materials provided
 16 | #      with the distribution.
 17 | #    * Neither the name of Andrew Dalke Scientific. nor the names of
 18 | #      its contributors may be used to endorse or promote products
 19 | #      derived from this software without specific prior written
 20 | #      permission.
 21 | #
 22 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 23 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 24 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 25 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 26 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 27 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 28 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 29 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 30 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 31 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 32 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 33 | #
 34 | 
 35 | # Convert a R-group SMILES into a SMARTS pattern.
 36 | # Must be rooted at "*", with one single bond.
 37 | 
 38 | # The algorithm is:
 39 | #  - figure out the valence and hydrogen count for each atom
 40 | #  - convert the molecule into an isomeric SMILES, with
 41 | #      hydrogens on the atom terms
 42 | #  - convert the SMILES into a SMARTS:
 43 | #     - use a regexp to find the atom terms
 44 | #     - insert with the valence
 45 | 
# For example:
#   *-Cl -> *-!@[Clv1]
#   *-O  -> *-!@[OH1v2]
 49 | 
 50 | import sys
 51 | 
 52 | import click
 53 | from .click_utils import command, die
 54 | 
# Long help text for the 'rgroup2smarts' command. It is passed as the
# ``epilog`` argument of the ``command`` decorator applied to rgroup2smarts().
rgroup2smarts_epilog = """

This command is primarily meant for users to see how the `mmpdb
fragment` parameters `--cut-rgroup` and `--cut-rgroup-file` work.

A fragment file contains one fragment SMILES per line. Each fragment
SMILES must contain one and only one wildcard atom ("*"), which marks
the attachment point.

Blank lines and leading whitespace are not supported. The SMILES ends
at the first whitespace. Additional text on a line is ignored.

Each fragment is turned into a SMARTS pattern which matches that
fragment. By default the SMARTS patterns are converted into a
recursive SMARTS with all of the fragments. Use `--single` to output
the non-recursive SMARTS for each input SMILES.

Use `--check` to verify that the final SMARTS matches the input
fragments. Use `--cut-rgroup` to specify the SMILES fragments on the
command-line instead of from a file.
"""
 76 | 
 77 | 
 78 | @command(epilog=rgroup2smarts_epilog)
 79 | @click.option(
 80 |     "--cut-rgroup",
 81 |     metavar="SMILES",
 82 |     multiple=True,
 83 |     help="R-group SMILES to use",
 84 | )
 85 | @click.option(
 86 |     "--single",
 87 |     "-s",
 88 |     default=False,
 89 |     is_flag=True,
 90 |     help="Generate a SMARTS for each R-group SMILES (default: generate a single recursive SMARTS)",
 91 | )
 92 | @click.option(
 93 |     "--check",
 94 |     "-c",
 95 |     default=False,
 96 |     is_flag=True,
 97 |     help="Check that the SMARTS strings are valid (default: assume they are valid)",
 98 | )
 99 | @click.option(
100 |     "--explain",
101 |     is_flag=True,
102 |     default=False,
103 |     help="Write conversion and check details to stderr",
104 | )
105 | @click.argument(
106 |     "rgroup_filename",
107 |     metavar="FILENAME",
108 |     required=False,
109 | )
110 | @click.pass_obj
111 | def rgroup2smarts(
112 |     reporter,
113 |     check,
114 |     explain,
115 |     cut_rgroup,
116 |     rgroup_filename,
117 |     single,
118 | ):
119 |     """Convert an R-group file into a SMARTS which matches all of the SMILES
120 | 
121 |     FILENAME: file containing one or more R-group SMILES (use stdin if not specified)
122 |     """
123 |     from rdkit import Chem
124 |     from .. import rgroup2smarts as _rgroup2smarts
125 | 
126 |     reporter.set_explain(explain)
127 |     explain = reporter.explain
128 | 
129 |     close = None
130 | 
131 |     if cut_rgroup:
132 |         if rgroup_filename is not None:
133 |             die("Cannot specify both an R-group filename and a --cut-rgroup")
134 |         location = _rgroup2smarts.ListLocation("--cut-rgroup SMILES")
135 |         location.save(recno=1)
136 |         explain("Using --cut-rgroup SMILES from the command-line")
137 |         record_reader = _rgroup2smarts.iter_smiles_list(cut_rgroup, location)
138 | 
139 |     elif rgroup_filename is not None:
140 |         explain(f"Reading R-group SMILES from {rgroup_filename!r}")
141 |         location = _rgroup2smarts.FileLocation(rgroup_filename)
142 |         try:
143 |             f = open(rgroup_filename)
144 |         except OSError as err:
145 |             die(f"Cannot open input file: {err}")
146 |         close = f.close
147 |         record_reader = _rgroup2smarts.parse_rgroup_file(f, location)
148 | 
149 |     else:
150 |         explain("Reading R-group SMILES from <stdin>")
151 |         location = _rgroup2smarts.FileLocation("<stdin>")
152 |         record_reader = _rgroup2smarts.parse_rgroup_file(sys.stdin, location)
153 | 
154 |     if check:
155 |         all_mols = []
156 |     else:
157 |         all_mols = None
158 | 
159 |     outfile = sys.stdout
160 | 
161 |     iter_smarts = _rgroup2smarts.iter_smiles_as_smarts(record_reader, location, explain, all_mols)
162 | 
163 |     all_smarts = None
164 | 
165 |     try:
166 |         if single:
167 |             for smarts in iter_smarts:
168 |                 outfile.write(smarts + "\n")
169 |         else:
170 |             all_smarts = []
171 |             for smarts in iter_smarts:
172 |                 assert smarts.startswith("*-!@"), (smarts, location)
173 |                 all_smarts.append(smarts)
174 |             if not all_smarts:
175 |                 die(f"Cannot make a SMARTS: no SMILES strings found in {location.filename!r}")
176 | 
177 |     except _rgroup2smarts.ParseError as err:
178 |         die(f"Cannot parse input file: {err}")
179 |     except _rgroup2smarts.ConversionError as err:
180 |         die(str(err))
181 |     finally:
182 |         if close is not None:
183 |             close()
184 | 
185 |     if not single:
186 |         smarts = _rgroup2smarts.make_recursive_smarts(all_smarts)
187 | 
188 |         try:
189 |             if check:
190 |                 explain("Checking that the SMARTS matches all of the input molecules")
191 |                 all_pat = Chem.MolFromSmarts(smarts)
192 |                 if all_pat is None:
193 |                     die(f"Cannot process final SMARTS: {smarts!r}")
194 | 
195 |                 for i, (mol, where, smiles) in enumerate(all_mols):
196 |                     if not mol.HasSubstructMatch(all_pat):
197 |                         die(f"final SMARTS does not match SMILES from {where} ({smiles!r})")
198 |                     explain(f"checked #{i}")
199 |         finally:
200 |             outfile.write(smarts + "\n")
201 | 
202 |     outfile.flush()
203 | 


--------------------------------------------------------------------------------
/mmpdblib/cli/rulecat.py:
--------------------------------------------------------------------------------
 1 | import click
 2 | 
 3 | from .click_utils import (
 4 |     command,
 5 |     GzipFile,
 6 |     add_single_database_parameters,
 7 |     open_dataset_from_options_or_exit,
 8 |     )
 9 | 
@command()
@click.option(
    "--output",
    "-o",
    "outfile",
    default="-",
    type=GzipFile("w"),
    help="Write the rules to the named file (default is stdout)",
)
@add_single_database_parameters()
@click.pass_obj
def rulecat(
    reporter,
    database_options,
    outfile,
):
    "Show the rules in an mmpdb file"
    # Writes a tab-separated table with a header line followed by one line
    # (id, from_smiles, to_smiles) for each rule in the dataset.
    #
    # reporter: the object passed in by click.pass_obj (not used here)
    # database_options: identifies the database to open
    # outfile: the already-opened output, from the --output option

    dataset = open_dataset_from_options_or_exit(database_options, quiet=True)
    rule_c = dataset.get_cursor()

    # Close the output even if writing fails part-way through.
    try:
        outfile.write("id\tfrom_smiles\tto_smiles\n")
        for rule in dataset.iter_rules(rule_c):
            outfile.write(f"{rule.id}\t{rule.from_smiles}\t{rule.to_smiles}\n")
    finally:
        outfile.close()
37 | 


--------------------------------------------------------------------------------
/mmpdblib/cli/ruleenvcat.py:
--------------------------------------------------------------------------------
 1 | import click
 2 | 
 3 | from .click_utils import (
 4 |     command,
 5 |     GzipFile,
 6 |     add_single_database_parameters,
 7 |     open_dataset_from_options_or_exit,
 8 |     )
 9 | 
@command()
@click.option(
    "--pairs / --no-pairs",
    default=False,
    help="With --pairs, include pairs in the output",
)
@click.option(
    "--output",
    "-o",
    "outfile",
    default="-",
    type=GzipFile("w"),
    help="Write the rules to the named file (default is stdout)",
)
@add_single_database_parameters()
@click.pass_obj
def ruleenvcat(
    reporter,
    database_options,
    pairs,
    outfile,
):
    "Show the rule environments in an mmpdb file"
    # Writes a tab-separated table with a header line followed by one line
    # for each rule environment in the dataset. With 'pairs', each rule
    # environment line is followed by one line per pair; those lines start
    # with a tab and give compound1_id, compound2_id and constant_id.
    #
    # reporter: the object passed in by click.pass_obj (not used here)
    # database_options: identifies the database to open
    # pairs: true to include the pairs of each rule environment
    # outfile: the already-opened output, from the --output option

    dataset = open_dataset_from_options_or_exit(database_options, quiet=True)
    rule_env_c = dataset.get_cursor()
    # The pairs are read with their own cursor while rule_env_c is still
    # being iterated.
    pair_c = dataset.get_cursor()

    # Close the output when done, as 'rulecat' does, so that everything
    # written reaches the file even if writing fails part-way through.
    try:
        outfile.write("from_smiles\tto_smiles\tradius\tSMARTS\tpseudoSMILES\n")
        for rule_env in dataset.iter_rule_environments(rule_env_c):
            outfile.write(
                f"{rule_env.from_smiles}\t{rule_env.to_smiles}\t{rule_env.radius}\t"
                f"{rule_env.smarts}\t{rule_env.pseudosmiles}\n"
            )
            if pairs:
                for pair in rule_env.iter_pairs(pair_c):
                    outfile.write(f"\t{pair.compound1_id}\t{pair.compound2_id}\t{pair.constant_id}\n")
    finally:
        outfile.close()
48 |             
49 |             
50 |             
51 | 


--------------------------------------------------------------------------------
/mmpdblib/cli/smi_split.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import click
  3 | 
  4 | from .click_utils import (
  5 |     command,
  6 |     template_type,
  7 |     positive_int,
  8 |     )
  9 | 
 10 | from .smi_utils import add_input_options
 11 | 
 12 | smi_split_epilog = """
 13 | 
 14 | Split a given SMILES file into several smaller SMILES files using one
 15 | of two available schemes. These new files can be used in a distributed
 16 | computing environment to fragment a dataset in parallel.
 17 | 
 18 | See `mmpdb help-smiles-format` for a description of how the
 19 | `--delimiter` and `--has-header` options affect parsing a SMILES
 20 | file. The output SMILES file has no header and uses tab-separated
 21 | columns.
 22 | 
 23 | The two split schemes read the input SMILES file and split the lines
 24 | (excluding any header) into the output files. The `--num-files` option
 25 | creates up to N output files in total, with roughly the same number of
 26 | records in each file. The `--num-records` option writes up to N
 27 | records to an output file, then switches to a new output file and
 28 | starts over.
 29 | 
 30 | If no scheme is specified then the default is the same as `--num-files
 31 | 10`, which creates up to 10 output files.
 32 | 
 33 | The output filenames are based on a template pattern, which can be
 34 | changed with `--template`. The template may contain fields formatted
 35 | using Python's [Format String
 36 | Syntax](https://docs.python.org/3/library/string.html#formatstrings).
 37 | 
 38 | The available fields are:
 39 | 
 40 | \b
 41 |   {prefix} - the input SMILES filename without the final extension(s)
 42 |   {parent} - the parent directory of the input SMILES filename, or '.'
 43 |   {stem} - the SMILES filename without the directory or final extension
 44 |   {sep} - the filesystem path seperator (eg, '/')
 45 |   {i} - an integer value 0 <= i < n
 46 | 
 47 | The value of `prefix`, `parent` and `stem` are based on the input
 48 | SMILES filename. For example, if the filename is '/abc/xyz.smi', on
 49 | a macOS system, then the field values are:
 50 | 
 51 | \b
 52 |   {prefix} = '/abc/xyz'
 53 |   {parent} = '/abc'
 54 |   {stem} = 'xyz'
 55 |   {sep} = '/'
 56 | 
 57 | If the filename ends with '.gz' then that is also removed.
 58 | 
 59 | The default template is "{prefix}.{i:04}.smi". If the input filename
 60 | is `ChEMBL_CYP3A4_hERG.smi.gz` and there are 4 output files then the
 61 | output filenames are:
 62 | 
 63 | \b
 64 | ```
 65 | ChEMBL_CYP3A4_hERG.0000.smi
 66 | ChEMBL_CYP3A4_hERG.0001.smi
 67 | ChEMBL_CYP3A4_hERG.0002.smi
 68 | ChEMBL_CYP3A4_hERG.0003.smi
 69 | ```
 70 | 
 71 | If the output filename ends with ".gz", as with the template
 72 | "{prefix}.{i:04}.smi.gz", then the output files will be
 73 | gzip-compressed.
 74 | 
 75 | """
 76 | 
 77 | @command(
 78 |     name="smi_split",
 79 |     epilog=smi_split_epilog)
 80 | @add_input_options
 81 | 
 82 | @click.option(
 83 |     "--num-files",
 84 |     "-n",
 85 |     default = None,
 86 |     type = positive_int(),
 87 |     help = "Number of output SMILES files to generate",
 88 |     )
 89 | 
 90 | @click.option(
 91 |     "--num-records",
 92 |     default = None,
 93 |     type = positive_int(),
 94 |     help = "Maximum number of SMILES to save to each file",
 95 |     )
 96 | 
 97 | @click.option(
 98 |     "--template",
 99 |     "-t",
100 |     default = "{prefix}.{i:04}.smi",
101 |     type = template_type(),
102 |     show_default = True,
103 |     help = "Template for the output filenames",
104 |     )
105 | 
106 | @click.argument(
107 |     "smiles_filename",
108 |     required = False,
109 |     default = None,
110 |     metavar = "FILE",
111 |     )
112 | @click.pass_obj
113 | def smi_split(
114 |         reporter,
115 |         input_options,
116 |         num_files,
117 |         num_records,
118 |         template,
119 |         smiles_filename,
120 |         ):
121 |     """Split the SMILES file 'FILE' into smaller files"""
122 |     import pathlib
123 |     from .. import fileio
124 | 
125 |     n = (num_files is not None) + (num_records is not None)
126 |     if n == 2:
127 |         # mutual exclusion is click is so tedious
128 |         click.fail("Cannot specify both --num-files and --num-records")
129 |     elif n == 0:
130 |         num_files = 10
131 | 
132 | 
133 |     # Get values used for template generation
134 |     s = "stdin.smi" if smiles_filename is None else smiles_filename
135 |     # Trim '.gz' from the template
136 |     if s.lower().endswith(".gz"):
137 |         s = s[:-3]
138 |     smi_path = pathlib.Path(s)
139 |     smi_filename = str(smi_path)
140 |     smi_parent = smi_path.parent
141 |     smi_stem = smi_path.stem
142 |     smi_prefix = str(smi_path.parent / smi_path.stem)
143 | 
144 |     # Read the SMILES
145 |     try:
146 |         with input_options.read_smiles_file(smiles_filename) as reader:
147 |             entries = list(reader)
148 | 
149 |     except fileio.FileFormatError as err:
150 |         die(f"Cannot parse input file: {err}")
151 | 
152 |     if not entries:
153 |         reporter.report("No SMILES records found. Exiting.")
154 |         return
155 | 
156 |     # Figure out how to partition
157 |     if num_files is None:
158 |         # use the number of records to get the number of files
159 |         num_files = (len(entries) + num_records-1) // num_records
160 |     else:
161 |         # Can't have more files than entries
162 |         num_files = min(num_files, len(entries))
163 | 
164 |     num_records_per_file = len(entries) // num_files
165 | 
166 |     i = -1
167 |     for i in range(num_files):
168 |         start = i * num_records_per_file
169 |         end = start + num_records_per_file
170 |         
171 |         subset = entries[start:end]
172 | 
173 | 
174 |         output_filename = template.format(
175 |             parent = smi_parent,
176 |             stem = smi_stem,
177 |             prefix = smi_prefix,
178 |             sep = os.sep,
179 |             i = i,
180 |             )
181 | 
182 |         try:
183 |             writer = fileio.open_output(output_filename, format_hint=None)
184 |         except IOError as err:
185 |             die(f"Cannot open output SMILES file: {err}")
186 | 
187 |         with writer:
188 |             writer.writelines(f"{smiles}\t{id}\n" for (smiles, id) in subset)
189 |     
190 |     i += 1
191 | 
192 |     reporter.report(f"Created {num_files} SMILES files containing {len(entries)} SMILES records.")
193 |     
194 | 


--------------------------------------------------------------------------------
/mmpdblib/cli/smi_utils.py:
--------------------------------------------------------------------------------
  1 | "Utilities for working with SMILES file input reading"
  2 | 
  3 | # mmpdb - matched molecular pair database generation and analysis
  4 | #
  5 | # Copyright (c) 2015-2017, F. Hoffmann-La Roche Ltd.
  6 | # Copyright (c) 2019, Andrew Dalke Scientific, AB
  7 | #
  8 | # Redistribution and use in source and binary forms, with or without
  9 | # modification, are permitted provided that the following conditions are
 10 | # met:
 11 | #
 12 | #    * Redistributions of source code must retain the above copyright
 13 | #      notice, this list of conditions and the following disclaimer.
 14 | #    * Redistributions in binary form must reproduce the above
 15 | #      copyright notice, this list of conditions and the following
 16 | #      disclaimer in the documentation and/or other materials provided
 17 | #      with the distribution.
 18 | #    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
 19 | #      its contributors may be used to endorse or promote products
 20 | #      derived from this software without specific prior written
 21 | #      permission.
 22 | #
 23 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 24 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 25 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 26 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 27 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 28 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 29 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 30 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 31 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 32 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 33 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 34 | #
 35 | 
 36 | import click
 37 | 
 38 | from .click_utils import set_click_attrs
 39 | 
 40 | 
 41 | class SmiInputOptions:
 42 |     def __init__(self, format, delimiter, has_header):
 43 |         self.format = format
 44 |         self.delimiter = delimiter
 45 |         self.has_header = has_header
 46 |         
 47 |     def read_smiles_file(self, filename):
 48 |         from .. import fileio
 49 |         return fileio.read_smiles_file(
 50 |             filename,
 51 |             self.format,
 52 |             self.delimiter,
 53 |             self.has_header,
 54 |             )
 55 |         
 56 | 
 57 | def add_input_options(command):
 58 |     def add_option(*args, **kwargs):
 59 |         click.option(*args, **kwargs)(command)
 60 | 
 61 |     add_option(
 62 |         "--in",
 63 |         "-i",
 64 |         "in_format",
 65 |         type=click.Choice(["smi", "smi.gz"]),
 66 |         help=(
 67 |             "Input structuture format (one of 'smi', 'smi.gz'). "
 68 |             "If not specified, use the filename extension or default to 'smi'."
 69 |         ),
 70 |     )
 71 | 
 72 |     add_option(
 73 |         "--delimiter",
 74 |         default="whitespace",
 75 |         type=click.Choice(["whitespace", "to-eol", "comma", "tab", "space", "native"]),
 76 |         help=(
 77 |             "SMILES file delimiter style (one of 'whitespace' (default), 'to-eol', "
 78 |             "'comma', 'tab', 'space', or 'native')"
 79 |         ),
 80 |     )
 81 | 
 82 |     add_option(
 83 |         "--has-header",
 84 |         is_flag=True,
 85 |         default=False,
 86 |         help="Skip the first line, which is the header line",
 87 |     )
 88 | 
 89 |     # Wrap the command to convert the fragment option parameters
 90 |     # into a single object
 91 | 
 92 |     def make_input_options_wrapper(**kwargs):
 93 |         kwargs["input_options"] = SmiInputOptions(
 94 |             format=kwargs.pop("in_format"),
 95 |             delimiter=kwargs.pop("delimiter"),
 96 |             has_header=kwargs.pop("has_header"),
 97 |         )
 98 |         return command(**kwargs)
 99 | 
100 |     set_click_attrs(make_input_options_wrapper, command)
101 | 
102 |     return make_input_options_wrapper
103 | 


--------------------------------------------------------------------------------
/mmpdblib/cli/smicat.py:
--------------------------------------------------------------------------------
 1 | import click
 2 | 
 3 | from .click_utils import (
 4 |     command,
 5 |     add_single_database_parameters,
 6 |     die,
 7 |     GzipFile,
 8 |     open_fragdb_from_options_or_exit,
 9 |     open_dataset_from_options_or_exit,
10 |     )
11 | 
# Epilog text for the "smicat" command; it is passed as the `epilog`
# argument of the @command() decorator on smicat().
smicat_epilog = """

Each compound has two associated SMILES, the input SMILES used as
input to fragmentation, and the canonical SMILES string from RDKit
after input processing (typically desalting and structure
normalization). By default the output uses the processed SMILES. Use
`--input-smiles` to use the input SMILES string.

By default the output SMILES file is written to stdout. Use `--output`
to save the output to the named file.

Examples:

1) Write the cleaned-up structures as a SMILES file to stdout:

\b
  % mmpdb smicat csd.mmpdb

2) Save the structures to the file "original.smi", and use the input
SMILES instead of the de-salted SMILES:

\b
  % mmpdb smicat csd.mmpdb -o original.smi --input-smiles

"""
37 | 
@command(
    name="smicat",
    epilog=smicat_epilog,
)
@click.option(
    "--input-smiles",
    is_flag=True,
    default=False,
    help="Use the input SMILES instead of the cleaned-up SMILES",
)
@click.option(
    "--output",
    "-o",
    "output_file",
    default="-",
    type=GzipFile("w"),
    help="Output filename (default is stdout)",
)
@add_single_database_parameters()
def smicat(input_smiles, output_file, database_options):
    """Write the mmpdb SMILES as a SMILES file"""
    # The docstring above doubles as the command's help text, so the
    # implementation notes live in comments instead.
    #
    # Two kinds of database are handled: a ".fragdb" file is queried with
    # SQL, anything else is opened as a dataset and its compounds iterated.
    # Either way `rows` yields (identifier, SMILES) pairs.

    if database_options.database.endswith(".fragdb"):
        db = open_fragdb_from_options_or_exit(database_options)
        cursor = db.cursor()
        if input_smiles:
            # Input SMILES are also collected from the error_record table.
            sql = "SELECT title, input_smiles FROM record UNION SELECT title, input_smiles FROM error_record"
        else:
            sql = "SELECT title, normalized_smiles FROM record"
        cursor.execute(sql)
        rows = cursor
    else:
        dataset = open_dataset_from_options_or_exit(database_options)
        db = dataset.mmpa_db
        compounds = dataset.iter_compounds()
        smiles_attr = "input_smiles" if input_smiles else "clean_smiles"
        rows = ((compound.public_id, getattr(compound, smiles_attr)) for compound in compounds)

    with db:
        # One "SMILES<tab>identifier" line per row.
        for title, smi in rows:
            output_file.write(f"{smi}\t{title}\n")
88 | 
89 | 


--------------------------------------------------------------------------------
/mmpdblib/cli/smifrag.py:
--------------------------------------------------------------------------------
  1 | "Implement the 'smifrag' command"
  2 | 
  3 | # mmpdb - matched molecular pair database generation and analysis
  4 | #
  5 | # Copyright (c) 2015-2017, F. Hoffmann-La Roche Ltd.
  6 | # Copyright (c) 2019-2021, Andrew Dalke Scientific, AB
  7 | #
  8 | # Redistribution and use in source and binary forms, with or without
  9 | # modification, are permitted provided that the following conditions are
 10 | # met:
 11 | #
 12 | #    * Redistributions of source code must retain the above copyright
 13 | #      notice, this list of conditions and the following disclaimer.
 14 | #    * Redistributions in binary form must reproduce the above
 15 | #      copyright notice, this list of conditions and the following
 16 | #      disclaimer in the documentation and/or other materials provided
 17 | #      with the distribution.
 18 | #    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
 19 | #      its contributors may be used to endorse or promote products
 20 | #      derived from this software without specific prior written
 21 | #      permission.
 22 | #
 23 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 24 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 25 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 26 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 27 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 28 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 29 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 30 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 31 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 32 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 33 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 34 | #
 35 | 
 36 | import click
 37 | from .click_utils import (
 38 |     command,
 39 |     die,
 40 |     name_to_command_line,
 41 | )
 42 | 
 43 | from .. import smarts_aliases
 44 | 
 45 | from . import fragment_click
 46 | from .. import fragment_records
 47 | from .. import fragment_types
 48 | 
 49 | 
 50 | ########
# Epilog for the "smifrag" command: a line holding only the "\b" escape,
# followed by the text that smarts_aliases.get_epilog() returns for the
# "--cut-smarts" option and the cut SMARTS aliases.
smifrag_epilog = """
\b
""" + smarts_aliases.get_epilog(
    "--cut-smarts", smarts_aliases.cut_smarts_aliases
)
 56 | 
 57 | 
 58 | @command(epilog=smifrag_epilog)
 59 | @fragment_click.add_fragment_options
 60 | @click.argument(
 61 |     "smiles",
 62 |     # help="SMILES string to fragment"
 63 | )
 64 | @click.pass_context
 65 | def smifrag(ctx, fragment_options, smiles):
 66 |     """Fragment a single SMILES string
 67 | 
 68 |     SMILES: the SMILES string of the structure to fragment
 69 | 
 70 |     Fragment a SMILES and print details about each variable and constant
 71 |     fragment and how they are connected.
 72 | 
 73 |     """
 74 | 
 75 |     reporter = ctx.obj
 76 | 
 77 |     try:
 78 |         fragment_filter = fragment_options.get_fragment_filter()
 79 |     except fragment_types.FragmentValueError as err:
 80 |         die(f"Error in command-line option {name_to_command_line(err.name)!r} " f"({err.value!r}): err.reason")
 81 | 
 82 |     record = fragment_records.make_fragment_record_from_smiles(
 83 |         smiles,
 84 |         fragment_filter,
 85 |         reporter=reporter,
 86 |     )
 87 |     if record.errmsg:
 88 |         die(f"Cannot process SMILES: {record.errmsg}")
 89 | 
 90 |     columns = [
 91 |         ["#cuts"],
 92 |         ["enum.label"],
 93 |         ["#heavies"],
 94 |         ["symm.class"],
 95 |         ["smiles"],
 96 |         ["order"],
 97 |         ["#heavies"],
 98 |         ["symm.class"],
 99 |         ["smiles"],
100 |         ["with-H"],
101 |     ]
102 |     styles = [
103 |         "center",
104 |         "center",
105 |         "right",
106 |         "center",
107 |         "left",
108 |         "center",
109 |         "right",
110 |         "center",
111 |         "left",
112 |         "left",
113 |     ]
114 | 
115 |     for frag in record.fragmentations:
116 |         items = [
117 |             str(frag.num_cuts),
118 |             frag.enumeration_label,
119 |             str(frag.variable_num_heavies),
120 |             frag.variable_symmetry_class,
121 |             frag.variable_smiles,
122 |             frag.attachment_order,
123 |             str(frag.constant_num_heavies),
124 |             frag.constant_symmetry_class,
125 |             frag.constant_smiles,
126 |             frag.constant_with_H_smiles or "-",
127 |         ]
128 | 
129 |         for (item, column) in zip(items, columns):
130 |             column.append(str(item))
131 | 
132 |     sizes = []
133 |     for style, column in zip(styles, columns):
134 |         column_width = max(map(len, column))
135 |         column[0] = column[0].ljust(column_width)
136 |         sizes.append(column_width)
137 |         if len(column) == 1:
138 |             continue
139 | 
140 |         data_width = max(map(len, column[1:]))
141 | 
142 |         if style == "center":
143 |             column[1:] = [s.rjust(data_width).center(column_width) for s in column[1:]]
144 |         elif style == "left":
145 |             column[1:] = [s.ljust(data_width) for s in column[1:]]
146 |         ## elif style == "right-10":
147 |         ##     spacer = " "*10
148 |         ##     column[1:] = [spacer + s.rjust(data_width-10).center(column_width-10) for s in column[1:]]
149 |         elif style == "right":
150 |             column[1:] = [s.rjust(data_width).center(column_width) for s in column[1:]]
151 |         else:
152 |             raise AssertionError(style)
153 | 
154 |     first_line = (
155 |         " " * sizes[0]
156 |         + "   "
157 |         + " " * sizes[1]
158 |         + " |-"
159 |         + "  variable  ".center(sizes[2] + sizes[3] + sizes[4] + 6, "-")
160 |         + "-| "
161 |         + " " * sizes[5]
162 |         + " |-"
163 |         + "  constant  ".center(sizes[6] + sizes[7] + sizes[8] + sizes[9] + 9, "-")
164 |     )
165 | 
166 |     print(first_line)
167 |     for lineno, fields in enumerate(zip(*columns)):
168 |         print(*fields, sep=" | ")
169 |         if lineno == 0:
170 |             print(*["-" * len(s) for s in fields], sep="-+-")
171 | 


--------------------------------------------------------------------------------
/mmpdblib/config.py:
--------------------------------------------------------------------------------
  1 | # mmpdb - matched molecular pair database generation and analysis
  2 | #
  3 | # Copyright (c) 2015-2017, F. Hoffmann-La Roche Ltd.
  4 | #
  5 | # Redistribution and use in source and binary forms, with or without
  6 | # modification, are permitted provided that the following conditions are
  7 | # met:
  8 | #
  9 | #    * Redistributions of source code must retain the above copyright
 10 | #      notice, this list of conditions and the following disclaimer.
 11 | #    * Redistributions in binary form must reproduce the above
 12 | #      copyright notice, this list of conditions and the following
 13 | #      disclaimer in the documentation and/or other materials provided
 14 | #      with the distribution.
 15 | #    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
 16 | #      its contributors may be used to endorse or promote products
 17 | #      derived from this software without specific prior written
 18 | #      permission.
 19 | #
 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 21 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 22 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 23 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 24 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 25 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 26 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 27 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 28 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 29 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 31 | #
 32 | 
 33 | #### Handle command-line arguments ####
 34 | 
 35 | import argparse
 36 | 
 37 | from . import smarts_aliases
 38 | from . import fragment_types
 39 | from . import index_types
 40 | 
 41 | # Things to pass as the ArgumentParser argument's 'type'
 42 | 
 43 | 
 44 | def positive_int(value):
 45 |     try:
 46 |         value = int(value)
 47 |     except ValueError:
 48 |         raise argparse.ArgumentTypeError("must be a positive integer")
 49 |     if value <= 0:
 50 |         raise argparse.ArgumentTypeError("must be a positive integer")
 51 |     return value
 52 | 
 53 | 
 54 | def positive_int_or_none(value):
 55 |     if value == "none":
 56 |         return "none"
 57 |     try:
 58 |         value = int(value)
 59 |     except ValueError:
 60 |         raise argparse.ArgumentTypeError("must be a positive integer or 'none'")
 61 |     if value <= 0:
 62 |         raise argparse.ArgumentTypeError("must be a positive integer or 'none'")
 63 |     return value
 64 | 
 65 | 
 66 | def positive_float(value):
 67 |     try:
 68 |         value = float(value)
 69 |     except ValueError:
 70 |         raise argparse.ArgumentTypeError("must be a positive float")
 71 |     if not (value > 0.0):
 72 |         raise argparse.ArgumentTypeError("must be a positive float")
 73 |     return value
 74 | 
 75 | 
 76 | def nonnegative_float(value):
 77 |     try:
 78 |         value = float(value)
 79 |     except ValueError:
 80 |         raise argparse.ArgumentTypeError("must be a positive float or zero")
 81 |     if not (value >= 0.0):
 82 |         raise argparse.ArgumentTypeError("must be a positive float or zero")
 83 |     return value
 84 | 
 85 | 
 86 | def nonnegative_int(value):
 87 |     try:
 88 |         value = int(value)
 89 |     except ValueError:
 90 |         raise argparse.ArgumentTypeError("must be a positive integer or zero")
 91 |     if not (value >= 0):
 92 |         raise argparse.ArgumentTypeError("must be a positive integer or zero")
 93 |     return value
 94 | 
 95 | 
 96 | def cutoff_list(value_s):
 97 |     prev = None
 98 |     values = []
 99 |     for term in value_s.split(","):
100 |         try:
101 |             value = int(term)
102 |         except ValueError as err:
103 |             raise argparse.ArgumentTypeError("could not parse %r as an integer: %s" % (term, err))
104 | 
105 |         if value < 0:
106 |             raise argparse.ArgumentTypeError("threshold values must be non-negative")
107 | 
108 |         if prev is not None and prev <= value:
109 |             raise argparse.ArgumentTypeError("threshold values must be in decreasing order")
110 |         prev = value
111 | 
112 |         values.append(value)
113 | 
114 |     if not values:  # Let people specify ""
115 |         return [0]
116 | 
117 |     return values
118 | 
119 | 
120 | #### Fragment
121 | 
# Parsers for the fragment options. Each name is an alias for one of the
# generic validator functions defined earlier in this module.
parse_max_heavies_value = positive_int_or_none
parse_max_rotatable_bonds_value = positive_int_or_none
parse_min_heavies_per_const_frag_value = nonnegative_int
parse_min_heavies_total_const_frag_value = nonnegative_int
126 | 
127 | 
def parse_num_cuts_value(value):
    """Return `value` as an int if it is one of the strings "1", "2" or "3".

    Raises argparse.ArgumentTypeError for anything else.
    """
    if value in ("1", "2", "3"):
        return int(value)
    raise argparse.ArgumentTypeError("must be '1', '2', or '3'")
132 | 
133 | 
def parse_method_value(value):
    """Return `value` unchanged if it is "chiral", the only accepted method.

    Raises argparse.ArgumentTypeError for anything else.
    """
    if value in ("chiral",):
        return value
    raise argparse.ArgumentTypeError("must be 'chiral'")
138 | 
139 | 
# Default fragmentation settings, stored as a fragment_types.FragmentOptions.
DEFAULT_FRAGMENT_OPTIONS = fragment_types.FragmentOptions(
    max_heavies=100,
    max_rotatable_bonds=10,
    rotatable_smarts="[!$([NH]!@C(=O))&!D1&!$(*#*)]-&!@[!$([NH]!@C(=O))&!D1&!$(*#*)]",
    # The SMARTS of the cut SMARTS alias registered under the name "default".
    cut_smarts=smarts_aliases.cut_smarts_aliases_by_name["default"].smarts,
    num_cuts=3,
    method="chiral",
    salt_remover="<default>",
    min_heavies_per_const_frag=0,
    min_heavies_total_const_frag=0,
    max_up_enumerations=1000,
)
152 | 
153 | 
154 | ###### Index
155 | 
# Parsers for the index options. Each name is an alias for one of the
# generic validator functions defined earlier in this module.
parse_min_variable_heavies_value = nonnegative_int
parse_max_variable_heavies_value = positive_int_or_none

parse_max_variable_ratio_value = nonnegative_float
parse_min_variable_ratio_value = positive_float
parse_max_heavies_transf = nonnegative_int
parse_max_frac_trans = nonnegative_float
parse_max_radius = nonnegative_int
164 | 
165 | 
# Default indexing settings, stored as an index_types.IndexOptions.
# add_index_options() reads some of its argument defaults from here.
DEFAULT_INDEX_OPTIONS = index_types.IndexOptions(
    min_variable_heavies=None,  # XXX can this be 0?
    max_variable_heavies=10,
    min_variable_ratio=None,  # XXX can this be 0.0?
    max_variable_ratio=None,
    max_heavies_transf=None,
    max_frac_trans=None,  # XXX can this be 1.0?,
    min_radius=0,
    max_radius=5,
    symmetric=False,
    smallest_transformation_only=False,
)
178 | 
179 | 
def add_index_options(parser):
    """Register the indexing command-line arguments on `parser`.

    `parser` is anything with an argparse-style ``add_argument()`` method.
    The arguments added are --min-variable-heavies, --max-variable-heavies,
    --min-variable-ratio, --max-variable-ratio, --max-heavies-transf,
    --max-frac-trans and --max-radius. Defaults that are not None are read
    from DEFAULT_INDEX_OPTIONS.
    """
    p = parser
    # Single short name for the defaults, used for every lookup below.
    OPTS = DEFAULT_INDEX_OPTIONS
    p.add_argument(
        "--min-variable-heavies",
        type=parse_min_variable_heavies_value,
        metavar="N",
        default=OPTS.min_variable_heavies,
        help="Minimum number of non-hydrogen atoms in the variable fragment.",
    )
    p.add_argument(
        "--max-variable-heavies",
        type=parse_max_variable_heavies_value,
        default=OPTS.max_variable_heavies,
        metavar="N",
        help="Maximum number of non-hydrogen atoms in the variable fragment "
        "(default: 10; for no maximum use 'none')",
    )
    p.add_argument(
        "--min-variable-ratio",
        type=parse_min_variable_ratio_value,
        default=None,
        metavar="FLT",
        help="Minimum ratio of variable fragment heavies to heavies in the (cleaned) structure",
    )
    p.add_argument(
        "--max-variable-ratio",
        type=parse_max_variable_ratio_value,
        default=None,
        metavar="FLT",
        help="Maximum ratio of variable fragment heavies to heavies in the (cleaned) structure",
    )
    p.add_argument(
        "--max-heavies-transf",
        type=parse_max_heavies_transf,
        default=None,
        metavar="N",
        help="Maximum difference in the number of heavies transferred in a transformation",
    )
    p.add_argument(
        "--max-frac-trans",
        type=parse_max_frac_trans,
        default=None,
        metavar="FLT",
        help="Maximum fraction of atoms taking part in a transformation",
    )
    p.add_argument(
        "--max-radius",
        type=parse_max_radius,
        default=OPTS.max_radius,
        metavar="N",
        help="Maximum Environment Radius to be indexed in the MMPDB database",
    )
233 | 
234 | 
class RuleSelectionOptions:
    """Holds the `where`, `score` and `cutoff_list` rule-selection settings.

    The three values are stored as given; get_rule_selection_function()
    turns them into a selection function.
    """

    def __init__(self, where, score, cutoff_list):
        self.where, self.score, self.cutoff_list = where, score, cutoff_list

    def get_rule_selection_function(self):
        """Build an analysis_algorithms.RuleSelectionFunction from the stored settings."""
        from . import analysis_algorithms

        # A score of None selects the module's default score function.
        scorer = self.score
        if scorer is None:
            scorer = analysis_algorithms.default_score_function

        # The rule key combines the score function with the cutoffs.
        key_function = analysis_algorithms.ComputeRuleKey(
            score_function=scorer,
            cutoffs=self.cutoff_list,
        )
        return analysis_algorithms.RuleSelectionFunction(self.where, key_function)
258 | 
259 | 
# Default rule selection settings: no `where` value, no explicit score
# (get_rule_selection_function() then uses the default score function),
# and the cutoffs 10, 5 and 0.
DEFAULT_RULE_SELECTION_OPTIONS = RuleSelectionOptions(
    where=None,
    score=None,
    cutoff_list=(10, 5, 0),
)
265 | 


--------------------------------------------------------------------------------
/mmpdblib/create_index.sql:
--------------------------------------------------------------------------------
 1 | -- mmpdb - matched molecular pair database generation and analysis
 2 | --
 3 | -- Copyright (c) 2015-2017, F. Hoffmann-La Roche Ltd.
 4 | --
 5 | -- Redistribution and use in source and binary forms, with or without
 6 | -- modification, are permitted provided that the following conditions are
 7 | -- met:
 8 | --
 9 | --    * Redistributions of source code must retain the above copyright
10 | --     notice, this list of conditions and the following disclaimer.
11 | --    * Redistributions in binary form must reproduce the above
12 | --      copyright notice, this list of conditions and the following
13 | --      disclaimer in the documentation and/or other materials provided
14 | --      with the distribution.
15 | --    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
16 | --      its contributors may be used to endorse or promote products
17 | --      derived from this software without specific prior written
18 | --      permission.
19 | --
20 | -- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | -- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | -- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | -- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | -- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | -- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | -- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | -- DATA, OR PROFITS, OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | -- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | -- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | -- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | --
32 | -- END OF LICENSE
33 | 
-- Indexes on the rule_environment table
CREATE INDEX rule_environment_rule_id on rule_environment (rule_id);
CREATE INDEX rule_environment_environment_fingerprint_id on rule_environment (environment_fingerprint_id);
CREATE INDEX rule_environment_rule_id_num_pairs on rule_environment (rule_id, num_pairs);
CREATE INDEX rule_environment_rule_id_num_pairs_fingerprint_id on rule_environment (rule_id, num_pairs, environment_fingerprint_id);

-- Indexes on the rule, rule_smiles, environment_fingerprint and
-- rule_environment_statistics tables (rule_smiles.smiles must be unique)
CREATE INDEX rule_from_smiles_id on rule (from_smiles_id);
CREATE INDEX rule_to_smiles_id on rule (to_smiles_id);
CREATE UNIQUE INDEX rule_smiles_smiles on rule_smiles (smiles);
CREATE INDEX environment_fingerprint_smarts on environment_fingerprint (smarts);
CREATE INDEX rule_environment_statistics_rule_environment_and_property_name_ids on rule_environment_statistics (rule_environment_id, property_name_id);
CREATE INDEX rule_environment_statistics_count on rule_environment_statistics(count);

-- Indexes on the pair table
CREATE INDEX pair_rule_environment_id on pair (rule_environment_id);
CREATE INDEX pair_compound_ids on pair (compound1_id, compound2_id);

-- Index on the compound_property table
CREATE INDEX compound_property_compound_id_property_name_id on compound_property (compound_id, property_name_id);
50 | 


--------------------------------------------------------------------------------
/mmpdblib/drop_index.sql:
--------------------------------------------------------------------------------
 1 | -- mmpdb - matched molecular pair database generation and analysis
 2 | --
 3 | -- Copyright (c) 2015-2017, F. Hoffmann-La Roche Ltd.
 4 | --
 5 | -- Redistribution and use in source and binary forms, with or without
 6 | -- modification, are permitted provided that the following conditions are
 7 | -- met:
 8 | --
 9 | --    * Redistributions of source code must retain the above copyright
10 | --      notice, this list of conditions and the following disclaimer.
11 | --    * Redistributions in binary form must reproduce the above
12 | --      copyright notice, this list of conditions and the following
13 | --      disclaimer in the documentation and/or other materials provided
14 | --      with the distribution.
15 | --    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
16 | --      its contributors may be used to endorse or promote products
17 | --      derived from this software without specific prior written
18 | --      permission.
19 | --
20 | -- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | -- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | -- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | -- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | -- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | -- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | -- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | -- DATA, OR PROFITS, OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | -- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | -- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | -- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | --
32 | -- END OF LICENSE
33 | 
-- Indexes on the rule_environment table
DROP INDEX IF EXISTS rule_environment_rule_id;
DROP INDEX IF EXISTS rule_environment_environment_fingerprint_id;
DROP INDEX IF EXISTS rule_environment_rule_id_num_pairs;
DROP INDEX IF EXISTS rule_environment_rule_id_num_pairs_fingerprint_id;

-- Indexes named after the rule, rule_smiles, environment_fingerprint and
-- rule_environment_statistics tables
DROP INDEX IF EXISTS rule_from_smiles_id;
DROP INDEX IF EXISTS rule_to_smiles_id;
DROP INDEX IF EXISTS rule_smiles_smiles;
DROP INDEX IF EXISTS environment_fingerprint_smarts;
DROP INDEX IF EXISTS rule_environment_statistics_rule_environment_and_property_name_ids;
DROP INDEX IF EXISTS rule_environment_statistics_count;

-- Indexes named after the pair table
DROP INDEX IF EXISTS pair_rule_environment_id;
DROP INDEX IF EXISTS pair_compound_ids;

-- Index named after the compound_property table
DROP INDEX IF EXISTS compound_property_compound_id_property_name_id;
50 | 


--------------------------------------------------------------------------------
/mmpdblib/fragment_create_index.sql:
--------------------------------------------------------------------------------
-- Index the fragmentation rows by the record they belong to
CREATE INDEX fragmentation_on_record_id ON fragmentation(record_id);

-- Needed in fragdb_partition
CREATE INDEX fragmentation_on_constant_smiles ON fragmentation(constant_smiles);

-- Titles must be unique within the record table
CREATE UNIQUE INDEX record_on_title ON record(title);

-- Titles must be unique within the error_record table
CREATE UNIQUE INDEX error_record_on_title ON error_record(title);
9 | 


--------------------------------------------------------------------------------
/mmpdblib/fragment_records.py:
--------------------------------------------------------------------------------
  1 | # mmpdb - matched molecular pair database generation and analysis
  2 | #
  3 | # Copyright (c) 2015-2017, F. Hoffmann-La Roche Ltd.
  4 | # Copyright (c) 2019-2021, Andrew Dalke Scientific, AB
  5 | #
  6 | # Redistribution and use in source and binary forms, with or without
  7 | # modification, are permitted provided that the following conditions are
  8 | # met:
  9 | #
 10 | #    * Redistributions of source code must retain the above copyright
 11 | #      notice, this list of conditions and the following disclaimer.
 12 | #    * Redistributions in binary form must reproduce the above
 13 | #      copyright notice, this list of conditions and the following
 14 | #      disclaimer in the documentation and/or other materials provided
 15 | #      with the distribution.
 16 | #    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
 17 | #      its contributors may be used to endorse or promote products
 18 | #      derived from this software without specific prior written
 19 | #      permission.
 20 | #
 21 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 22 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 23 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 24 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 25 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 26 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 27 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 28 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 29 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 30 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 31 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 32 | #
 33 | 
 34 | import sys
 35 | 
 36 | from . import fragment_types
 37 | from . import fileio
 38 | 
 39 | 
 40 | def _as_list(method, normalized_mol, fragment_filter, num_normalized_heavies):
 41 |     return list(method(normalized_mol, fragment_filter, num_normalized_heavies))
 42 | 
 43 | 
 44 | ###
 45 | 
 46 | 
 47 | class ParsedSmilesRecord(object):
 48 |     __slots__ = (
 49 |         "id",
 50 |         "smiles",
 51 |         "mol",
 52 |         "normalized_mol",
 53 |         "normalized_smiles",
 54 |         "num_normalized_heavies",
 55 |     )
 56 | 
 57 |     def __init__(self, id, smiles, mol, normalized_mol, normalized_smiles, num_normalized_heavies):
 58 |         self.id = id
 59 |         self.smiles = smiles
 60 |         self.mol = mol
 61 |         self.normalized_mol = normalized_mol
 62 |         self.normalized_smiles = normalized_smiles
 63 |         self.num_normalized_heavies = num_normalized_heavies
 64 | 
 65 | 
 66 | def parse_record(id, smiles, fragment_filter):
 67 |     from rdkit import Chem
 68 |     from . import fragment_algorithm
 69 | 
 70 |     mol = Chem.MolFromSmiles(smiles)
 71 |     if mol is None:
 72 |         return "invalid smiles", ParsedSmilesRecord(id, smiles, mol, None, None, 0)
 73 | 
 74 |     errmsg, normalized_mol = fragment_filter.normalize(mol)
 75 |     normalized_smiles = Chem.MolToSmiles(normalized_mol, isomericSmiles=True)
 76 |     if errmsg is None:
 77 |         if "." in normalized_smiles:
 78 |             errmsg = "multiple fragments"
 79 |     num_normalized_heavies = mol.GetNumHeavyAtoms()
 80 | 
 81 |     record = ParsedSmilesRecord(id, smiles, mol, normalized_mol, normalized_smiles, num_normalized_heavies)
 82 |     if errmsg is not None:
 83 |         return errmsg, record
 84 | 
 85 |     errmsg = fragment_filter.apply_filters(normalized_mol)
 86 |     if errmsg is not None:
 87 |         return errmsg, record
 88 | 
 89 |     return None, record
 90 | 
 91 | 
 92 | def make_hydrogen_fragment_record(id, input_smiles, fragment_filter):
 93 |     from . import fragment_algorithm
 94 | 
 95 |     errmsg, record = parse_record(id, input_smiles, fragment_filter)
 96 |     if errmsg:
 97 |         return fragment_types.FragmentErrorRecord(id, input_smiles, errmsg)
 98 | 
 99 |     fragments = fragment_algorithm.fragment_molecule_on_explicit_hydrogens(input_smiles)
100 |     return fragment_types.FragmentRecord(
101 |         id,
102 |         input_smiles,
103 |         record.num_normalized_heavies,
104 |         record.normalized_smiles,
105 |         fragments,
106 |     )
107 | 
108 | 
# Adapter to emulate the multiprocessing pool without using any threads/processes
class SingleProcessPool(object):
    """In-process stand-in for a pool; provides only the methods used by this module."""

    def apply_async(self, f, args):
        "Wrap the call in a SyncResult; nothing is computed until its get() is called"
        deferred = SyncResult(f, args)
        return deferred

    def terminate(self):
        "Does nothing; there are no workers to stop"
        return None

    def join(self):
        "Does nothing; there are no workers to wait for"
        return None

    def close(self):
        "Does nothing; there are no resources to release"
        return None
122 | 
123 | 
class SyncResult(object):
    """A deferred function call, returned by SingleProcessPool.apply_async()."""

    def __init__(self, f, args):
        self.f, self.args = f, args

    def get(self):
        # The call is made here, when the value is requested, and the
        # value is not saved. Calling .get() twice calls the function
        # twice, which is all that this code needs.
        func, call_args = self.f, self.args
        return func(*call_args)
134 | 
135 | 
def make_fragment_records(smiles_reader, fragment_filter, cache=None, pool=None, reporter=None):
    """Generate a fragment record or error record for each input structure.

    smiles_reader -- iterable of terms, where terms[0] is the SMILES and
        terms[1] is the id. It must have a `location` attribute whose
        where() method describes the current input position; that text
        is used in the diagnostic messages.
    fragment_filter -- passed to parse_record(); its `method` attribute
        is the function which does the fragmentation
    cache -- optional object whose get(id) returns an earlier record or
        None. A cached record is only used if its input_smiles is the
        same as the new input SMILES.
    pool -- optional object with apply_async(f, args) returning a result
        with a get() method. The default computes in this process.
    reporter -- None, "quiet", "verbose", or a reporter object (see
        reporters.get_reporter)

    Yields, in input order, a FragmentRecord for each structure which
    was fragmented and a FragmentErrorRecord for each which could not be
    parsed or fragmented, plus any record taken from the cache. If the
    fragmentation raises a RuntimeError then a message is written to
    stderr and nothing is yielded for that structure.

    This is a generator, so no work is done until it is iterated. All of
    the input is read and submitted to the pool before the first record
    is yielded.
    """
    from collections import deque
    from . import reporters

    # The default is reporter=None, which has no progress() method.
    # Convert it (and the reporter names) to a reporter object.
    reporter = reporters.get_reporter(reporter)

    if pool is None:
        pool = SingleProcessPool()

    # A deque so that removing the oldest job is O(1). Removing the first
    # item of a list is O(n), which is O(n**2) across all of the jobs.
    jobs = deque()

    # There are two phases:
    #   1) establish what needs to be fragmented vs. what is available from
    #       cache or could not be parsed
    #   2) fragment the unfragmented
    for recno, terms in reporter.progress(enumerate(smiles_reader), "Preparing record"):
        input_smiles = terms[0]
        id = terms[1]
        where = smiles_reader.location.where()

        # If the fragment information is available from cache then use it
        if cache is not None:
            record = cache.get(id)
            if record is not None and record.input_smiles == input_smiles:
                jobs.append((id, input_smiles, where, None, record))
                continue

        # If I can't parse it then record the error messages
        errmsg, record = parse_record(id, input_smiles, fragment_filter)
        if errmsg:
            result = fragment_types.FragmentErrorRecord(id, input_smiles, errmsg)
            jobs.append((id, input_smiles, where, None, result))
            continue

        # Submit it as something to work on
        args = (
            fragment_filter.method,
            record.normalized_mol,
            fragment_filter,
            record.num_normalized_heavies,
        )
        result = pool.apply_async(_as_list, args)  # fragment_filter.method calls the actual fragmentation algorithm

        jobs.append((id, input_smiles, where, record, result))

    # I'll be a bit cautious. I'll process the jobs in order, yield
    # the result, then throw it away. This keeps the job list from
    # being filled with completed results.
    def pop_iter(jobs):
        while jobs:
            yield jobs.popleft()

    with reporter.progress(pop_iter(jobs), "Fragmented record", len(jobs)) as job_iter:
        for (id, input_smiles, where, record, result) in job_iter:
            if record is None:
                # use a pre-computed result (from cache, or an error record)
                yield result
                continue

            try:
                fragments = result.get()
            except RuntimeError as err:
                # Some sort of RDKit failure.
                reporter.update("")
                print("Skipping %s: %s" % (err, where), file=sys.stderr)
                continue

            except fragment_types.FragmentationFailure as err:
                yield fragment_types.FragmentErrorRecord(
                    id,
                    input_smiles,
                    str(err),
                )
                continue

            except Exception:
                # Something unexpected happened.
                # Give some idea of what failed.
                reporter.update("")
                print("Failure:", where, file=sys.stderr)
                raise

            yield fragment_types.FragmentRecord(
                id,
                input_smiles,
                record.num_normalized_heavies,
                record.normalized_smiles,
                fragments,
            )
221 | 
222 | 
223 | ########
224 | 
225 | 
class SingleSmilesReader(object):
    """Reader-like object which supplies one (smiles, id) pair."""

    # Class-level location, created with the source name "<string>".
    # make_fragment_records() calls location.where() on its reader.
    location = fileio.Location.from_source("<string>")

    def __init__(self, smiles, id="query"):
        self.smiles = smiles
        self.id = id

    def __iter__(self):
        # SMILES first, then the id: the order make_fragment_records() expects.
        only_record = (self.smiles, self.id)
        yield only_record
235 | 
236 | 
def make_fragment_record_from_smiles(smiles, fragment_filter, reporter=None):
    """Fragment one SMILES string and return the resulting record.

    The SMILES is passed through make_fragment_records() using a
    one-element reader, and the first record produced is returned.
    `reporter` may be anything accepted by reporters.get_reporter().

    Raises AssertionError if no record is produced.
    """
    from . import reporters

    active_reporter = reporters.get_reporter(reporter)
    single_reader = SingleSmilesReader(smiles)
    record_iter = iter(make_fragment_records(single_reader, fragment_filter, reporter=active_reporter))

    # A private sentinel tells "nothing was produced" apart from any
    # value which might be produced.
    nothing = object()
    first_record = next(record_iter, nothing)
    if first_record is nothing:
        raise AssertionError("how can there not be any records?")
    return first_record
247 | 


--------------------------------------------------------------------------------
/mmpdblib/fragment_schema.sql:
--------------------------------------------------------------------------------
 1 | -- mmpdb - matched molecular pair database generation and analysis
 2 | --
 3 | -- Copyright (c) 2021, Andrew Dalke Scientific, AB
 4 | --
 5 | -- Redistribution and use in source and binary forms, with or without
 6 | -- modification, are permitted provided that the following conditions are
 7 | -- met:
 8 | --
 9 | --    * Redistributions of source code must retain the above copyright
10 | --      notice, this list of conditions and the following disclaimer.
11 | --    * Redistributions in binary form must reproduce the above
12 | --      copyright notice, this list of conditions and the following
13 | --      disclaimer in the documentation and/or other materials provided
14 | --      with the distribution.
15 | --    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
16 | --      its contributors may be used to endorse or promote products
17 | --      derived from this software without specific prior written
18 | --      permission.
19 | --
20 | -- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | -- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | -- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | -- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | -- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | -- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | -- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | -- DATA, OR PROFITS, OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | -- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | -- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | -- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | --
32 | -- END OF LICENSE
33 | 
34 | 
35 | -- Version 3.0 switched to a SQLite database to store the fragments.
36 | -- Earlier versions used JSON-Lines.
37 | -- The SQLite database improves I/O time, reduces memory use, and
38 | --     simplifies the development of fragment analysis tools.
39 | 
40 | -- NOTE: There is configuration information in three files!
41 | -- 1) fragment_types.py -- the data types
42 | -- 2) fragment_schema.sql -- (this file) defines the SQL schema
43 | -- 3) fragment_db.py -- defines the mapping from SQL to the data types
44 | 
45 | 
-- Settings stored with the fragment database.
-- NOTE(review): the column names suggest these are the options used
-- when fragmenting (cut SMARTS, size limits, method, ...) -- confirm
-- against fragment_types.py and fragment_db.py.
CREATE TABLE options (
	id INTEGER NOT NULL,
	version INTEGER,
	cut_smarts VARCHAR(1000),
	max_heavies INTEGER,
	max_rotatable_bonds INTEGER,
	method VARCHAR(20),
	num_cuts INTEGER,
	rotatable_smarts VARCHAR(1000),
	salt_remover VARCHAR(200),
	min_heavies_per_const_frag INTEGER,
	min_heavies_total_const_frag INTEGER,
        max_up_enumerations INTEGER,
	PRIMARY KEY (id)
);
61 | 
-- An input structure together with an error message.
-- NOTE(review): presumably one row for each input which could not be
-- parsed or fragmented, with the reason in 'errmsg' -- confirm against
-- fragment_db.py.
-- The title and input SMILES are required; errmsg may be NULL.
CREATE TABLE error_record (
	id INTEGER NOT NULL,
	title VARCHAR(100) NOT NULL,
	input_smiles VARCHAR(300) NOT NULL,
	errmsg VARCHAR(100),
	PRIMARY KEY (id)
);
69 | 
-- Unfortunately, this 'record' does not use the same column names as
-- 'compound' in the mmpdb schema.

-- An input structure: its title, input SMILES, normalized SMILES, and
-- the heavy atom count stored with it. Rows of 'fragmentation' refer
-- to this table through fragmentation.record_id.
CREATE TABLE record (
	id INTEGER NOT NULL,
	title VARCHAR(50) NOT NULL,
	input_smiles VARCHAR(400) NOT NULL,
	num_normalized_heavies INTEGER,
	normalized_smiles VARCHAR(350) NOT NULL,
	PRIMARY KEY (id)
);
81 | 
-- A fragmentation belonging to the 'record' row given by record_id.
-- Each row has a "variable" and a "constant" part, each described by a
-- heavy atom count, a symmetry class and a SMILES.
-- NOTE(review): presumably one row per way of cutting the structure --
-- confirm against fragment_types.py.
-- constant_with_H_smiles is the only SMILES column which may be NULL.
CREATE TABLE fragmentation (
	id INTEGER NOT NULL,
	record_id INTEGER,
	num_cuts INTEGER,
	enumeration_label VARCHAR(1) NOT NULL,
	variable_num_heavies INTEGER,
	variable_symmetry_class VARCHAR(3) NOT NULL,
	variable_smiles VARCHAR(350) NOT NULL,
	attachment_order VARCHAR(3) NOT NULL,
	constant_num_heavies INTEGER,
	constant_symmetry_class VARCHAR(3) NOT NULL,
	constant_smiles VARCHAR(350) NOT NULL,
	constant_with_H_smiles VARCHAR(350),
	PRIMARY KEY (id),
	FOREIGN KEY(record_id) REFERENCES record (id)
);
98 | 


--------------------------------------------------------------------------------
/mmpdblib/index_types.py:
--------------------------------------------------------------------------------
  1 | "index configuration"
  2 | # mmpdb - matched molecular pair database generation and analysis
  3 | #
  4 | # Copyright (c) 2015-2017, F. Hoffmann-La Roche Ltd.
  5 | # Copyright (c) 2021, F. Hoffmann-La Roche Ltd.
  6 | #
  7 | # Redistribution and use in source and binary forms, with or without
  8 | # modification, are permitted provided that the following conditions are
  9 | # met:
 10 | #
 11 | #    * Redistributions of source code must retain the above copyright
 12 | #      notice, this list of conditions and the following disclaimer.
 13 | #    * Redistributions in binary form must reproduce the above
 14 | #      copyright notice, this list of conditions and the following
 15 | #      disclaimer in the documentation and/or other materials provided
 16 | #      with the distribution.
 17 | #    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
 18 | #      its contributors may be used to endorse or promote products
 19 | #      derived from this software without specific prior written
 20 | #      permission.
 21 | #
 22 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 23 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 24 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 25 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 26 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 27 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 28 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 29 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 30 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 31 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 32 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 33 | #
 34 | 
 35 | # These are shared by a few different modules and placed into its own
 36 | # module to reduce inter-dependencies between them.
 37 | 
 38 | 
 39 | class IndexOptions(object):
 40 |     __slots__ = (
 41 |         "min_variable_heavies",
 42 |         "max_variable_heavies",
 43 |         "min_variable_ratio",
 44 |         "max_variable_ratio",
 45 |         "min_radius",
 46 |         "max_radius",
 47 |         "symmetric",
 48 |         "max_heavies_transf",
 49 |         "max_frac_trans",
 50 |         "smallest_transformation_only",
 51 |     )
 52 | 
 53 |     def __init__(
 54 |         self,
 55 |         min_variable_heavies = None,
 56 |         max_variable_heavies = None,
 57 |         min_variable_ratio = None,
 58 |         max_variable_ratio = None,
 59 |         max_heavies_transf = None,
 60 |         max_frac_trans = None,
 61 |         min_radius = 0,
 62 |         max_radius = 5,
 63 |         symmetric = False,
 64 |         smallest_transformation_only = False,
 65 |     ):
 66 | 
 67 |         assert min_variable_heavies is None or min_variable_heavies >= 0, min_variable_heavies
 68 |         self.min_variable_heavies = min_variable_heavies
 69 | 
 70 |         assert (
 71 |             (max_variable_heavies is None)
 72 |             or (min_variable_heavies is None and max_variable_heavies >= 0)
 73 |             or (min_variable_heavies <= max_variable_heavies)
 74 |         ), max_variable_heavies
 75 |         self.max_variable_heavies = max_variable_heavies
 76 | 
 77 |         assert min_variable_ratio is None or 0.0 <= min_variable_ratio <= 1.0, min_variable_ratio
 78 |         self.min_variable_ratio = min_variable_ratio
 79 | 
 80 |         assert (
 81 |             (max_variable_ratio is None)
 82 |             or (min_variable_ratio is None and max_variable_ratio <= 1.0)
 83 |             or (min_variable_ratio <= max_variable_ratio <= 1.0)
 84 |         )
 85 |         self.max_variable_ratio = max_variable_ratio
 86 | 
 87 |         assert max_heavies_transf is None or max_heavies_transf >= 0, max_heavies_transf
 88 |         self.max_heavies_transf = max_heavies_transf
 89 | 
 90 |         assert max_frac_trans is None or max_frac_trans >= 0, max_heavies_transf
 91 |         self.max_frac_trans = max_frac_trans
 92 | 
 93 |         assert min_radius <= max_radius, (min_radius, max_radius)
 94 |         
 95 |         assert min_radius >= 0, min_radius
 96 |         self.min_radius = min_radius
 97 | 
 98 |         assert max_radius >= 0, max_radius
 99 |         self.max_radius = max_radius
100 | 
101 |         assert isinstance(symmetric, bool)
102 |         self.symmetric = symmetric
103 | 
104 |         assert isinstance(smallest_transformation_only, bool)
105 |         self.smallest_transformation_only = smallest_transformation_only
106 | 
107 |     def to_dict(self):
108 |         d = {}
109 |         for name in IndexOptions.__slots__:
110 |             value = getattr(self, name)
111 |             if value is not None:
112 |                 d[name] = value
113 |         return d
114 | 
115 |     def get_fragment_filter(self):
116 |         from . import index_algorithm
117 | 
118 |         filters = []
119 |         if self.min_variable_heavies is not None:
120 |             filters.append(index_algorithm.MinVariableHeaviesFilter(self.min_variable_heavies))
121 |         if self.max_variable_heavies is not None:
122 |             filters.append(index_algorithm.MaxVariableHeaviesFilter(self.max_variable_heavies))
123 |         if self.min_variable_ratio is not None:
124 |             filters.append(index_algorithm.MinVariableRatioFilter(self.min_variable_ratio))
125 |         if self.max_variable_ratio is not None:
126 |             filters.append(index_algorithm.MaxVariableRatioFilter(self.max_variable_ratio))
127 | 
128 |         if not filters:
129 |             # It's easier to have 0 filters than to make a special do-nothing filter.
130 |             return index_algorithm.MultipleFilters([])
131 |         elif len(filters) == 1:
132 |             return filters[0]
133 |         else:
134 |             return index_algorithm.MultipleFilters(filters)
135 | 


--------------------------------------------------------------------------------
/mmpdblib/properties_io.py:
--------------------------------------------------------------------------------
  1 | # mmpdb - matched molecular pair database generation and analysis
  2 | #
  3 | # Copyright (c) 2015-2017, F. Hoffmann-La Roche Ltd.
  4 | #
  5 | # Redistribution and use in source and binary forms, with or without
  6 | # modification, are permitted provided that the following conditions are
  7 | # met:
  8 | #
  9 | #    * Redistributions of source code must retain the above copyright
 10 | #      notice, this list of conditions and the following disclaimer.
 11 | #    * Redistributions in binary form must reproduce the above
 12 | #      copyright notice, this list of conditions and the following
 13 | #      disclaimer in the documentation and/or other materials provided
 14 | #      with the distribution.
 15 | #    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
 16 | #      its contributors may be used to endorse or promote products
 17 | #      derived from this software without specific prior written
 18 | #      permission.
 19 | #
 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 21 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 22 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 23 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 24 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 25 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 26 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 27 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 28 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 29 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 31 | #
 32 | 
 33 | 
 34 | class Properties(object):
 35 |     def __init__(self, id_header, id_column, property_names, property_columns, property_table):
 36 |         self.id_header = id_header
 37 |         self.id_column = id_column
 38 |         self.property_names = property_names
 39 |         self.property_columns = property_columns
 40 |         self.property_table = property_table
 41 | 
 42 |     def get_ids(self):
 43 |         return self.id_column
 44 | 
 45 |     def get_property_values(self, id):
 46 |         return self.property_table[id]
 47 | 
 48 |     def iter_properties(self):
 49 |         for name, column in zip(self.property_names, self.property_columns):
 50 |             yield name, column
 51 | 
 52 | 
 53 | # If a line contains a tab then it's tab-delimited.  Otherwise it's
 54 | # whitespace delimited.  (Originally it was whitespace delimited. Then
 55 | # I decided to support identifiers which contained a space in
 56 | # them. This seemed like a hacky-but-good-enough solution.)
 57 | def _split(line):
 58 |     if "\t" in line:
 59 |         return line.rstrip("\r\n").split("\t")
 60 |     return line.split()
 61 | 
 62 | 
 63 | def load_properties(properties_file, reporter):
 64 |     try:
 65 |         header_line = next(properties_file)
 66 |     except StopIteration:
 67 |         raise ValueError("First line of the properties file must contain the header")
 68 |     header_names = _split(header_line)
 69 |     if not header_names:
 70 |         raise ValueError("The properties file must contain at least one column name, for the id")
 71 |     if header_names[0] not in ("id", "ID", "Name", "name"):
 72 |         reporter.warning(
 73 |             "the identifier column in the properties file (column 1) has "
 74 |             "a header of %r; should be 'id', 'ID', 'Name', or 'name'" % (header_names[0],)
 75 |         )
 76 | 
 77 |     seen = set()
 78 |     for header_name in header_names:
 79 |         if header_name in seen:
 80 |             raise ValueError(
 81 |                 "Duplicate header %r found. A property name may not be listed more than once." % (header_name,)
 82 |             )
 83 |         seen.add(header_name)
 84 | 
 85 |     n = len(header_names)
 86 | 
 87 |     id_column = []
 88 |     property_table = {}
 89 |     property_rows = []
 90 |     for lineno, line in enumerate(properties_file, 2):
 91 |         fields = _split(line)
 92 |         if len(fields) != n:
 93 |             raise ValueError("Line %d has %d fields but the header has %d" % (lineno, len(fields), n))
 94 |         float_fields = []
 95 |         try:
 96 |             for field in fields[1:]:
 97 |                 if field == "*":
 98 |                     float_fields.append(None)
 99 |                 else:
100 |                     float_fields.append(float(field))
101 |         except ValueError:
102 |             raise ValueError("Line %d value %r cannot be converted to a float" % (lineno, field))
103 | 
104 |         id = fields[0]
105 |         id_column.append(id)
106 |         property_table[id] = float_fields
107 |         property_rows.append(float_fields)
108 | 
109 |     if property_rows:
110 |         property_columns = list(zip(*property_rows))
111 |     else:
112 |         property_columns = [[] for _ in header_names]
113 |     return Properties(header_names[0], id_column, header_names[1:], property_columns, property_table)
114 | 


--------------------------------------------------------------------------------
/mmpdblib/reporters.py:
--------------------------------------------------------------------------------
  1 | # mmpdb - matched molecular pair database generation and analysis
  2 | #
  3 | # Copyright (c) 2015-2017, F. Hoffmann-La Roche Ltd.
  4 | #
  5 | # Redistribution and use in source and binary forms, with or without
  6 | # modification, are permitted provided that the following conditions are
  7 | # met:
  8 | #
  9 | #    * Redistributions of source code must retain the above copyright
 10 | #      notice, this list of conditions and the following disclaimer.
 11 | #    * Redistributions in binary form must reproduce the above
 12 | #      copyright notice, this list of conditions and the following
 13 | #      disclaimer in the documentation and/or other materials provided
 14 | #      with the distribution.
 15 | #    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
 16 | #      its contributors may be used to endorse or promote products
 17 | #      derived from this software without specific prior written
 18 | #      permission.
 19 | #
 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 21 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 22 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 23 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 24 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 25 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 26 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 27 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 28 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 29 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 31 | #
 32 | 
 33 | """A 'reporter' is used to provide progress information and status reports"""
 34 | 
 35 | import sys
 36 | import time
 37 | 
 38 | 
 39 | def get_reporter(reporter):
 40 |     if reporter is None:
 41 |         return Quiet()
 42 |     if isinstance(reporter, str):
 43 |         if reporter == "quiet":
 44 |             return Quiet()
 45 |         if reporter == "verbose":
 46 |             return Verbose()
 47 |         raise ValueError(f"Unsupported reporter {reporter!r}")
 48 |     return reporter
 49 | 
 50 | 
 51 | class BaseReporter(object):
 52 |     def warning(self, msg):
 53 |         "Print a warning message"
 54 |         pass
 55 | 
 56 |     def report(self, msg):
 57 |         "Print a report message"
 58 |         pass
 59 | 
 60 |     def progress(self, it, text, n=None):
 61 |         "Return a context manager for giving status report about an iterator"
 62 |         return StatusContext(iter(it))
 63 | 
 64 |     def update(self, msg):
 65 |         "Update the status line. This will erase the previous status."
 66 |         pass
 67 | 
 68 |     def explain(self, msg, *args):
 69 |         if args:
 70 |             self.report(msg % args)
 71 |         else:
 72 |             self.report(msg)
 73 | 
 74 | 
class Quiet(BaseReporter):
    "This reporter does nothing"
    # All of the methods are inherited unchanged from BaseReporter.
 77 | 
 78 | 
 79 | # This lets me do things like:
 80 | class StatusContext(object):
 81 |     """Adapter to treat an iterator as a context manger"""
 82 | 
 83 |     def __init__(self, it):
 84 |         self._it = it
 85 | 
 86 |     def __enter__(self):
 87 |         return self
 88 | 
 89 |     def __iter__(self):
 90 |         return self._it
 91 | 
 92 |     def __exit__(self, *args):
 93 |         pass
 94 | 
 95 | 
 96 | class Verbose(BaseReporter):
 97 |     "This reporter sends report and status information to stderr."
 98 | 
 99 |     def __init__(self):
100 |         self._erase = ""  # how to erase the last status message
101 | 
102 |     def warning(self, msg):
103 |         "Clear any status message and report the warning"
104 |         self.update("")
105 |         sys.stderr.write("WARNING: %s\n" % (msg,))
106 |         sys.stderr.flush()
107 | 
108 |     def report(self, msg):
109 |         "Clear any status message and print the report line"
110 |         if self._erase:
111 |             self.update("")
112 |         sys.stderr.write(msg + "\n")
113 |         sys.stderr.flush()
114 | 
115 |     def progress(self, it, text, n=None):
116 |         # Used in iterators
117 | 
118 |         def iterate():
119 |             if n is None or n == 0:
120 | 
121 |                 def get_text(i):
122 |                     return text + " " + str(i)
123 | 
124 |             else:
125 | 
126 |                 def get_text(i):
127 |                     return text + " %d/%d (%.1f%%)" % (i, n, 100.0 * i / n)
128 | 
129 |             i = 0
130 |             self.update(get_text(i))
131 |             t1 = time.time()
132 |             try:
133 |                 for i, value in enumerate(it, 1):
134 |                     yield value
135 |                     t2 = time.time()
136 |                     if t2 - t1 > 0.5:
137 |                         self.update(get_text(i))
138 |                         t1 = t2
139 |             finally:
140 |                 self.update("")
141 | 
142 |         obj = StatusContext(iterate())
143 |         # A bit of a hack so I can wrap location-based iterators
144 |         if hasattr(it, "location"):
145 |             obj.location = getattr(it, "location")
146 |         return obj
147 | 
148 |     def update(self, msg):
149 |         "Update the status line (erase the previous status message and display the new one)"
150 |         sys.stderr.write(self._erase)
151 |         sys.stderr.write(msg)
152 |         sys.stderr.flush()
153 |         self._erase = "\r" + " " * len(msg) + "\r"
154 | 
155 | 
# This is a bit of a hack that was developed at the very end of the project.
# It's used during the database load process, from an ".mmpa" file.

# It's an iter-like wrapper which shows progress across multiple
# stages.  It assumes that the total number of elements is known in
# the beginning, across all of the stages. For each stage, I want to
# show the overall progress as a percentage, and give some feedback
# about the progress of each stage.


class MultiStageReporter(object):
    """Iterate over the containers of several stages, showing the overall progress.

    reporter -- the object whose update() method displays the status line
    num_rows -- the total number of elements, across all of the stages

    Call set_iter() to start each stage, then iterate over this object
    to get the elements of that stage.
    """

    def __init__(self, reporter, num_rows):
        self.reporter = reporter
        self.num_rows = num_rows
        self._it = None
        self._row_count = 0

    def _overall_percent(self):
        "Return the percentage of all rows returned so far"
        if not self.num_rows:
            # With no rows there is nothing to divide by. Report 0%
            # instead of raising ZeroDivisionError.
            return 0.0
        return 100.0 * self._row_count / self.num_rows

    def set_iter(self, template, container):
        """A string template (must have the '%' terms in the right order) and the container to iterate over

        This must be called to start each stage. The container must
        support len().
        """
        self.template = template
        self._it = enumerate(container)  # enumerate() so I can track progress for the stage
        self._n = len(container)
        msg = self.template % (self._overall_percent(), 0, self._n)
        self.reporter.update(msg)
        self._prev_time = time.time()

    def __iter__(self):
        return self

    def __next__(self):
        try:
            i, value = next(self._it)
        except StopIteration:
            self.reporter.update("")  # Reset at the end of the stage.
            raise
        self._row_count += 1  # Global count

        now = time.time()
        if now - self._prev_time > 0.5:
            # Show the progress. Template '%' terms must be: overall percentage,
            # element number in the stage, total number of elements in the stage.
            self.reporter.update(self.template % (self._overall_percent(), i, self._n))
            self._prev_time = now

        return value

    next = __next__
206 | 
207 | 
208 | # the 'no_explain' function. I needed somewhere to put it, and this seemed okay.
209 | 
210 | 
def no_explain(msg, *args):
    """Accept a message and any extra arguments, and ignore them."""
    return None
213 | 


--------------------------------------------------------------------------------
/mmpdblib/rgroup2smarts.py:
--------------------------------------------------------------------------------
  1 | # mmpdb - matched molecular pair database generation and analysis
  2 | #
  3 | # Copyright (c) 2019, Andrew Dalke Scientific, AB
  4 | #
  5 | # Redistribution and use in source and binary forms, with or without
  6 | # modification, are permitted provided that the following conditions are
  7 | # met:
  8 | #
  9 | #    * Redistributions of source code must retain the above copyright
 10 | #      notice, this list of conditions and the following disclaimer.
 11 | #    * Redistributions in binary form must reproduce the above
 12 | #      copyright notice, this list of conditions and the following
 13 | #      disclaimer in the documentation and/or other materials provided
 14 | #      with the distribution.
 15 | #    * Neither the name of Andrew Dalke Scientific. nor the names of
 16 | #      its contributors may be used to endorse or promote products
 17 | #      derived from this software without specific prior written
 18 | #      permission.
 19 | #
 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 21 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 22 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 23 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 24 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 25 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 26 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 27 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 28 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 29 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 31 | #
 32 | 
 33 | # Convert a R-group SMILES into a SMARTS pattern.
 34 | # Must be rooted at "*", with one single bond.
 35 | 
 36 | # The algorithm is:
 37 | #  - figure out the valence and hydrogen count for each atom
 38 | #  - convert the molecule into an isomeric SMILES, with
 39 | #      hydrogens on the atom terms
 40 | #  - convert the SMILES into a SMARTS:
 41 | #     - use a regexp to find the atom terms
 42 | #     - append the valence (and "H0" if there are no hydrogens)
 43 | 
 44 | # For example:
 45 | #   *-Cl -> *-!@[ClH0v1]
 46 | #   *-O  -> *-!@[OHv2]
 47 | 
 48 | import re
 49 | from rdkit import Chem
 50 | 
 51 | from . import fileio
 52 | 
 53 | # Match the atom terms. These are all in brackets.
 54 | _atom_term = re.compile(r"\[([^]]+)\]")
 55 | 
 56 | 
 57 | def rgroup_mol_to_smarts(mol):
 58 |     """mol -> SMARTS matching the R-group
 59 | 
 60 |     'mol' contain a single wildcard atom ("*") with one
 61 |     single bond to the rest of the R-group.
 62 | 
 63 |     The molecule will be modified. Pass in a copy if
 64 |     you do not want the modification.
 65 |     """
 66 | 
 67 |     wildcard_idx = -1
 68 |     suffixes = []
 69 | 
 70 |     n = len(Chem.GetMolFrags(mol))
 71 |     if n > 1:
 72 |         raise ValueError("more than one fragment found")
 73 |     # Could also check for n == 0. Decided to leave that
 74 |     # for the check for no wildcard atom.
 75 | 
 76 |     for idx, atom in enumerate(mol.GetAtoms()):
 77 |         if atom.HasProp("molAtomMapNumber"):
 78 |             atom_map = atom.GetProp("molAtomMapNumber")
 79 |             raise ValueError(f"atom maps are not supported (atom {idx} has atom map {atom_map!r})")
 80 | 
 81 |         # Figure out which atom is the wildcard
 82 |         atomno = atom.GetAtomicNum()
 83 |         if atomno == 0:
 84 |             if wildcard_idx == -1:
 85 |                 wildcard_idx = idx
 86 |             else:
 87 |                 raise ValueError("more than one wildcard atom")
 88 |             bonds = list(atom.GetBonds())
 89 |             if not bonds:
 90 |                 raise ValueError("wildcard atom not bonded to anything")
 91 |             if len(bonds) != 1:
 92 |                 raise ValueError("wildcard atom must only have one bond")
 93 |             if bonds[0].GetBondType() != 1:
 94 |                 raise ValueError("wildcard atom not bonded via a single bond")
 95 | 
 96 |             if atom.GetTotalNumHs():
 97 |                 raise ValueError("wildcard atom must not have implicit hydrogens")
 98 |             if atom.GetFormalCharge():
 99 |                 raise ValueError("wildcard atom must be uncharged")
100 |             # I could check for the chiral flag, but RDKit won't use it.
101 | 
102 |         # The suffix is after the atom term.
103 |         v = atom.GetTotalValence()
104 |         suffix = "v" + str(v)
105 | 
106 |         # If an atom has no hydrogens then MolToSmiles(allHsExplicit=True)
107 |         # will not include an H term. When used as a SMARTS, this means
108 |         # the atom term will match anything, when I want to to match nothing.
109 |         # I need to also include an H0 in the SMARTS.
110 |         if not atom.GetTotalNumHs():
111 |             suffix = "H0" + suffix
112 |         suffixes.append(suffix)
113 | 
114 |     if wildcard_idx == -1:
115 |         raise ValueError("no wildcard atom found")
116 | 
117 |     # allHsExplicit=True so all of the atom terms are in [brackets].
118 |     # allBondsExplicit=True to distinguish between aromatic and single bonds
119 |     #    (which SMILES handles as an implicit bond)
120 |     converted_smi = Chem.MolToSmiles(
121 |         mol,
122 |         allBondsExplicit=True,
123 |         allHsExplicit=True,
124 |         rootedAtAtom=wildcard_idx,
125 |     )
126 | 
127 |     # Figure out how to get from the canonical Kekule SMILES output order
128 |     # back to the moleule order.
129 |     # The MolToSmiles() set the '_smilesAtomOutputOrder'property, which
130 |     # is a string like '[0,1,2,3,4,5,6,7,8,13,9,10,11,12,]' which maps
131 |     # between the atom order in the molecule and the output order
132 |     output_order_str = mol.GetProp("_smilesAtomOutputOrder")
133 |     assert output_order_str[0] == "[" and output_order_str[-1:] == "]", output_order_str
134 |     output_order_map = map(int, output_order_str[1:-1].split(","))
135 |     invert_order = dict(enumerate(output_order_map))
136 | 
137 |     ## print(converted_smi)
138 |     ## print(suffixes)
139 | 
140 |     # Use a regular expression to
141 | 
142 |     # This emulates 'nonlocal' support so I can get
143 |     # the position of the re.sub() index even under
144 |     # Python 2.7.
145 |     sub_index_nonlocal = [0]
146 | 
147 |     def rename_atoms(m):
148 |         sub_index = sub_index_nonlocal[0]
149 |         sub_index_nonlocal[0] = sub_index + 1
150 | 
151 |         original_index = invert_order[sub_index]
152 |         suffix = suffixes[original_index]
153 |         g = m.group(1)
154 |         return "[" + g + suffix + "]"
155 | 
156 |     output_smi = _atom_term.sub(rename_atoms, converted_smi)
157 |     assert output_smi.startswith("[*H0v1]-"), output_smi
158 |     return "*-!@" + output_smi[8:]
159 | 
160 | 
class ParseError(ValueError):
    """An input line could not be parsed.

    'msg' describes the problem and 'location' is an object with a
    where() method. The where() text is captured when the exception is
    created, so str() reports the position at the time of the error.
    """

    def __init__(self, msg, location):
        super().__init__(msg, location)
        self.msg = msg
        self.location = location
        self._where = location.where()

    def __str__(self):
        return "{} at {}".format(self.msg, self._where)
170 | 
171 | 
class ConversionError(ValueError):
    """A record could not be converted.

    'msg' describes the problem, 'location' is an object with a where()
    method, and 'extra' is optional additional detail which str() adds
    after the location. The where() text is captured when the exception
    is created.
    """

    def __init__(self, msg, location, extra=None):
        super().__init__(msg, location, extra)
        self.msg = msg
        self.location = location
        self.extra = extra
        self._where = location.where()

    def __str__(self):
        text = "{} at {}".format(self.msg, self._where)
        if self.extra is not None:
            text = "{}: {}".format(text, self.extra)
        return text
185 | 
186 | 
class Record(object):
    """An R-group record: a SMILES string and an optional identifier."""

    def __init__(self, smiles, id=None):
        self.smiles = smiles
        self.id = id  # None when the input has no identifier

    def __repr__(self):
        # This must be an f-string, otherwise the template text itself is returned.
        return f"Record({self.smiles!r}, id={self.id!r})"
194 | 
195 | 
def parse_rgroup_file(infile, location=None):
    """Iterate over the R-group records in a line-oriented SMILES file.

    Each line must start with a SMILES. Any text after the first run of
    whitespace is used as the record id, otherwise the id is None.

    infile - an iterable of lines
    location - the location object used to track the position in the
        file. If None, a FileLocation is created using infile's "name"
        attribute, or "<unknown>" if it has none.

    Yields Record instances.

    Raises ParseError for a blank line or a line which starts with
    whitespace.
    """
    if location is None:
        location = FileLocation(getattr(infile, "name", "<unknown>"))
    lineno = 0

    # There is one record per line, so the record number and the line
    # number are the same value.
    def get_lineno():
        return lineno

    location.register(get_recno=get_lineno, get_lineno=get_lineno)

    try:
        for lineno, line in enumerate(infile, 1):
            if line == "\n":
                raise ParseError("no SMILES found", location)

            if line[:1] in "\r\v\t ":
                raise ParseError("expected SMILES at start of line", location)

            terms = line.split(None, 1)
            if not terms:
                raise ParseError("no SMILES found", location)
            elif len(terms) == 1:
                rec = Record(terms[0], None)
            else:
                # Only the line terminator is removed from the id.
                rec = Record(terms[0], terms[1].rstrip("\n\r"))
            yield rec
    finally:
        # Save the same record number that get_recno() reported during
        # iteration (the original code always saved 0).
        location.save(recno=lineno, lineno=lineno)
228 | 
229 | 
230 | ######
231 | 
232 | 
class FileLocation(fileio.Location):
    """Location which describes a position by filename and line number."""

    def where(self):
        """Return the filename and line number, plus the record id when it is set."""
        parts = [f"{self.filename!r}, line {self.lineno}"]
        if self.record_id is not None:
            parts.append(f"record {self.record_id!r}")
        return ", ".join(parts)
239 | 
240 | 
class ListLocation(fileio.Location):
    """Location which describes a position by source name and record number."""

    def where(self):
        """Return the source name and record number, plus the record id when it is set."""
        # NOTE(review): the original used self.source when the record id
        # was set and self.filename otherwise. This now uses self.filename
        # in both cases, like FileLocation - confirm that fileio.Location
        # has no separate 'source' attribute.
        msg = f"{self.filename} #{self.recno}"
        if self.record_id is not None:
            msg += f", record {self.record_id!r}"
        return msg
247 | 
248 | 
def iter_smiles_list(smiles_iter, location):
    """Yield a Record (with no id) for each SMILES string in 'smiles_iter'.

    The record numbers start at the value of location.recno when the
    iteration starts. The current record number is available through the
    location during iteration, and is saved to the location at the end.
    """
    current = location.recno

    location.register(get_recno=lambda: current)
    try:
        for current, smiles_text in enumerate(smiles_iter, current):
            yield Record(smiles_text, None)
    finally:
        location.save(recno=current)
261 | 
262 | 
def iter_smiles_as_smarts(record_reader, location, explain=None, all_mols=None):
    """Convert each R-group record to a SMARTS pattern, and yield the SMARTS strings.

    record_reader - an iterable of records with 'smiles' and 'id' attributes
    location - the location object used for error reporting. Its
        'record_id' attribute is set to the id of the current record.
    explain - an optional callable which is passed progress messages.
        If None, the messages are ignored.
    all_mols - an optional list. If not None then each SMARTS is checked
        to see if it matches its own input molecule, and the tuple
        (mol, location.where(), smiles) is appended to the list.

    Raises ConversionError if the SMILES cannot be parsed or converted,
    or if the self-check fails.
    """
    if explain is None:

        def explain(msg, *args):
            pass

    for record in record_reader:
        location.record_id = record.id
        smiles = record.smiles
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ConversionError(f"Cannot parse SMILES ({smiles!r})", location)
        try:
            smarts = rgroup_mol_to_smarts(mol)
        except ValueError as err:
            raise ConversionError(f"Cannot convert SMILES ({smiles!r})", location, str(err)) from err

        explain(f"#{location.recno}: converted SMILES {smiles!r} to SMARTS {smarts!r}")
        if all_mols is not None:
            # Self-check: the SMARTS must be valid and must match the
            # molecule it was made from.
            pat = Chem.MolFromSmarts(smarts)
            if pat is None:
                raise ConversionError(
                    f"SMARTS failure for {smarts!r}",
                    location,
                    f"using SMILES {smiles!r}",
                )

            if not mol.HasSubstructMatch(pat):
                raise ConversionError(
                    f"SMARTS {smarts!r} does not match SMILES {smiles!r}",
                    location,
                )
            all_mols.append((mol, location.where(), smiles))
            explain(f"#{location.recno} passed the self-check")

        yield smarts
299 | 
300 | 
def make_recursive_smarts(smarts_list):
    """Merge R-group SMARTS patterns into a single recursive SMARTS.

    Each pattern in 'smarts_list' must start with "*-!@". The part after
    that prefix is wrapped in "$(...)", and the terms are joined with ","
    inside a single atom term: "*-!@[$(...),$(...)]".

    Raises ValueError if a pattern does not start with "*-!@".
    """
    prefix = "*-!@"
    terms = []
    for smarts in smarts_list:
        if not smarts.startswith(prefix):
            raise ValueError(f"invalid prefix: {smarts!r}")

        terms.append("$(" + smarts[len(prefix):] + ")")
    return prefix + "[" + ",".join(terms) + "]"
309 | 
310 | 
def get_recursive_smarts_from_cut_rgroups(rgroups, source="rgroup", offset=0):
    """Return the recursive SMARTS for an iterable of R-group SMILES strings.

    'source' is the name given to the ListLocation used for error
    reporting, and 'offset' is the record number saved to that location
    before reading the R-groups.
    """
    loc = ListLocation(source)
    loc.save(recno=offset)
    return make_recursive_smarts(
        iter_smiles_as_smarts(iter_smiles_list(rgroups, loc), loc)
    )
317 | 
318 | 
def get_recursive_smarts_from_cut_filename(filename):
    """Return the recursive SMARTS for the R-group SMILES in the named file."""
    loc = FileLocation(filename)
    with open(filename) as rgroup_file:
        # The records are read lazily, so the SMARTS must be built
        # while the file is still open.
        records = parse_rgroup_file(rgroup_file, loc)
        return make_recursive_smarts(iter_smiles_as_smarts(records, loc))
325 | 


--------------------------------------------------------------------------------
/mmpdblib/schema.sql:
--------------------------------------------------------------------------------
  1 | -- mmpdb - matched molecular pair database generation and analysis
  2 | --
  3 | -- Copyright (c) 2015-2017, F. Hoffmann-La Roche Ltd.
  4 | --
  5 | -- Redistribution and use in source and binary forms, with or without
  6 | -- modification, are permitted provided that the following conditions are
  7 | -- met:
  8 | --
  9 | --    * Redistributions of source code must retain the above copyright
 10 | --      notice, this list of conditions and the following disclaimer.
 11 | --    * Redistributions in binary form must reproduce the above
 12 | --      copyright notice, this list of conditions and the following
 13 | --      disclaimer in the documentation and/or other materials provided
 14 | --      with the distribution.
 15 | --    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
 16 | --      its contributors may be used to endorse or promote products
 17 | --      derived from this software without specific prior written
 18 | --      permission.
 19 | --
 20 | -- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 21 | -- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 22 | -- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 23 | -- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 24 | -- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 25 | -- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 26 | -- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 27 | -- DATA, OR PROFITS, OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 28 | -- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 29 | -- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 30 | -- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 31 | --
 32 | -- END OF LICENSE
 33 | 
 34 | -- Create the tables for the mmpdb matched molecular pair database.
 35 | 
 36 | -- This schema is meant to support SQLite, MySQL, and Postgres, but those
 37 | -- databases use slightly different SQL. This schema definition
 38 | -- contains three terms which must be replaced by text substitution to
 39 | -- get the correct database-specific SQL. These are:
 40 | 
 41 | --   $PRIMARY_KEY$ - specifies the primary key type
 42 | --     SQLite:   "INTEGER PRIMARY KEY"
 43 | --     MySQL:    "PRIMARY KEY"
 44 | --     Postgres: "SERIAL PRIMARY KEY"
 45 | 
 46 | --   $COLLATE$ - specifies a case-sensitive collation order
 47 | --     SQLite:   ""
 48 | --     MySQL:    "COLLATE latin1_bin"
 49 | --     Postgres: 'COLLATE "C"'
 50 | 
 51 | --   $DATETIME$ - specifies the column type used for date/time values
 52 | --     SQLite:   "DATETIME"
 53 | --     MySQL:    "DATETIME"
 54 | --     Postgres: "TIMESTAMP"
 55 | 
 56 | -- In addition, to simplify processing, this file must only use
 57 | -- semicolons at the end of a SQL statement.
 58 | 
-- Information about the data set as a whole.
-- There's only one row in this table
CREATE TABLE dataset (
  id $PRIMARY_KEY$,
  mmpdb_version INTEGER NOT NULL,  -- NOTE(review): presumably the version of the mmpdb database format - confirm
  title VARCHAR(255) NOT NULL $COLLATE$,    -- human-visible label
  creation_date $DATETIME$ NOT NULL,
  fragment_options VARCHAR(2000) NOT NULL $COLLATE$, -- the JSON-encoded options used to fragment the dataset
  index_options VARCHAR(2000) NOT NULL $COLLATE$, -- the JSON-encoded options used to index the dataset
  is_symmetric INTEGER NOT NULL,  -- NOTE(review): looks like a 0/1 flag set from the indexing options - confirm
  
  -- The remaining columns may be NULL.
  -- NOTE(review): presumably these are cached row counts for the other tables - confirm
  num_compounds INTEGER, -- used for "list"
  num_rules INTEGER,
  num_pairs INTEGER,
  num_rule_environments INTEGER,
  num_rule_environment_stats INTEGER
);
 75 | 
 76 | 
 77 | -- Each used input structure gets its own 'compound'
 78 | -- This table might not exist for rule-only data sets.
 79 | 
CREATE TABLE compound (
  id $PRIMARY_KEY$,  -- internal id, referenced by compound_property and pair
  public_id VARCHAR(255) NOT NULL $COLLATE$ UNIQUE,    -- the public compound id (should this have a better name?)
  input_smiles VARCHAR(255) NOT NULL $COLLATE$, -- the input SMILES, before salt removal
  clean_smiles VARCHAR(255) NOT NULL $COLLATE$, -- the SMILES after salt removal
  clean_num_heavies INTEGER NOT NULL  -- the number of heavies in the cleaned SMILES (needed?)
  );
 87 | 
 88 | 
 89 | -- Normalized property names (eg, "IC50" might be mapped to 3).
 90 | 
CREATE TABLE property_name (
  id $PRIMARY_KEY$,  -- referenced by compound_property and rule_environment_statistics
  name VARCHAR(255) NOT NULL $COLLATE$ UNIQUE  -- the property name, which must be unique
  );
 95 | 
 96 | 
 97 | -- Properties for each input compound
 98 | 
CREATE TABLE compound_property (
  id $PRIMARY_KEY$,
  compound_id INTEGER NOT NULL,  -- the compound which has this property value
  property_name_id INTEGER NOT NULL,  -- which property this is
  value REAL NOT NULL,  -- the property value for the compound
  -- Unlike the other tables, the foreign keys are declared as table constraints.
  FOREIGN KEY (compound_id) REFERENCES compound (id),
  FOREIGN KEY (property_name_id) REFERENCES property_name (id)
  );
107 | 
108 | 
109 | -- Normalized SMILES for the LHS or RHS of the rule transformation SMILES.
CREATE TABLE rule_smiles (
  id $PRIMARY_KEY$,  -- referenced by rule.from_smiles_id and rule.to_smiles_id
  smiles VARCHAR(255) NOT NULL $COLLATE$ UNIQUE,  -- each distinct SMILES is stored only once
  num_heavies INTEGER  -- NOTE(review): presumably the number of heavy atoms in the SMILES - confirm (may be NULL)
);
115 | 
116 | 
117 | 
-- A matched molecular pair rule

CREATE TABLE rule (
  id $PRIMARY_KEY$,  -- referenced by rule_environment.rule_id

  -- The SMIRKS/transformation SMILES for this rule is:
  --   rule_smiles[id = from_smiles_id].smiles + ">>" +
  --   rule_smiles[id = to_smiles_id].smiles
  from_smiles_id INTEGER NOT NULL REFERENCES rule_smiles(id),
  to_smiles_id INTEGER NOT NULL REFERENCES rule_smiles(id)
  );
129 | 
130 | 
131 | -- Table with normalized SMARTS for the rule_environment.
132 | 
133 | -- These are based on the RDKit Morgan (ECFP-like) circular
134 | -- fingerprints but using canonical SMARTS. The "pseudo-SMILES"
135 | -- is a SMILES-like string which tries to represent the SMARTS
136 | -- as a depictable SMILES string. It is *not* SMILES because:
137 | --
138 | -- 1. ring atoms which may be either aromatic or aliphatic are
139 | --   represented with [#7], [#8], etc. instead of a symbol.
140 | -- 2. "(~*)" terms are added to show the number of additional
141 | --   connections. The bond type isn't known. Note: these
142 | --   represent additional bonds, not additional atoms. For
143 | --   example, two (~*) terms may represent bonds going to
144 | --   the same atom.
145 | 
146 | -- In my testing I found a 640-byte SMARTS and a 241 byte pseudo-SMILES.
147 | 
148 | 
149 | -- Fingerprints are based on the RDKit Morgan (ECFP-like) circular
150 | -- fingerprints, interpreted as a canonical SMARTS pattern.
151 | 
CREATE TABLE environment_fingerprint (
 id $PRIMARY_KEY$,  -- referenced by rule_environment.environment_fingerprint_id
 smarts VARCHAR(1024) $COLLATE$ NOT NULL,  -- the environment as a SMARTS string
 pseudosmiles VARCHAR(400) $COLLATE$  NOT NULL,  -- the environment as a SMILES-like string
 parent_smarts VARCHAR(1024) $COLLATE$ NOT NULL -- the parent SMARTS to this environment
        -- (it's the empty string "" when there is no parent)
 );
159 | 
160 | 
-- A rule can have multiple rule environments, one per radius.

CREATE TABLE rule_environment (
 id $PRIMARY_KEY$,  -- referenced by pair and rule_environment_statistics
 rule_id INTEGER REFERENCES rule(id),  -- the rule this environment belongs to
 environment_fingerprint_id INTEGER REFERENCES environment_fingerprint(id),  -- the environment, as a normalized SMARTS
 radius INTEGER,  -- the radius of this environment
 num_pairs INTEGER
 -- (the rule_env..._statistics "count" is the number of pairs with a given property)
 );
171 | 
172 | 
173 | -- The pairs that belong to a rule_environment
174 | 
175 | -- The "constant_part" (also called the context) is the substructure
176 | -- which remains constant in the transformation. It typically has one
177 | -- or more R-groups. It is omitted from the transformation A>>B.
178 | 
CREATE TABLE constant_smiles (
  id $PRIMARY_KEY$,  -- referenced by pair.constant_id
  smiles VARCHAR(255)  -- the constant part as a SMILES (this column has no NOT NULL or UNIQUE constraint)
  );
183 | 
184 | 
CREATE TABLE pair (
  id $PRIMARY_KEY$,
  rule_environment_id INTEGER REFERENCES rule_environment (id) NOT NULL,  -- the rule environment this pair belongs to
  -- The two compounds in the pair.
  -- NOTE(review): presumably compound1 is the "from" side of the rule and compound2 the "to" side - confirm
  compound1_id INTEGER NOT NULL REFERENCES compound (id),
  compound2_id INTEGER NOT NULL REFERENCES compound (id),
  constant_id INTEGER REFERENCES constant_smiles(id)  -- the constant part of the pair (may be NULL)
  );
192 | 
193 | 
194 | -- The aggregate property deltas for each rule environment
195 | 
-- There is one row for a rule environment and a property.
-- NOTE(review): the statistics are presumably computed over the property
-- differences between the two compounds of each pair - confirm
CREATE TABLE rule_environment_statistics (
  id $PRIMARY_KEY$,
  rule_environment_id INTEGER REFERENCES rule_environment (id),
  property_name_id INTEGER NOT NULL REFERENCES property_name (id),
  count INTEGER NOT NULL,  -- the number of pairs with this property
  avg REAL NOT NULL,
  -- std, kurtosis and skewness may be NULL
  -- NOTE(review): presumably NULL when there are too few pairs to compute them - confirm
  std REAL,
  kurtosis REAL,
  skewness REAL,
  -- the five-number summary: minimum, first quartile, median, third quartile, maximum
  min REAL NOT NULL,
  q1 REAL NOT NULL,
  median REAL NOT NULL,
  q3 REAL NOT NULL,
  max REAL NOT NULL,
  -- paired_t and p_value may be NULL
  -- NOTE(review): presumably the paired t-test statistic and its p-value - confirm
  paired_t REAL,
  p_value REAL
  );
213 | 


--------------------------------------------------------------------------------
/mmpdblib/smarts_aliases.py:
--------------------------------------------------------------------------------
 1 | # mmpdb - matched molecular pair database generation and analysis
 2 | #
 3 | # Copyright (c) 2015-2017, F. Hoffmann-La Roche Ltd.
 4 | #
 5 | # Redistribution and use in source and binary forms, with or without
 6 | # modification, are permitted provided that the following conditions are
 7 | # met:
 8 | #
 9 | #    * Redistributions of source code must retain the above copyright
10 | #      notice, this list of conditions and the following disclaimer.
11 | #    * Redistributions in binary form must reproduce the above
12 | #      copyright notice, this list of conditions and the following
13 | #      disclaimer in the documentation and/or other materials provided
14 | #      with the distribution.
15 | #    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
16 | #      its contributors may be used to endorse or promote products
17 | #      derived from this software without specific prior written
18 | #      permission.
19 | #
20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | #
32 | 
33 | 
class CutSmarts(object):
    """A named alias for a cut SMARTS pattern, with a description for the help text."""

    def __init__(self, name, smarts, description):
        self.name, self.smarts, self.description = name, smarts, description
39 | 
40 | 
# The available aliases, in the order they are listed in the help text.
# Each row is (name, SMARTS, description).
cut_smarts_aliases = [
    CutSmarts(name, smarts, description)
    for name, smarts, description in (
        (
            "default",
            "[#6+0;!$(*=,#[!#6])]!@!=!#[!#0;!#1;!$([CH2]);!$([CH3][CH2])]",
            "Cut all C-[!H] non-ring single bonds except for Amides/Esters/Amidines/Sulfonamides "
            "and CH2-CH2 and CH2-CH3 bonds",
        ),
        (
            "cut_AlkylChains",
            "[#6+0;!$(*=,#[!#6])]!@!=!#[!#0;!#1]",
            "As default, but also cuts CH2-CH2 and CH2-CH3 bonds",
        ),
        (
            "cut_Amides",
            "[#6+0]!@!=!#[!#0;!#1;!$([CH2]);!$([CH3][CH2])]",
            "As default, but also cuts [O,N]=C-[O,N] single bonds",
        ),
        (
            "cut_all",
            "[#6+0]!@!=!#[!#0;!#1]",
            "Cuts all Carbon-[!H] single non-ring bonds. Use carefully, this will create a lot of cuts",
        ),
        (
            "exocyclic",
            "[R]!@!=!#[!#0;!#1]",
            "Cuts all exocyclic single bonds",
        ),
        (
            "exocyclic_NoMethyl",
            "[R]!@!=!#[!#0;!#1;!$([CH3])]",
            "Cuts all exocyclic single bonds apart from those connecting to CH3 groups",
        ),
    )
]


# Look up an alias by its name.
cut_smarts_aliases_by_name = {}

for alias in cut_smarts_aliases:
    cut_smarts_aliases_by_name[alias.name] = alias
76 | 
77 | 
def get_epilog(option_name, aliases):
    """Return help text which lists the aliases supported by the named option.

    Each alias must have 'name', 'description', and 'smarts' attributes.
    There are two lines of text for each alias, after a one-line header.
    The returned string ends with a newline.
    """
    header = "The " + option_name + " argument supports the following short-hand aliases:"
    parts = [header]
    for entry in aliases:
        parts.extend(
            (
                f"  '{entry.name!s}': {entry.description!s}",
                f"     smarts: {entry.smarts!s}",
            )
        )
    return "\n".join(parts) + "\n"
84 | 


--------------------------------------------------------------------------------
/mmpdblib/smiles_syntax.py:
--------------------------------------------------------------------------------
  1 | # mmpdb - matched molecular pair database generation and analysis
  2 | #
  3 | # Copyright (c) 2015-2017, F. Hoffmann-La Roche Ltd.
  4 | #
  5 | # Redistribution and use in source and binary forms, with or without
  6 | # modification, are permitted provided that the following conditions are
  7 | # met:
  8 | #
  9 | #    * Redistributions of source code must retain the above copyright
 10 | #      notice, this list of conditions and the following disclaimer.
 11 | #    * Redistributions in binary form must reproduce the above
 12 | #      copyright notice, this list of conditions and the following
 13 | #      disclaimer in the documentation and/or other materials provided
 14 | #      with the distribution.
 15 | #    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
 16 | #      its contributors may be used to endorse or promote products
 17 | #      derived from this software without specific prior written
 18 | #      permission.
 19 | #
 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 21 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 22 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 23 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 24 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 25 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 26 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 27 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 28 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 29 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 31 | #
 32 | 
 33 | import re
 34 | 
 35 | # Match a '*' in the different forms that might occur,
 36 | # including with directional single bonds inside of ()s.
 37 | _wildcard_regex = " |\n".join(
 38 |     re.escape(regex)
 39 |     for regex in (
 40 |         "*",
 41 |         "[*]",
 42 |         "(*)",
 43 |         "([*])",
 44 |         "(/*)",
 45 |         "(/[*])",
 46 |         "/*",
 47 |         "/[*]",
 48 |         "(\\*)",
 49 |         "(\\[*])",
 50 |         "\\*",
 51 |         "\\[*]",
 52 |     )
 53 | )
 54 | _wildcard_pattern = re.compile(_wildcard_regex, re.X)
 55 | 
 56 | # Match the SMILES for an atom, followed by its closures
 57 | _atom_pattern = re.compile(
 58 |     r"""
 59 | (
 60 |  Cl? |             # Cl and Br are part of the organic subset
 61 |  Br? |
 62 |  [NOSPFIbcnosp*] |  # as are these single-letter elements
 63 |  \[[^]]*\]         # everything else must be in []s
 64 | )
 65 | """,
 66 |     re.X,
 67 | )
 68 | 
 69 | 
 70 | def convert_wildcards_to_closures(smiles, offsets=None):
 71 |     # This is designed for RDKit's canonical SMILES output. It does
 72 |     # not handle all possible SMILES inputs.
 73 |     if offsets is None:
 74 |         # Use 0, 1, 2, ... up to the number of '*'s
 75 |         offsets = range(smiles.count("*"))
 76 |     closure_terms = []
 77 |     for offset in offsets:
 78 |         if not (0 <= offset <= 9):
 79 |             raise ValueError("offset %d out of range (must be from 0 to 9)" % (offset,))
 80 |         closure_terms.append("%%%02d" % (90 + offset))
 81 | 
 82 |     new_smiles = smiles
 83 |     while 1:
 84 |         # Find the first '*'. If none are left, stop.
 85 |         wildcard_match = _wildcard_pattern.search(new_smiles)
 86 |         if wildcard_match is None:
 87 |             break
 88 | 
 89 |         closure_term = closure_terms.pop(0)
 90 | 
 91 |         wildcard_start = wildcard_match.start()
 92 |         if wildcard_start == 0 or new_smiles[wildcard_start - 1] == ".":
 93 |             # At the start of the molecule or after a ".". Need to
 94 |             # put the closure after the second atom. Find the second
 95 |             # atom. Since we only ever break on single non-ring bonds,
 96 |             # and since the first atom is a terminal atom, the second
 97 |             # atom must either be immediately after the first atom, or
 98 |             # there is a directional bond between them.
 99 |             wildcard_end = wildcard_match.end()
100 |             second_atom_match = _atom_pattern.match(new_smiles, wildcard_end)
101 |             if second_atom_match is None:
102 |                 # There was no atom. Is it a "/" or "\"? If so,
103 |                 # we'll need to swap the direction when we move
104 |                 # to a closure after the second atom.
105 |                 bond_dir = new_smiles[wildcard_end : wildcard_end + 1]
106 |                 if bond_dir == "/":
107 |                     direction = "\\"
108 |                 elif bond_dir == "\\":
109 |                     direction = "/"
110 |                 else:
111 |                     raise AssertionError(new_smiles)
112 |                 # Look for the second atom, which must exist
113 |                 second_atom_match = _atom_pattern.match(new_smiles, wildcard_end + 1)
114 |                 if second_atom_match is None:
115 |                     raise AssertionError((new_smiles, new_smiles[wildcard_end:]))
116 |             else:
117 |                 direction = ""
118 | 
119 |             second_atom_term = second_atom_match.group(1)
120 |             # I changed the bond configuration, so I may need to
121 |             # invert chirality of implicit chiral hydrogens.
122 |             if "@@H" in second_atom_term:
123 |                 second_atom_term = second_atom_term.replace("@@H", "@H")
124 |             elif "@H" in second_atom_term:
125 |                 second_atom_term = second_atom_term.replace("@H", "@@H")
126 | 
127 |             # Reassemble the string with the wildcard term deleted and
128 |             # the new closure inserted directly after the second atom
129 |             # (and before any of its closures).
130 |             new_smiles = (
131 |                 new_smiles[:wildcard_start]
132 |                 + second_atom_term
133 |                 + direction
134 |                 + closure_term
135 |                 + new_smiles[second_atom_match.end() :]
136 |             )
137 | 
138 |         else:
139 |             # The match is somewhere inside of a molecule, so we attach
140 |             # assign the closure to the atom it's bonded to on the left
141 |             c = new_smiles[wildcard_start - 1]
142 |             if c == "(" or c == ")":
143 |                 # In principle, this could be something like "CCC(F)(Cl)[*]",
144 |                 # where I would need to count the number of groups back to
145 |                 # the main atom, and flip chirality accordingly. Thankfully,
146 |                 # RDKit always puts the "[*]" terms immediately after the
147 |                 # preceeding atom, so I don't need to worry.
148 |                 raise NotImplementedError(
149 |                     "intermediate groups not supported",
150 |                     new_smiles,
151 |                     new_smiles[wildcard_start - 1 :],
152 |                 )
153 | 
154 |             elif c in "CNcnOS]Pos0123456789ABDEFGHIJKLMQRTUVWXYZabdefghijklmpqrtuvwxyz":
155 |                 # Double-check the the previous character looks like part of an atom.
156 |                 wildcard_term = wildcard_match.group()
157 |                 # Preserve the direction, if present
158 |                 if "/" in wildcard_term:
159 |                     direction = "/"
160 |                 elif "\\" in wildcard_term:
161 |                     direction = "\\"
162 |                 else:
163 |                     direction = ""
164 |                 new_smiles = new_smiles[:wildcard_start] + direction + closure_term + new_smiles[wildcard_match.end() :]
165 | 
166 |             else:
167 |                 raise AssertionError((new_smiles, c, new_smiles[wildcard_start - 1 :]))
168 | 
169 |     return new_smiles
170 | 
171 | 
172 | ##### Same thing, for labeled wildcards
173 | 
# A labeled wildcard, "*:1", "*:2" or "*:3"; group 1 is the label digit.
_labeled_wildcard_pattern = re.compile(r"\*:([123])")


def convert_labeled_wildcards_to_closures(smiles):
    """Convert labeled wildcards ("*:1", "*:2", "*:3") to closure terms.

    Each label is stripped, leaving a plain "*", and the label values are
    passed, in order of appearance, as the offsets for
    convert_wildcards_to_closures(). Returns that function's result.
    """
    labels = [int(found.group(1)) for found in _labeled_wildcard_pattern.finditer(smiles)]
    unlabeled_smiles = _labeled_wildcard_pattern.sub("*", smiles)
    return convert_wildcards_to_closures(unlabeled_smiles, labels)
187 | 
188 | 
if __name__ == "__main__":
    # Quick manual demonstration: convert a few example SMILES strings,
    # using closure offset 0 for every wildcard, and print the results.
    for smiles in ("*C", "*/CO.*CN", "C*.C(*)N"):
        offsets = (0,) * smiles.count("*")
        converted = convert_wildcards_to_closures(smiles, offsets)
        print(smiles, converted)
192 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["hatchling"]
 3 | build-backend = "hatchling.build"
 4 | 
 5 | [project]
 6 | name = "mmpdb"
 7 | dynamic = ["version"]
 8 | description = "A package to identify matched molecular pairs and use them to predict property changes"
 9 | readme = "README.md"
10 | license-files = ["LICENSE.txt"]
11 | requires-python = ">=3.10"
12 | authors = [
13 |   {name = "Andrew Dalke", email = "dalke@dalkescientific.com"},
14 | ]
15 | maintainers = [
16 |   {name = "Jerome (chem-bio)", email = "rdkit-discuss@lists.sourceforge.net"},
17 |   {name = "Christian Kramer", email = "rdkit-discuss@lists.sourceforge.net"},
18 | ]
19 | keywords = [
20 |   "MMP",
21 |   "cheminformatics",
22 |   "matched molecular pair",
23 | ]
24 | classifiers = [
25 |   "Development Status :: 5 - Production/Stable",
26 |   "Environment :: Console",
27 |   "Intended Audience :: Science/Research",
28 |   "Operating System :: Unix",
29 |   "Programming Language :: Python :: 3",
30 |   "Topic :: Scientific/Engineering :: Chemistry",
31 | ]
32 | dependencies = [
33 |   "click",
34 |   "peewee >= 3.0",
35 |   "rdkit >= 2024.3",
36 |   "scipy",
37 | ]
38 | 
39 | [project.scripts]
40 | mmpdb = "mmpdblib.cli:main"
41 | 
42 | [project.urls]
43 | Homepage = "https://github.com/rdkit/mmpdb"
44 | 
45 | [tool.hatch.version]
46 | path = "mmpdblib/__init__.py"
47 | 
48 | [tool.hatch.build]
49 | packages = ["mmpdblib"]
50 | 
51 | [tool.black]
52 | line-length = 120
53 | exclude = '''
54 | /(
55 |     \.git
56 |   | \.hg
57 |   | \.mypy_cache
58 |   | \.tox
59 |   | \.nox
60 |   | \.venv
61 |   | _build
62 |   | build
63 |   | dist
64 | )/
65 | '''
66 | 
67 | 


--------------------------------------------------------------------------------
/tests/cached.fragdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdkit/mmpdb/3ca81ae2a3e3192607cf020e376e6c0adecb2f2e/tests/cached.fragdb


--------------------------------------------------------------------------------
/tests/comma.smi:
--------------------------------------------------------------------------------
1 | Oc1ccccc1O,record 1
2 | Nc1ccccc1C,entry,2
3 | Nc1cc(S)ccc1C,item 3,extra
4 | 


--------------------------------------------------------------------------------
/tests/space.smi:
--------------------------------------------------------------------------------
1 | Oc1ccccc1O record 1
2 | Nc1ccccc1C entry 2
3 | Nc1cc(S)ccc1C item 3
4 | 


--------------------------------------------------------------------------------
/tests/space.smi.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdkit/mmpdb/3ca81ae2a3e3192607cf020e376e6c0adecb2f2e/tests/space.smi.gz


--------------------------------------------------------------------------------
/tests/support.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | 
 3 | import sys
 4 | import os
 5 | import tempfile
 6 | import shutil
 7 | from click.testing import CliRunner
 8 | from mmpdblib import cli
 9 | 
10 | 
def _new_runner():
    """Return a CliRunner whose result exposes stderr separately from stdout.

    Older Click releases need ``mix_stderr=False`` for ``result.stderr`` to
    be available. Click 8.2 removed that argument (stderr is always
    captured separately there), so passing it raises TypeError; in that
    case fall back to the default constructor.
    """
    try:
        return CliRunner(mix_stderr=False)
    except TypeError:
        return CliRunner()


def expect_pass(args, input=None):
    """Run the mmpdb command line with `args` and require a zero exit code.

    `input`, if given, is passed through to CliRunner.invoke() as the
    command's input. Returns the click test result object on success.

    Raises AssertionError if the exit code is non-zero. If the result
    carries exception info, its traceback is printed first.
    """
    result = _new_runner().invoke(cli.main, args, input=input)
    if not result.exit_code:
        return result

    import shlex

    args_msg = " ".join(shlex.quote(word) for word in args)
    if result.exc_info:
        import traceback

        traceback.print_exception(*result.exc_info)
    raise AssertionError(f"SystemExit trying to run '{args_msg}': {result.exit_code}: {result.stderr}")


def expect_fail(args, input=None):
    """Run the mmpdb command line with `args` and require a non-zero exit code.

    `input`, if given, is passed through to CliRunner.invoke() as the
    command's input. Returns the click test result object.

    Raises AssertionError if the command exits with code 0.
    """
    result = _new_runner().invoke(cli.main, args, input=input)
    if not result.exit_code:
        raise AssertionError(f"Should have failed: {args!r}")
    return result
32 | 
33 | 
def get_filename(filename):
    """Return `filename` joined onto the directory containing this module."""
    module_dir = os.path.dirname(__file__)
    return os.path.join(module_dir, filename)
36 | 
37 | 
def create_test_filename(test_case, filename):
    """Return a path for `filename` inside a fresh temporary directory.

    A new directory with the prefix "mmpdb_test_" is created, and its
    removal (via shutil.rmtree) is registered with `test_case.addCleanup`.
    """
    scratch_dir = tempfile.mkdtemp(prefix="mmpdb_test_")
    test_case.addCleanup(shutil.rmtree, scratch_dir)
    return os.path.join(scratch_dir, filename)
42 | 


--------------------------------------------------------------------------------
/tests/tab.smi:
--------------------------------------------------------------------------------
1 | Oc1ccccc1O	record 1
2 | Nc1ccccc1C	entry 2
3 | 


--------------------------------------------------------------------------------
/tests/test_data.csv:
--------------------------------------------------------------------------------
 1 | ID	MW	MP
 2 | phenol	94.1	41
 3 | catechol	110.1	105
 4 | 2-aminophenol	109.1	174
 5 | 2-chlorophenol	128.6	8
 6 | o-phenylenediamine	108.1	102
 7 | amidol	124.1	*
 8 | hydroxyquinol	126.1	140
 9 | phenylamine	93.1	-6
10 | cyclopentanol	86.1	-19
11 | 


--------------------------------------------------------------------------------
/tests/test_data.fragdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdkit/mmpdb/3ca81ae2a3e3192607cf020e376e6c0adecb2f2e/tests/test_data.fragdb


--------------------------------------------------------------------------------
/tests/test_data.mmpdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdkit/mmpdb/3ca81ae2a3e3192607cf020e376e6c0adecb2f2e/tests/test_data.mmpdb


--------------------------------------------------------------------------------
/tests/test_data.smi:
--------------------------------------------------------------------------------
 1 | Oc1ccccc1 phenol
 2 | Oc1ccccc1O catechol
 3 | Oc1ccccc1N 2-aminophenol
 4 | Oc1ccccc1Cl 2-chlorophenol
 5 | Nc1ccccc1N o-phenylenediamine
 6 | Nc1cc(O)ccc1N amidol
 7 | Oc1cc(O)ccc1O hydroxyquinol
 8 | Nc1ccccc1 phenylamine
 9 | C1CCCC1N cyclopentanol
10 | 


--------------------------------------------------------------------------------
/tests/test_data_2019.mmpdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdkit/mmpdb/3ca81ae2a3e3192607cf020e376e6c0adecb2f2e/tests/test_data_2019.mmpdb


--------------------------------------------------------------------------------
/tests/test_index.py:
--------------------------------------------------------------------------------
  1 | # mmpdb - matched molecular pair database generation and analysis
  2 | #
  3 | # Copyright (c) 2015-2017, F. Hoffmann-La Roche Ltd.
  4 | #
  5 | # Redistribution and use in source and binary forms, with or without
  6 | # modification, are permitted provided that the following conditions are
  7 | # met:
  8 | #
  9 | #    * Redistributions of source code must retain the above copyright
 10 | #      notice, this list of conditions and the following disclaimer.
 11 | #    * Redistributions in binary form must reproduce the above
 12 | #      copyright notice, this list of conditions and the following
 13 | #      disclaimer in the documentation and/or other materials provided
 14 | #      with the distribution.
 15 | #    * Neither the name of F. Hoffmann-La Roche Ltd. nor the names of
 16 | #      its contributors may be used to endorse or promote products
 17 | #      derived from this software without specific prior written
 18 | #      permission.
 19 | #
 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 21 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 22 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 23 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 24 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 25 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 26 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 27 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 28 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 29 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 31 | #
 32 | 
 33 | import unittest
 34 | import json
 35 | 
 36 | from mmpdblib import dbutils
 37 | 
 38 | from support import get_filename, create_test_filename, expect_pass
 39 | 
 40 | TEST_DATA_FRAGDB = get_filename("test_data.fragdb")
 41 | TEST_DATA_CSV = get_filename("test_data.csv")
 42 | 
 43 | 
 44 | def index(mmpdb_filename, *args):
 45 |     args = ("--quiet", "index", TEST_DATA_FRAGDB, "-o", mmpdb_filename) + tuple(args)
 46 |     expect_pass(args)
 47 | 
 48 | 
 49 | class TestIndexCommandline(unittest.TestCase):
 50 |     def _get_options(self, *args):
 51 |         mmpdb_filename = create_test_filename(self, "default.mmpdb")
 52 |         index(mmpdb_filename, *args)
 53 |         db = dbutils.open_database(mmpdb_filename)
 54 |         dataset = db.get_dataset()
 55 |         return dataset, json.loads(dataset.index_options_str)
 56 | 
 57 |     def test_default_filters(self):
 58 |         dataset, options = self._get_options()
 59 |         self.assertEqual(
 60 |             options,
 61 |             {
 62 |                 "min_radius": 0,
 63 |                 "max_radius": 5,
 64 |                 "max_variable_heavies": 10,
 65 |                 "smallest_transformation_only": False,
 66 |                 "symmetric": False,
 67 |             },
 68 |         )
 69 |         self.assertEqual(dataset.get_num_rules(), 47)
 70 |         self.assertEqual(dataset.get_num_pairs(), 342)
 71 |         self.assertEqual(dataset.get_property_names(), [])
 72 | 
 73 |     def test_set_filters(self):
 74 |         dataset, options = self._get_options(
 75 |             "--min-variable-heavies",
 76 |             "1",
 77 |             "--max-variable-heavies",
 78 |             "29",
 79 |             "--min-variable-ratio",
 80 |             "0.1",
 81 |             "--max-variable-ratio",
 82 |             "0.99",
 83 |             "--max-heavies-transf",
 84 |             "25",
 85 |             "--symmetric",
 86 |             "--max-frac-trans",
 87 |             "3",
 88 |         )
 89 |         self.assertEqual(
 90 |             options,
 91 |             {
 92 |                 "symmetric": True,
 93 |                 "max_frac_trans": 3.0,
 94 |                 "max_heavies_transf": 25,
 95 |                 "min_radius": 0,
 96 |                 "max_radius": 5,
 97 |                 "max_variable_heavies": 29,
 98 |                 "max_variable_ratio": 0.99,
 99 |                 "min_variable_heavies": 1,
100 |                 "min_variable_ratio": 0.1,
101 |                 "smallest_transformation_only": False,
102 |             },
103 |         )
104 |         self.assertEqual(dataset.get_num_rules(), 2 * 47)  # because --symmetric
105 |         self.assertEqual(dataset.get_num_pairs(), 2 * 342)
106 |         self.assertEqual(dataset.get_property_names(), [])
107 | 
108 |     def test_max_variable_heavies_none(self):
109 |         dataset, options = self._get_options("--max-variable-heavies", "none")
110 |         self.assertEqual(
111 |             options,
112 |             {
113 |                 "min_radius": 0,
114 |                 "max_radius": 5,
115 |                 "symmetric": False,
116 |                 "smallest_transformation_only": False,
117 |             },
118 |         )
119 | 
120 |     def test_with_properties(self):
121 |         dataset, options = self._get_options("--properties", TEST_DATA_CSV, "--title", "test data")
122 |         self.assertEqual(dataset.title, "test data")
123 |         self.assertEqual(dataset.get_property_names(), ["MW", "MP"])
124 | 
125 | 
126 | if __name__ == "__main__":
127 |     unittest.main()
128 | 


--------------------------------------------------------------------------------
/tests/test_list.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | import re
  3 | import support
  4 | from support import get_filename
  5 | 
  6 | TEST_DATA_MMPDB = get_filename("test_data_2019.mmpdb")
  7 | 
  8 | def list_main(args):
  9 |     args = ("--quiet", "list") + tuple(args)
 10 |     return support.expect_pass(args)
 11 | 
 12 | 
 13 | def list_main_fail(args):
 14 |     args = ("--quiet", "list") + tuple(args)
 15 |     return support.expect_fail(args).stderr
 16 | 
 17 | 
# Whitespace-separated fields expected in the header line of the "list"
# output, after TestList._check_header() has collapsed the dash-padded
# "|--- Title ---|" column to "|Title|".
_header_fields = """
Name #cmpds #rules #pairs #envs  #stats |Title| Properties
""".split()

# Whitespace-separated fields expected in the data row for the test
# database. The row is compared against line.split() of the real output,
# so both sides are tokenized the same way (the title is split into
# several fields, as is the filename if it contains whitespace).
_expected_fields = f"""
{TEST_DATA_MMPDB}      9     47    342    321    533  MMPs from 'test_data.fragdb' MW MP
""".split()
 25 | 
 26 | 
 27 | class TestList(unittest.TestCase):
 28 |     def _check_header(self, line):
 29 |         fields = []
 30 |         # Normalize the '|--- Title ---|' section
 31 |         line = re.sub(r"\|-{1,} ", "|", line)
 32 |         line = re.sub(r" -{1,}\|", "|", line)
 33 |         for field in line.split():
 34 |             fields.append(field)
 35 | 
 36 |         self.assertEqual(fields, _header_fields)
 37 | 
 38 |     def _check_output(self, output):
 39 |         lines = output.splitlines()
 40 |         self._check_header(lines[0])
 41 |         n = 1
 42 |         if len(lines) > 1:
 43 |             n = 0 
 44 |             for line in lines[1:]:
 45 |                 if "test_data_2019.mmpdb" in line:
 46 |                     n += 1
 47 |                     self.assertEqual(line.split(), _expected_fields)
 48 |         return n
 49 | 
 50 |     def test_no_args(self):
 51 |         result = list_main([])
 52 |         n = self._check_output(result.output)
 53 |         self.assertEqual(n, 1)
 54 | 
 55 |     def test_one_arg(self):
 56 |         result = list_main([TEST_DATA_MMPDB])
 57 |         n = self._check_output(result.output)
 58 |         self.assertEqual(n, 1)
 59 | 
 60 |     def test_two_args(self):
 61 |         result = list_main([TEST_DATA_MMPDB, TEST_DATA_MMPDB])
 62 |         n = self._check_output(result.output)
 63 |         self.assertEqual(n, 2, "should have existed twice")
 64 | 
 65 |     def test_file_does_not_exist(self):
 66 |         result = list_main(["does_not_exist.mmpdb", TEST_DATA_MMPDB])
 67 |         n = self._check_output(result.output)
 68 |         self.assertEqual(n, 1)
 69 | 
 70 |     def test_recount(self):
 71 |         result = list_main(["--recount", TEST_DATA_MMPDB])
 72 |         n = self._check_output(result.output)
 73 |         self.assertEqual(n, 1)
 74 | 
 75 |     def test_all(self):
 76 |         self._test_all(["--all", TEST_DATA_MMPDB])
 77 | 
 78 |     def test_a(self):
 79 |         self._test_all(["-a", TEST_DATA_MMPDB])
 80 | 
 81 |     def _test_all(self, flag):
 82 |         result = list_main(flag)
 83 |         n = self._check_output(result.output)
 84 |         self.assertEqual(n, 1)
 85 | 
 86 |         # If the data set is ever regenerated then at the very least
 87 |         # the 'Created' line will need to be updated.
 88 |         expected_lines = f"""\
 89 |                          Name                         #cmpds #rules #pairs #envs  #stats  |--------- Title ----------| Properties
 90 | {TEST_DATA_MMPDB}      9     47    342    321    533  MMPs from 'test_data.fragdb' MW MP
 91 |       Created: 2025-05-02 14:54:33.639458
 92 |         #compounds/property:  8/MP 9/MW
 93 |         #smiles for rules: 21  for constants: 10
 94 |         Fragment options:
 95 |           cut_smarts: [#6+0;!$(*=,#[!#6])]!@!=!#[!#0;!#1;!$([CH2]);!$([CH3][CH2])]
 96 |           max_heavies: 100
 97 |           max_rotatable_bonds: 10
 98 |           max_up_enumerations: 1000
 99 |           method: chiral
100 |           min_heavies_per_const_frag: 0
101 |           min_heavies_total_const_frag: 0
102 |           num_cuts: 3
103 |           rotatable_smarts: [!$([NH]!@C(=O))&!D1&!$(*#*)]-&!@[!$([NH]!@C(=O))&!D1&!$(*#*)]
104 |           salt_remover: <default>
105 |         Index options:
106 |           max_radius: 5
107 |           max_variable_heavies: 10
108 |           min_radius: 0
109 |           smallest_transformation_only: False
110 |           symmetric: False""".splitlines()
111 |         result_lines = result.output.splitlines()
112 |         num_checked = 0
113 |         if len(expected_lines) == len(result_lines):
114 |             for line1, line2 in zip(expected_lines, result_lines):
115 |                 self.assertIn(re.sub(r'\s+', ' ', line1).strip(), re.sub(r'\s+', ' ', line2).strip())
116 |                 num_checked += 1
117 |         self.assertEqual(num_checked, 22)
118 | 
119 | 
120 | if __name__ == "__main__":
121 |     unittest.main()
122 | 


--------------------------------------------------------------------------------
/tests/test_loadprops.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import json
 3 | 
 4 | from mmpdblib import dbutils
 5 | 
 6 | from support import (
 7 |     get_filename,
 8 |     create_test_filename,
 9 |     expect_pass,
10 | )
11 | 
12 | TEST_DATA_FRAGDB = get_filename("test_data.fragdb")
13 | TEST_DATA_CSV = get_filename("test_data.csv")
14 | 
15 | 
def index(mmpdb_filename, *args):
    """Run "mmpdb --quiet index" on the test fragdb, writing `mmpdb_filename`.

    Any extra `args` are appended to the command line. The command must
    succeed (see expect_pass).
    """
    command = ("--quiet", "index", TEST_DATA_FRAGDB, "-o", mmpdb_filename, *args)
    expect_pass(command)
19 | 
20 | 
def loadprops(mmpdb_filename, *args):
    """Run "mmpdb --quiet loadprops" to load the test CSV into `mmpdb_filename`.

    Any extra `args` are appended to the command line. The command must
    succeed (see expect_pass).
    """
    command = ("--quiet", "loadprops", "-p", TEST_DATA_CSV, mmpdb_filename, *args)
    expect_pass(command)
24 | 
25 | 
class TestLoadpropsCommandline(unittest.TestCase):
    """Check that "mmpdb loadprops" adds rule environment statistics."""

    def _get_options(self, *args):
        # Index the test fragdb into a temporary file with the extra
        # command-line `args`, then return the dataset, its decoded index
        # options and the database filename.
        output_filename = create_test_filename(self, "default.mmpdb")
        index(output_filename, *args)
        dataset = dbutils.open_database(output_filename).get_dataset()
        index_options = json.loads(dataset.index_options_str)
        return dataset, index_options, output_filename

    def test_loadprops(self):
        dataset, _options, database_filename = self._get_options()
        # A freshly indexed database has no statistics until properties
        # are loaded.
        self.assertEqual(dataset.get_num_rule_environment_stats(), 0)
        loadprops(database_filename)
        self.assertEqual(dataset.get_num_rule_environment_stats(), 533)


if __name__ == "__main__":
    unittest.main()
43 | 


--------------------------------------------------------------------------------
/tests/two_tabs.smi:
--------------------------------------------------------------------------------
1 | Oc1ccccc1O	record	1
2 | Nc1ccccc1C	vinyl	2
3 | 


--------------------------------------------------------------------------------