├── .gitignore
├── INSTALL.txt
├── LICENSE
├── Makefile
├── README.md
├── TODO
├── bin
    ├── csv2txt.sh
    ├── fpscores.pkl.gz
    ├── inchi2smi.py
    ├── molenc.sh
    ├── molenc_HA.py
    ├── molenc_SA.py
    ├── molenc_atoms_filter.py
    ├── molenc_color.py
    ├── molenc_common.py
    ├── molenc_deepsmi.py
    ├── molenc_diam.py
    ├── molenc_drug.py
    ├── molenc_elements.py
    ├── molenc_eln.py
    ├── molenc_frag.py
    ├── molenc_frag2smi.py
    ├── molenc_fscan.py
    ├── molenc_get_tag.py
    ├── molenc_gpc.py
    ├── molenc_gpr.py
    ├── molenc_histo.py
    ├── molenc_ifg.py
    ├── molenc_iupac.smi
    ├── molenc_lead.py
    ├── molenc_lean.py
    ├── molenc_ligprep.sh
    ├── molenc_linker.py
    ├── molenc_lizard.py
    ├── molenc_mol2smi.py
    ├── molenc_mview.py
    ├── molenc_nearest.py
    ├── molenc_padel.py
    ├── molenc_panascan.py
    ├── molenc_ph4.py
    ├── molenc_ph4_type_atoms.py
    ├── molenc_qed.py
    ├── molenc_rbonds_filter.py
    ├── molenc_rdfrag.py
    ├── molenc_regr_stats.py
    ├── molenc_rfr.py
    ├── molenc_rotbond.py
    ├── molenc_scaffold.py
    ├── molenc_scan.py
    ├── molenc_scan.sh
    ├── molenc_sdf2smi.py
    ├── molenc_sdf_strip.py
    ├── molenc_smi2cansmi.py
    ├── molenc_smi2png.py
    ├── molenc_smisur.py
    ├── molenc_stable.py
    ├── molenc_std.py
    ├── molenc_thash.py
    ├── molenc_type_atoms.py
    ├── molenc_uniq.py
    ├── rgb_scale.py
    ├── smi2png.py
    └── smi2svg.py
├── data
    ├── 1k_mols_std_01.txt
    ├── 3.frags
    ├── 3.mols
    ├── 3.smi
    ├── 3.to_frag
    ├── 3_frags.smi
    ├── 3_frags.txt
    ├── 3_genmols.smi
    ├── 3_genmols.txt
    ├── 3_genmols_uniq.smi
    ├── ALDH1_2conf.ph4
    ├── AP_test.smi
    ├── AP_test.smi.dix.ref
    ├── AP_test.txt.ref
    ├── alcools.AD.ref
    ├── alcools.smi
    ├── caff_coca.sdf
    ├── caff_coca.smi
    ├── caff_coca_feats.ref
    ├── caff_coca_types.ref
    ├── caffeine.mol2
    ├── caffeine.sdf
    ├── caffeine.smi
    ├── caffeine_3d.sdf
    ├── chembl1868_std.AP
    ├── chembl1868_std.smi
    ├── chembl30_10mols.sdf
    ├── chembl30_10mols.txt.ref
    ├── chembl30_10mols_am1bcc.mol2
    ├── chembl_antivirals.frags.smi
    ├── chembl_antivirals.genmol.smi
    ├── chembl_antivirals.smi
    ├── chemical_formulas.txt
    ├── cisapride.smi
    ├── co_1conf.sdf
    ├── cocaine.smi
    ├── ethanol.smi
    ├── ethanol.uhd.dix.ref
    ├── ethanol.uhd.ref
    ├── fda_approved.smi
    ├── features.txt
    ├── gen_mols.txt
    ├── h2o_1conf.sdf
    ├── merge.txt
    ├── opio.smi
    ├── ptable.txt
    ├── test_HYD_group.sdf
    ├── test_HYD_group.smi
    ├── test_in.pbc
    ├── test_mols.txt
    └── test_out.ref
├── deepsmi_test.sh
├── doc
    ├── Ester_KDD_1996_DBSCANclustering.pdf
    └── Shrivastava_2016_ExactWeightedMinwiseHashing.pdf
├── dune-project
├── fcodec
├── histo.gpl
├── kb_test.sh
├── mol_frag_test.sh
├── molenc.opam
├── molenc_frag
├── rfp
├── smisur_test.sh
├── src
    ├── AP_BBAD.ml
    ├── BBAD.ml
    ├── MSE_mol.ml
    ├── MST.ml
    ├── WMH.ml
    ├── ap_encoder.ml
    ├── ap_types.ml
    ├── atom_env.ml
    ├── atom_pair.ml
    ├── bloom.ml
    ├── bond.ml
    ├── butina.ml
    ├── decoder.ml
    ├── dsmi.ml
    ├── dune
    ├── encoder.ml
    ├── filter.ml
    ├── finder.ml
    ├── fingerprint.ml
    ├── formula.ml
    ├── fpMol.ml
    ├── fp_test.ml
    ├── fragmentable_mol.ml
    ├── gen_bindings.sh
    ├── get_mol.ml
    ├── gnuplot.ml
    ├── gram.ml
    ├── index.ml
    ├── indexer.ml
    ├── intSet.ml
    ├── lean.ml
    ├── lig_box.ml
    ├── merge.ml
    ├── mini_mol.ml
    ├── mol2.ml
    ├── molenc_AP.ml
    ├── molenc_UHD.ml
    ├── molenc_fcodec.ml
    ├── myList.ml
    ├── node.ml
    ├── norm.ml
    ├── palette.ml
    ├── pareto.ml
    ├── ph4.ml
    ├── ph4_atom.ml
    ├── piEltHA.ml
    ├── prune.ml
    ├── ptable.ml
    ├── pubchem_decoder.ml
    ├── rank.ml
    ├── rdkit.ml
    ├── rdkit_wrapper.py
    ├── rdkit_wrapper_specs.txt
    ├── scale.ml
    ├── sdf.ml
    ├── sdf_3D.ml
    ├── sdf_read.ml
    ├── shannon.ml
    ├── shuf.ml
    ├── smi.ml
    ├── split.ml
    ├── syb_atom.ml
    ├── sybyl.ml
    ├── test_RS.ml
    ├── to_dense.ml
    ├── uniq.ml
    ├── utls.ml
    ├── wmh_bench.ml
    ├── wmh_test.ml
    └── wmh_unit_test.ml
├── tani_est.gpl
├── test.sh
├── test_BBAD.sh
├── test_sdf_read.sh
└── test_uhd.sh


/.gitignore:
--------------------------------------------------------------------------------
1 | molenc.install
2 | src/.merlin
3 | data/*.svg
4 | 


--------------------------------------------------------------------------------
/INSTALL.txt:
--------------------------------------------------------------------------------
 1 | Molenc install guide
 2 | ====================
 3 | 
 4 | Author: Francois Berenger
 5 | Date: 6th July 2022
 6 | 
 7 | Example installation instructions on a fresh Debian 11.3 system.
 8 | On Ubuntu Linux, installation should be very similar.
 9 | 
10 | On Mac computers, this software has worked in the past, but
11 | installation is a pain; hence we neither maintain
12 | nor recommend this setup anymore.
13 | 
14 | The Bash shell is assumed for all commands.
15 | 
16 | Sudo rights are assumed for the user performing the installation.
17 | 
18 | I) Install system-wide packages
19 | -------------------------------
20 | 
21 | $ sudo apt install git opam python3-pip python3-numpy
22 | 
23 | II) Configure the OCaml package manager
24 | ---------------------------------------
25 | 
26 | $ opam init -y
27 | $ eval `opam config env` # path setup for ocaml executables
28 |                          # might be needed in your ~/.bashrc
29 | 
30 | III) Install OCaml packages
31 | ---------------------------
32 | 
33 | $ opam depext -i molenc # this will also install rdkit system-wide
34 | 
35 | IV) Install user-space packages
36 | -------------------------------
37 | 
38 | $ pip3 install six # required by chemo-standardizer
39 | $ pip3 install chemo-standardizer # requires system-wide rdkit
40 | 
41 | V) Tests
42 | --------
43 | 
44 | Test that the molecular standardiser is correctly installed.
45 | It is used by molenc in case molecules need to be standardized.
46 | 
47 | $ standardiser -h
48 | 
49 | If not, it may be missing from PATH:
50 | 
51 | $ export PATH=$PATH:~/.local/bin # might be needed in your ~/.bashrc
52 | $ standardiser -h # test again
53 | 
54 | VI) Encode some molecules
55 | -------------------------
56 | 
57 | Get some molecules in the SMILES format:
58 | 
59 | $ wget https://raw.githubusercontent.com/UnixJunkie/molenc/master/data/chembl_antivirals.smi -O antivirals.smi
60 | 
61 | Encode those molecules using counted atom pairs fingerprint:
62 | 
63 | $ molenc.sh --pairs -i antivirals.smi -o antivirals_std.AP
64 | 
65 | Look at what was obtained:
66 | $ head -1 antivirals_std.AP
67 | CHEMBL807,0.0,[2:6;8:1;15:3;25:12;26:2;70:3;93:3;372:6;393:6;407:1;412:2;453:3;466:2;524:9;917:9;1095:3;1742:1;1776:3;2063:3;2576:4;2646:1;4428:3;5906:2;5916:1;6005:2]
68 | 
69 | VII) Encode more molecules with an existing encoding dictionary
70 | ---------------------------------------------------------------
71 | 
72 | Let's say we want to encode some new molecules using an existing encoding dictionary
73 | (a dictionary was created in the previous step for antivirals.smi).
74 | In the real world, you might want the encoding dictionary to cover the whole ChEMBL database
75 | (or your company's whole compound collection), so that the dictionary is exhaustive enough.
76 | 
77 | In the following, you need to replace MY_MOLECULES.smi with the SMILES file of your choice.
78 | 
79 | $ molenc.sh --pairs -d antivirals.smi.dix -i MY_MOLECULES.smi -o MY_MOLECULES_std.AP
80 | 
81 | Concluding remarks
82 | ------------------
83 | 
84 | Molenc is a research software prototype.
85 | As such, it might be a little difficult to install and under-documented.
86 | Such is the fate of research by-products.
87 | Don't hesitate to contact the author in case you cannot install the software,
88 | find any bug or encounter some problems while using it.
89 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 |  BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2018, Francois BERENGER
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: build install uninstall reinstall test
 2 | 
 3 | build:
 4 | 	dune build @install -j `getconf _NPROCESSORS_ONLN`
 5 | 
 6 | clean:
 7 | 	rm -rf _build
 8 | 
 9 | edit:
10 | 	emacs src/*.ml TODO commands.sh &
11 | 
12 | install: build
13 | 	dune install
14 | 
15 | uninstall:
16 | 	dune uninstall
17 | 
18 | reinstall: uninstall install
19 | 
20 | test:
21 | 	rm -f _build/default/src/fp_test.exe
22 | 	dune build src/fp_test.exe
23 | 	_build/default/src/fp_test.exe
24 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Introduction
 2 | 
 3 | MolEnc: a molecular encoder using rdkit and OCaml.
 4 | 
 5 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3546675.svg)](https://doi.org/10.5281/zenodo.3546675)
 6 | 
 7 | The implemented fingerprint is J-L Faulon's "Signature Molecular Descriptor"
 8 | (SMD [1]).
 9 | This is an unfolded-counted chemical fingerprint.
10 | Such fingerprints are less lossy than famous chemical fingerprints like ECFP4.
11 | SMD encoding doesn't introduce feature collisions upon encoding.
12 | Also, a feature dictionary is created at encoding time.
13 | This dictionary can be used later on to map a given feature index to an
14 | atom environment.
15 | Molenc also implements unfolded-counted atom pairs [2].
16 | 
17 | For SMD, we recommend using a radius of zero to one (molenc.sh -r 0:1 ...) or
18 | zero to two.
19 | 
20 | Currently, the atom typing scheme being used is:
21 | (#pi-electrons, element symbol, #HA neighbors, formal charge).
22 | 
23 | In the future, we might add pharmacophore feature points [3]
24 | (Donor, Acceptor, PosIonizable, NegIonizable, Aromatic, Hydrophobe),
25 | to allow a fuzzier description of molecules.
26 | 
27 | # How to install the software
28 | 
29 | For beginners/non opam users:
30 | download and execute the latest self-installer
31 | shell script from (https://github.com/UnixJunkie/molenc/releases).
32 | 
33 | Then execute:
34 | ```
35 | ./molenc-5.0.1.sh ~/usr/molenc-5.0.1
36 | ```
37 | 
38 | This will create ~/usr/molenc-5.0.1/bin/molenc.sh, among other things
39 | inside the same directory.
40 | 
41 | For opam users:
42 | ```
43 | opam install molenc
44 | ```
45 | 
46 | Do not hesitate to contact the author in case you have problems installing
47 | or using the software or if you have any question.
48 | 
49 | # Usage
50 | 
51 | ```
52 | molenc.sh -i input.smi -o output.txt
53 |          [-d encoding.dix]: reuse existing feature dictionary
54 |          [-r i:j]: fingerprint radius (default=0:1)
55 |          [--pairs]: use atom pairs instead of Faulon's FP
56 |          [-m <int>]: maximum allowed atom-pair distance
57 |                      (default: no limit)
58 |          [--seq]: sequential mode (disable parallelization)
59 |          [-v]: debug mode; keep temp files
60 |          [-n <int>]: max jobs in parallel
61 |          [-c <int>]: chunk size
62 |          [--no-std]: don't standardize input file molecules
63 |                      ONLY USE IF THEY HAVE ALREADY BEEN STANDARDIZED
64 | ```
65 | 
66 | How to encode a database of molecules:
67 | 
68 | ```
69 | molenc.sh -i molecules.smi -o molecules.txt
70 | 
71 | ```
72 | 
73 | How to encode another database of molecules, but reusing the feature
74 | dictionary from another database:
75 | 
76 | ```
77 | molenc.sh -i other_molecules.smi -o other_molecules.txt -d molecules.txt.dix
78 | ```
79 | 
80 | # Bibliography
81 | 
82 | [1] Faulon, J. L., Visco, D. P., & Pophale, R. S. (2003). The signature molecular descriptor. 1. Using extended valence sequences in QSAR and QSPR studies. Journal of chemical information and computer sciences, 43(3), 707-720.
83 | 
84 | [2] Carhart, R. E., Smith, D. H., & Venkataraghavan, R. (1985). Atom pairs as molecular features in structure-activity studies: definition and applications. Journal of Chemical Information and Computer Sciences, 25(2), 64-73.
85 | 
86 | [3] Kearsley, S. K., Sallamack, S., Fluder, E. M., Andose, J. D., Mosley, R. T., & Sheridan, R. P. (1996). Chemical similarity using physiochemical property descriptors. Journal of Chemical Information and Computer Sciences, 36(1), 118-127.
87 | 


--------------------------------------------------------------------------------
/TODO:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | time ./bin/molenc_frag.py -i data/all_kegg_drugs_20112019_std.smi \
3 |                           -o data/all_kegg_drugs_20112019_std.to_frag
4 | #7605 molecules at 1045.63 molecule/s
5 | #real    0m7.595s
6 | time ./molenc_frag -i data/all_kegg_drugs_20112019_std.to_frag \
7 |                    -o data/all_kegg_drugs_20112019_std.frags -s 1234
8 | #real    0m1.158s i.e. ~= 6567 molecule/s
9 | 


--------------------------------------------------------------------------------
/bin/csv2txt.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# csv format output by molenc_lizard.py to txt format expected by
# several molenc tools
#
# usage: csv2txt.sh input.csv
# Each output line is: field1,field2,[0:field3;1:field4;...;6:field9]
# "$@" keeps file names with spaces intact and still lets awk read stdin
# when no file is given.

awk -F',' \
    '{print $1","$2",[0:"$3";1:"$4";2:"$5";3:"$6";4:"$7";5:"$8";6:"$9"]"}' "$@"
8 | 


--------------------------------------------------------------------------------
/bin/fpscores.pkl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UnixJunkie/molenc/edc27db8206e6cbca4409b962426c94f3d14e18d/bin/fpscores.pkl.gz


--------------------------------------------------------------------------------
/bin/inchi2smi.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # InChI to SMILES conversion
 4 | 
 5 | import argparse
 6 | import rdkit
 7 | import sys
 8 | from rdkit import Chem
 9 | 
def RobustMolSupplier(filename):
    """Yield (name, mol) pairs read from a whitespace-separated InChI file.

    On each usable line, the first field is the molecule name and the second
    one its InChI string; mol is whatever Chem.MolFromInchi returns for it.
    Blank lines and lines with fewer than two fields are skipped, instead of
    aborting the whole conversion with an IndexError.
    """
    with open(filename) as f:
        for line in f:
            words = line.split()
            if len(words) < 2:
                continue
            name, inchi = words[0], words[1]
            yield (name, Chem.MolFromInchi(inchi))
17 | 
if __name__ == '__main__':
    # parse CLI
    # show help in case user has no clue of what to do
    if len(sys.argv) != 3:
        sys.stderr.write("%s input.inchi output.smi\n" % sys.argv[0])
        sys.exit(1)
    input_inchi = sys.argv[1]
    output_smi = sys.argv[2]
    # the context manager closes the output file even if a conversion raises
    with open(output_smi, 'w') as output:
        for name, mol in RobustMolSupplier(input_inchi):
            if mol is None:
                # no molecule could be built from this InChI: skip it
                continue
            smi = Chem.MolToSmiles(mol)
            # output line layout: SMILES, TAB, molecule name
            output.write("%s\t%s\n" % (smi, name))
33 | 


--------------------------------------------------------------------------------
/bin/molenc_HA.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # Copyright (C) 2023, Francois Berenger
 4 | # Tsuda laboratory, The University of Tokyo,
 5 | # 5-1-5 Kashiwa-no-ha, Kashiwa-shi, Chiba-ken, 277-8561, Japan.
 6 | 
 7 | # Robust Heavy Atom count from a .smi input file
 8 | # (molenc_lizard.py can ignore some molecules because of RDKit)
 9 | 
10 | import rdkit, sys
11 | from rdkit import Chem
12 | from rdkit.Chem import Lipinski
13 | 
def RobustSmilesMolSupplier(filename):
    """Yield (mol, name) pairs read from a whitespace-separated SMILES file.

    The first field of a line is the SMILES (parsed with sanitize=False),
    the second one is the molecule name.
    Blank lines are skipped; a line holding only a SMILES uses that SMILES
    as the molecule name, instead of raising an IndexError.
    """
    with open(filename) as f:
        for line in f:
            words = line.split()
            if not words:
                continue
            smile = words[0]
            name = words[1] if len(words) > 1 else smile
            yield (Chem.MolFromSmiles(smile, sanitize=False), name)
21 | 
# the input .smi file is the first CLI argument; without it, print a usage
# message instead of failing with an IndexError traceback
if len(sys.argv) < 2:
    print("usage: %s input.smi" % sys.argv[0], file=sys.stderr)
    sys.exit(1)
input_smi = sys.argv[1]
# one "name<TAB>heavy atom count" line on stdout per parsed molecule;
# molecules rdkit cannot parse are reported on stderr
for mol, name in RobustSmilesMolSupplier(input_smi):
    if mol is None:
        print("rdkit could not parse: %s" % name, file=sys.stderr)
    else:
        HA = Lipinski.HeavyAtomCount(mol)
        print("%s\t%d" % (name, HA))
29 | 


--------------------------------------------------------------------------------
/bin/molenc_atoms_filter.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | #
 3 | # Copyright (C) 2023, Francois Berenger
 4 | # Tsuda laboratory, Tokyo University,
 5 | # 5-1-5 Kashiwa-no-ha, Kashiwa-shi, Chiba-ken, 277-8561, Japan.
 6 | #
 7 | # Keep only molecules using allowed atoms
 8 | 
 9 | import argparse, re, sys, time
10 | from rdkit import Chem
11 | 
# raw string: '\s' in a plain string literal is an invalid escape sequence
regex = re.compile(r'\s')

def find_whitespace(s):
    """Return the index of the first whitespace character of s, or -1 if
    s contains none."""
    m = regex.search(s)
    return -1 if m is None else m.start()
20 | 
def parse_smiles_line(line):
    """Parse the SMILES at the start of line into an RDKit molecule.

    The SMILES is everything before the first whitespace character, or the
    whole line when there is no whitespace (molecule without a name).
    Returns the result of Chem.MolFromSmiles for that SMILES.
    """
    fst_white = find_whitespace(line)
    # what follows the separator (the molecule name) is not needed here
    smi = line if fst_white == -1 else line[:fst_white]
    return Chem.MolFromSmiles(smi)
35 | 
def parse_atoms_list(line):
    """Parse a comma-separated list of element symbols into a set.

    Whitespace around each symbol is ignored and empty items (e.g. from a
    trailing comma) are dropped, so "C, N,O," gives {'C', 'N', 'O'}.
    """
    stripped = (symbol.strip() for symbol in line.split(','))
    return {symbol for symbol in stripped if symbol}
38 | 
def atoms_filter(allowed_atoms, mol):
    """Tell if every atom of mol has its symbol in allowed_atoms.

    A mol which is None (no molecule could be built from the input line)
    is rejected, instead of raising an AttributeError.
    """
    if mol is None:
        return False
    return all(a.GetSymbol() in allowed_atoms for a in mol.GetAtoms())
44 | 
if __name__ == '__main__':
    before = time.time()
    # CLI options parsing
    parser = argparse.ArgumentParser(description = "filter out molecules w/ disallowed atoms")
    parser.add_argument("-i", metavar = "input.smi", dest = "input_fn",
                        help = "molecules input file")
    parser.add_argument("-o", metavar = "output.smi", dest = "output_fn",
                        help = "molecules output file")
    parser.add_argument('-a', metavar = "ATOMS_LIST", dest='allowed_atoms',
                        default="C,H,N,O,P,S,F,Cl,Br,I",
                        help = "comma-separated list of allowed atoms \
                        (default=C,H,N,O,P,S,F,Cl,Br,I)")
    # parse CLI ---------------------------------------------------------------
    if len(sys.argv) == 1:
        # user has no clue of what to do -> usage
        parser.print_help(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()
    input_fn = args.input_fn
    output_fn = args.output_fn
    allowed_atoms = parse_atoms_list(args.allowed_atoms)
    # parse CLI end -----------------------------------------------------------
    count = 0   # molecules read
    errors = 0  # molecules removed
    with open(output_fn, 'w') as out:
        # 'in_file' does not shadow the input() builtin; iterating over the
        # file object avoids loading the whole file in memory
        with open(input_fn, 'r') as in_file:
            for line in in_file:
                mol = parse_smiles_line(line.strip())
                # a line for which no molecule could be built is removed too
                if mol is not None and atoms_filter(allowed_atoms, mol):
                    # kept molecules are copied verbatim
                    out.write("%s" % line)
                else:
                    errors += 1
                count += 1
    after = time.time()
    dt = after - before
    print("%d molecules @ %.2fHz; removed %d" % (count, count / dt, errors),
          file=sys.stderr)
82 | 


--------------------------------------------------------------------------------
/bin/molenc_color.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # color molecules from a SMILES file according to per-atom delta score
 4 | # values from another file
 5 | 
 6 | import matplotlib.pyplot as plot
 7 | import rdkit, sys
 8 | from rdkit import Chem
 9 | from rdkit.Chem import Draw
10 | from rdkit.Chem.Draw import rdDepictor, SimilarityMaps
11 | 
def RobustSmilesMolSupplier(filename):
    """Yield (name, mol) pairs read from a whitespace-separated SMILES file.

    The first field of a line is the SMILES; the name is made of all the
    remaining fields joined by single spaces (empty if there are none).
    Blank lines are skipped, instead of raising an IndexError.
    """
    with open(filename) as f:
        for line in f:
            words = line.split()
            if not words:
                continue
            smile = words[0]
            name = " ".join(words[1:]) # everything after the SMILES string
            yield (name, Chem.MolFromSmiles(smile))
19 | 
# draw all atoms in black
drawOptions = Draw.DrawingOptions()
drawOptions.elemDict = {}
drawOptions.bgColor = None

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("usage: %s molecules.smi molecules.delta" % sys.argv[0])
        # sys.exit: the bare exit() helper is only installed by the site module
        sys.exit(1)
    smiles_fn = sys.argv[1]
    deltas_fn = sys.argv[2]
    delta_max = 0.1 # arbitrary, to normalize deltas and color-scale them
    count = 0
    # the context manager closes the deltas file on every exit path
    with open(deltas_fn, 'r') as delta_file:
        # both files are read in lockstep: one deltas line per molecule
        for long_name, mol in RobustSmilesMolSupplier(smiles_fn):
            # split by '_' in case name was postfixed with underscores
            # and additional data
            name = long_name.split('_')[0]
            if mol is None:
                print("rdkit could not parse: %s" % name, file=sys.stderr)
                sys.exit(1)
            words = delta_file.readline().split()
            if not words:
                # deltas file exhausted, or blank line
                print("no deltas for: %s" % name, file=sys.stderr)
                sys.exit(1)
            curr_name = words[0]
            if curr_name != name:
                print("names differ: %s != %s" % (name, curr_name))
                sys.exit(1)
            delta_strings = words[1:]
            nb_deltas = len(delta_strings)
            nb_atoms = mol.GetNumAtoms()
            # exactly one delta per atom is expected
            assert(nb_deltas == nb_atoms)
            deltas = [float(x) for x in delta_strings]
            rdDepictor.Compute2DCoords(mol) # 2D conformer for figure
            # compute similarity map weights
            weights = []
            for delta in deltas:
                # run-time check that delta is not too high or delta_max too small
                assert(delta <= delta_max)
                weights.append(delta / delta_max)
            sim_map = Draw.SimilarityMaps.\
                      GetSimilarityMapFromWeights(mol, weights, size = (200,200),
                                                  options=drawOptions,
                                                  scale=50.0)
            # the bbox param forces centering the molecule in the figure
            sim_map.savefig(name + '.svg', bbox_inches = 'tight')
            plot.close(sim_map)
            count += 1
            # '\r' makes the progress counter overwrite itself
            print('processed: %d\r' % count, end='')
    print('processed: %d' % count)
68 | 


--------------------------------------------------------------------------------
/bin/molenc_deepsmi.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | # Copyright (C) 2021, Francois Berenger
  4 | # Tsuda laboratory, Tokyo University, Japan.
  5 | 
  6 | # DeepSMILES encoder/decoder from/to SMILES
  7 | #
  8 | # "DeepSMILES: An adaptation of SMILES for use in machine-learning of
  9 | # chemical structures". Noel M. O’Boyle and Andrew Dalke. ChemRxiv (2018).
 10 | 
 11 | import argparse
 12 | import deepsmiles
 13 | import molenc_common
 14 | import rdkit
 15 | import sys
 16 | import time
 17 | 
 18 | from rdkit import Chem
 19 | from rdkit.Chem import AllChem
 20 | 
 21 | from molenc_common import RobustSmilesSupplier
 22 | 
 23 | def encode(converter, smi):
 24 |     return converter.encode(smi)
 25 | 
 26 | def decode(converter, deep_smi):
 27 |     try:
 28 |         smi = converter.decode(deep_smi)
 29 |         # currently, de decoder does not output a canonical SMILES
 30 |         # https://github.com/baoilleach/deepsmiles/issues/19
 31 |         # I want canonical SMILES, because this is rdkit's default
 32 |         mol = Chem.MolFromSmiles(smi)
 33 |         cano_smi = Chem.MolToSmiles(mol)
 34 |         return cano_smi
 35 |     except deepsmiles.DecodeError as e:
 36 |         print("molenc_deepsmi.py: decode: '%s'" % e.message,
 37 |               file = sys.stderr)
 38 |         return None
 39 | 
 40 | if __name__ == '__main__':
 41 |     before = time.time()
 42 |     # CLI options
 43 |     parser = argparse.ArgumentParser(
 44 |         description = "DeepSMILES encoder/decoder")
 45 |     parser.add_argument("-i", metavar = "input.smi", dest = "input_fn",
 46 |                         help = "molecules input file")
 47 |     parser.add_argument("-o", metavar = "output.smi", dest = "output_fn",
 48 |                         help = "molecules output file")
 49 |     parser.add_argument("--no-rings", dest = "rings",
 50 |                         action = "store_true",
 51 |                         default = False,
 52 |                         help = "DeepSMILES without ring openings")
 53 |     parser.add_argument("--no-branches", dest = "branches",
 54 |                         action = "store_true",
 55 |                         default = False,
 56 |                         help = "DeepSMILES without branches")
 57 |     parser.add_argument("-e", dest = "do_encode",
 58 |                         action = "store_true",
 59 |                         default = True,
 60 |                         help = "encode: SMILES to DeepSMILES (default)")
 61 |     parser.add_argument("-d", dest = "do_decode",
 62 |                         action = "store_true",
 63 |                         help = "decode: DeepSMILES to SMILES")
 64 |     # parse CLI ----------------------------------------------
 65 |     if len(sys.argv) == 1:
 66 |         # user has no clue of what to do -> usage
 67 |         parser.print_help(sys.stderr)
 68 |         sys.exit(1)
 69 |     args = parser.parse_args()
 70 |     input_fn = args.input_fn
 71 |     output = open(args.output_fn, 'w')
 72 |     rings = args.rings
 73 |     branches = args.branches
 74 |     do_encode = args.do_encode
 75 |     do_decode = args.do_decode
 76 |     if do_decode:
 77 |         do_encode = False
 78 |     assert(not (do_encode and do_decode))
 79 |     if not (rings or branches):
 80 |         print("use at least --no-rings or --no-branches",
 81 |               file=sys.stderr)
 82 |         sys.exit(1)
 83 |     count = 0
 84 |     # work ----------------------------------------------
 85 |     smi_supplier = RobustSmilesSupplier(input_fn)
 86 |     converter = deepsmiles.Converter(rings, branches)
 87 |     if do_encode:
 88 | 
 89 |         for smi, name in smi_supplier:
 90 |             deep_smi = encode(converter, smi)
 91 |             print("%s\t%s" % (deep_smi, name), file=output)
 92 |             count += 1
 93 |     else: # decode
 94 |         for deep_smi, name in smi_supplier:
 95 |             smi = decode(converter, deep_smi)
 96 |             if smi != None:
 97 |                 print("%s\t%s" % (smi, name), file=output)
 98 |             count += 1
 99 |     after = time.time()
100 |     dt = after - before
101 |     print("%d molecules at %.2f mol/s" % (count, count / dt), file=sys.stderr)
102 |     output.close()
103 | 


--------------------------------------------------------------------------------
/bin/molenc_diam.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | #
 3 | # Copyright (C) 2022, Francois Berenger
 4 | # Tsuda laboratory, Tokyo University,
 5 | # 5-1-5 Kashiwa-no-ha, Kashiwa-shi, Chiba-ken, 277-8561, Japan.
 6 | #
 7 | # Compute the diameter of a molecule's 3D conformer
 8 | # i.e. largest interatomic distance
 9 | 
10 | import argparse, math, sys
11 | from rdkit import Chem
12 | 
def euclid(xyz0, xyz1):
    """Euclidean distance between two points, each one unpacked as an
    (x, y, z) triplet."""
    (ax, ay, az), (bx, by, bz) = xyz0, xyz1
    dx, dy, dz = ax - bx, ay - by, az - bz
    return math.sqrt(dx*dx + dy*dy + dz*dz)
20 | 
# WARNING: O(n^2)
def diameter(mol):
    """Return the largest interatomic distance in conformer 0 of mol.

    The result is 0.0 when the molecule has fewer than two atoms.
    """
    num_atoms = mol.GetNumAtoms()
    conf = mol.GetConformer(0)
    # fetch each atom position once, instead of once per atom pair
    positions = [conf.GetAtomPosition(i) for i in range(num_atoms)]
    diam = 0.0
    for i in range(num_atoms - 1):
        xyz_i = positions[i]
        for j in range(i + 1, num_atoms):
            dist = euclid(xyz_i, positions[j])
            if dist > diam:
                diam = dist
    return diam
34 | 
if __name__ == '__main__':
    # CLI options parsing
    parser = argparse.ArgumentParser(description =
                                     "compute molecular diameter")
    parser.add_argument("-i", metavar = "input.sdf", dest = "input_fn",
                        help = "3D conformer input file \
                        (single molecule AND conformer)")
    # parse CLI ---------------------------------------------------------------
    if len(sys.argv) == 1:
        # user has no clue of what to do -> usage
        parser.print_help(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()
    input_fn = args.input_fn
    # parse CLI end -----------------------------------------------------------
    count = 0
    mol_supplier = Chem.SDMolSupplier(input_fn)
    for mol in mol_supplier:
        # the input must hold a single, readable molecule: stop as soon as
        # a second one shows up (count >= 1, not count > 1);
        # an explicit exit, because assert statements vanish under python -O
        if (mol is None) or (count >= 1):
            print("molenc_diam.py: expected a single readable molecule in: %s" %
                  input_fn, file=sys.stderr)
            sys.exit(1)
        count += 1
        diam = diameter(mol)
        print("%f" % diam)
58 | 


--------------------------------------------------------------------------------
/bin/molenc_drug.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | #
 3 | # Copyright (C) 2023, Francois Berenger
 4 | # Tsuda laboratory, The University of Tokyo,
 5 | # 5-1-5 Kashiwa-no-ha, Kashiwa-shi, Chiba-ken, 277-8561, Japan.
 6 | #
 7 | # Drug-like filter: only drug-like molecules will be printed on stdout
 8 | 
 9 | import sys
10 | 
11 | from rdkit import Chem
12 | from rdkit.Chem import Descriptors
13 | 
14 | # Tran-Nguyen, V. K., Jacquemard, C., & Rognan, D. (2020).
15 | # LIT-PCBA: An unbiased data set for machine learning and virtual screening.
16 | # Journal of chemical information and modeling, 60(9), 4263-4273.
def drug_like_filter(mol):
    """Return True if mol satisfies every drug-likeness criterion below.

    Criteria (all bounds are strict):
      150 < MolW < 800 Da, -3.0 < AlogP < 5.0, RotB < 15,
      HBA < 10, HBD < 10, -2.0 < FC < 2.0.
    Descriptors are evaluated lazily, in that order; the first violated
    criterion ends the evaluation.
    """
    def outside(value, low, high):
        # True when value is NOT strictly inside ]low, high[
        return value <= low or value >= high

    rejections = (
        lambda: outside(Descriptors.MolWt(mol), 150, 800),
        lambda: outside(Descriptors.MolLogP(mol), -3.0, 5.0),
        lambda: Descriptors.NumRotatableBonds(mol) >= 15,
        lambda: Descriptors.NumHAcceptors(mol) >= 10,
        lambda: Descriptors.NumHDonors(mol) >= 10,
        lambda: outside(Chem.rdmolops.GetFormalCharge(mol), -2, 2),
    )
    for rejected in rejections:
        if rejected():
            return False
    return True # no criterion violated: drug-like
37 | 
def RobustSmilesMolSupplier(filename):
    """Yield (mol, smile, name) for each parsable line of a SMILES file.

    Each line must be exactly 'SMILES<TAB>name'; any other layout raises
    ValueError (TAB separation is enforced on purpose).
    Lines whose SMILES cannot be parsed are reported on stderr and skipped.
    """
    with open(filename) as f:
        for line in f:
            smile, name = line.strip().split("\t") # enforce TAB-separated
            try:
                mol = Chem.MolFromSmiles(smile)
            except Exception:
                mol = None
            # MolFromSmiles signals a parse failure by returning None,
            # not by raising: without this test, None reached the caller
            if mol is None:
                print("ERROR: cannot parse: %s" % line,
                      file=sys.stderr, end='')
                continue
            yield (mol, smile, name)
48 | 
# the SMILES input file is the first (and only) CLI argument
input_fn = sys.argv[1]

# echo on stdout only the molecules passing the drug-like filter
for mol, smile, name in RobustSmilesMolSupplier(input_fn):
    if not drug_like_filter(mol):
        continue
    print('%s\t%s' % (smile, name))
54 | 


--------------------------------------------------------------------------------
/bin/molenc_elements.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # output to stdout elements found in each molecule
 4 | # INPUT: SMILES file
 5 | # OUTPUT: one symbol per line; several times if element present multiple times;
 6 | #         hydrogens are made explicit
 7 | 
 8 | import sys
 9 | from rdkit import Chem
10 | 
def RobustSmilesMolSupplier(filename):
    """Yield one molecule per parsable line of a SMILES file.

    Each line must be exactly 'SMILES<TAB>name'; any other layout raises
    ValueError (TAB separation is enforced on purpose).
    Lines whose SMILES cannot be parsed are reported on stderr and skipped.
    """
    with open(filename) as f:
        for line in f:
            smi, _name = line.strip().split("\t") # enforce TAB-separated
            try:
                mol = Chem.MolFromSmiles(smi)
            except Exception:
                mol = None
            # a parse failure is signaled by None, not by an exception
            if mol is None:
                print("ERROR: cannot parse: %s" % line,
                      file=sys.stderr, end='')
                continue
            yield mol
20 | 
# the SMILES input file is the first (and only) CLI argument
input_fn = sys.argv[1]

for mol in RobustSmilesMolSupplier(input_fn):
    # hydrogens are made explicit so that they get listed too
    mol_H = Chem.AddHs(mol)
    symbols = [atom.GetSymbol() for atom in mol_H.GetAtoms()]
    # one element symbol per line, repeated once per atom
    for symbol in symbols:
        print(symbol)
27 | 


--------------------------------------------------------------------------------
/bin/molenc_fscan.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | #
 3 | # Fluorine scan of a molecule
 4 | # create all analogs of input molecule where one heavy atom
 5 | # at a time has all of its hydrogens replaced by F
 6 | # optionally, do this only for heteroatoms
 7 | 
 8 | import argparse, rdkit, sys
 9 | from rdkit import Chem
10 | 
def RobustSmilesMolSupplier(input_fn):
    """Yield (smi, name) for each non-blank line of a SMILES file.

    Fields are whitespace-separated: the first one is the SMILES and the
    last one is the name (for a single-field line, the SMILES is also the
    name). Blank lines are skipped instead of raising IndexError.
    """
    with open(input_fn) as f:
        for line in f:
            toks = line.split()
            if not toks:
                continue # blank line
            yield (toks[0], toks[-1])
20 | 
# fluorine atom (Z = 9): the replacement for hydrogens
fluor = Chem.Atom(9)

if __name__ == '__main__':
    # Entry point: for each input molecule, write the molecule itself,
    # then one analog per hydrogen-bearing heavy atom in which all the
    # hydrogens of that atom are replaced by fluorines.
    # CLI options parsing
    parser = argparse.ArgumentParser(
        description = "fluorine scan: one analog per heavy atom, "
        "with all hydrogens of that atom replaced by F")
    parser.add_argument("-i", metavar = "input.smi", dest = "input_fn",
                        help = "molecules input file")
    parser.add_argument("-o", metavar = "output.smi", dest = "output_fn",
                        help = "molecules output file")
    parser.add_argument('--hetero', dest='only_heteroatoms', action='store_true',
                        help = "only scan heteroatoms")
    # parse CLI
    if len(sys.argv) == 1:
        # show help in case user has no clue of what to do
        parser.print_help(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()
    input_fn = args.input_fn
    output_fn = args.output_fn
    only_hetero = args.only_heteroatoms
    with open(output_fn, 'w') as out:
        for smi, name in RobustSmilesMolSupplier(input_fn):
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                # unparsable SMILES: report and skip rather than
                # crashing in AddHs
                print("ERROR: cannot parse: %s\t%s" % (smi, name),
                      file=sys.stderr)
                continue
            # output original molecule first
            print("%s\t%s" % (smi, name), file=out)
            mol = Chem.AddHs(mol)
            # then output its fluorinated analogs
            count = 1
            for a in mol.GetAtoms():
                anum = a.GetAtomicNum()
                # heavy atom; carbons are excluded if --hetero was given
                if anum > 1 and ((not only_hetero) or anum != 6):
                    if a.GetTotalNumHs(includeNeighbors=True) >= 1:
                        # hydrogens attached: each analog starts from
                        # a fresh editable copy of the molecule
                        editable = Chem.EditableMol(mol)
                        for neighb in a.GetNeighbors():
                            if neighb.GetAtomicNum() == 1:
                                # Fluorine instead
                                editable.ReplaceAtom(neighb.GetIdx(), fluor)
                        edited = editable.GetMol()
                        analog_smi = Chem.MolToSmiles(edited)
                        print("%s\t%s_%d" % (analog_smi, name, count),
                              file=out)
                        count += 1
66 | 


--------------------------------------------------------------------------------
/bin/molenc_get_tag.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | #
 3 | # Extract given tags from an SDF file
 4 | # could also be called sdf2csv
 5 | 
 6 | import argparse, rdkit, sys
 7 | from rdkit import Chem
 8 | 
 9 | # m.GetProp but w/ a default value
def get_prop_default(m, prop, def_val):
    """Return m.GetProp(prop), or def_val if that lookup raises KeyError."""
    try:
        return m.GetProp(prop)
    except KeyError:
        return def_val
17 | 
def get_props(m, tags):
    """Return the values of tags in m, in order; '' stands for a missing tag."""
    return [get_prop_default(m, tag, '') for tag in tags]
24 | 
if __name__ == '__main__':
    # Entry point: write one CSV line per readable molecule of the input
    # SDF file, holding the values of the requested tags.
    # CLI options parsing
    parser = argparse.ArgumentParser(
        description = "extract given tags from an SDF file")
    parser.add_argument("-i", metavar = "input.sdf", dest = "input_fn",
                        help = "molecules input file")
    parser.add_argument("-o", metavar = "output.txt", dest = "output_fn",
                        help = "output file")
    parser.add_argument('-t', metavar = 'tag1,tag2,...', dest = "tags",
                        help = "comma-separated list of tags to extract")
    # parse CLI
    if len(sys.argv) == 1:
        # show help in case user has no clue of what to do
        parser.print_help(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()
    input_fn = args.input_fn
    output_fn = args.output_fn
    if args.tags is None:
        # without this, a missing -t died on None.strip()
        parser.error("option -t is required")
    tags = args.tags.strip().split(',')
    # -------------------------------------------------------------------------
    with open(output_fn, 'w') as out:
        # we just want to extract SDF tags; we don't care if rdkit
        # is unhappy w/ some molecules
        for mol in Chem.SDMolSupplier(input_fn, sanitize=False):
            if mol:
                props = get_props(mol, tags)
                # the trailing ',' before EOL makes empty fields obvious
                print(','.join('%s' % p for p in props) + ',', file=out)
58 | 


--------------------------------------------------------------------------------
/bin/molenc_histo.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | #
 3 | # usage: ./histo.py FILE:str STEPS:int
 4 | #        0          1        2
 5 | 
 6 | import sys
 7 | 
 8 | input_fn = sys.argv[1]
 9 | num_steps = float(int(sys.argv[2]))
10 | 
11 | # read in values
12 | floats = []
13 | for line in open(input_fn).readlines():
14 |     strip = line.strip()
15 |     x = float(strip)
16 |     floats.append(x)
17 | assert(len(floats) > 0)
18 | 
19 | min_val = min(floats)
20 | max_val = max(floats)
21 | assert(min_val < max_val)
22 | delta = (max_val - min_val) / num_steps
23 | print('DEBUG: min:%g max:%g steps:%d delta:%g' %
24 |       (min_val, max_val, num_steps, delta), file=sys.stderr)
25 | 
26 | # initialize the histogram
27 | histo = {}
28 | x = min_val
29 | i = 0
30 | while x <= max_val + delta:
31 |     histo[i] = 0
32 |     x += delta
33 |     i += 1
34 | 
35 | # finalize the histogram
36 | for x in floats:
37 |     assert(x >= min_val)
38 |     assert(x <= max_val)
39 |     hist_bin = int((x - min_val) / delta)
40 |     histo[hist_bin] += 1
41 | 
42 | # print out histogram
43 | x = min_val
44 | i = 0
45 | while x <= max_val + delta:
46 |     x_val = min_val + float(i) * delta
47 |     y_val = histo[i]
48 |     print('%f %d' % (x_val, y_val))
49 |     x += delta
50 |     i += 1
51 | 


--------------------------------------------------------------------------------
/bin/molenc_ifg.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | #  Original authors: Richard Hall and Guillaume Godin
 4 | #  This file is part of the RDKit.
 5 | #  The contents are covered by the terms of the BSD license
 6 | #  which is included in the file license.txt, found at the root
 7 | #  of the RDKit source tree.
 8 | 
 9 | # Richard hall 2017
10 | # IFG main code
11 | # Guillaume Godin 2017
12 | # refine output function
13 | # astex_ifg: identify functional groups a la Ertl, J. Cheminform (2017) 9:36
14 | from rdkit import Chem
15 | from collections import namedtuple
16 | import sys
17 | 
def RobustSmilesMolSupplier(filename):
    """Yield (name, mol) for each parsable line of a SMILES file.

    Fields are whitespace-separated: the first one is the SMILES, the
    remaining ones (joined by single spaces) form the name.
    Blank lines are skipped; lines whose SMILES cannot be parsed are
    reported on stderr and skipped.
    """
    with open(filename) as f:
        for line in f:
            words = line.split()
            if not words:
                continue # blank line
            smile = words[0]
            name = " ".join(words[1:]) # everything after the SMILES string
            mol = Chem.MolFromSmiles(smile)
            # a parse failure is signaled by None, not by an exception
            if mol is None:
                print("ERROR: cannot parse: %s" % line,
                      file=sys.stderr, end='')
                continue
            yield (name, mol)
25 | 
def merge(mol, marked, aset):
    """Grow aset, in place, with the marked atoms connected to it.

    Starting from the atom indexes in aset, repeatedly move from marked
    to aset every marked atom that neighbors an atom reached so far.
    Both sets are modified in place; nothing is returned.
    """
    frontier = set(aset)
    while frontier:
        reached = set()
        for idx in frontier:
            for nbr in mol.GetAtomWithIdx(idx).GetNeighbors():
                jdx = nbr.GetIdx()
                if jdx in marked:
                    marked.remove(jdx)
                    reached.add(jdx)
        aset.update(reached)
        # only the newly reached atoms can still bring in new neighbors
        frontier = reached
39 | 
# atoms connected by non-aromatic double or triple bond to any heteroatom
# c=O should not match (see fig1, box 15).  I think using A instead of * should sort that out?
PATT_DOUBLE_TRIPLE = Chem.MolFromSmarts('A=,#[!#6]')
# atoms in non-aromatic carbon-carbon double or triple bonds
PATT_CC_DOUBLE_TRIPLE = Chem.MolFromSmarts('C=,#C')
# acetal carbons, i.e. sp3 carbons connected to two or more oxygens, nitrogens or sulfurs; these O, N or S atoms must have only single bonds
PATT_ACETAL = Chem.MolFromSmarts('[CX4](-[O,N,S])-[O,N,S]')
# all atoms in oxirane, aziridine and thiirane rings
PATT_OXIRANE_ETC = Chem.MolFromSmarts('[O,N,S]1CC1')

# the four patterns above, in the order in which they are matched
PATT_TUPLE = (PATT_DOUBLE_TRIPLE, PATT_CC_DOUBLE_TRIPLE, PATT_ACETAL, PATT_OXIRANE_ETC)
51 | 
def identify_functional_groups(mol):
    """Return the list of functional groups found in mol.

    Each group is an 'IFG' named tuple with fields:
      atomIds: tuple of the atom indexes of the group,
      atoms:   SMILES of the fragment made of those atoms,
      type:    SMILES of the fragment made of those atoms plus
               their carbon neighbors.
    """
    marked = set()
    # mark all heteroatoms in a molecule, including halogens
    for atom in mol.GetAtoms():
        if atom.GetAtomicNum() in (6, 1): # would we ever have hydrogen?
            continue
        marked.add(atom.GetIdx())

    # mark the four specific types of carbon atom
    for patt in PATT_TUPLE:
        for match in mol.GetSubstructMatches(patt):
            marked.update(match)

    # merge all connected marked atoms to a single FG
    groups = []
    while marked:
        seed = marked.pop()
        grp = {seed}
        merge(mol, marked, grp)
        groups.append(grp)

    # extract also the carbon atoms connected to each group
    IFG = namedtuple('IFG', ['atomIds', 'atoms', 'type'])
    ifgs = []
    for grp in groups:
        carbons = {nbr.GetIdx()
                   for idx in grp
                   for nbr in mol.GetAtomWithIdx(idx).GetNeighbors()
                   if nbr.GetAtomicNum() == 6}
        ifgs.append(IFG(
            atomIds=tuple(grp),
            atoms=Chem.MolFragmentToSmiles(mol, grp, canonical=True),
            type=Chem.MolFragmentToSmiles(mol, grp.union(carbons),
                                          canonical=True)))
    return ifgs
83 | 
def main():
    """Print 'name number_of_functional_groups' for each input molecule.

    The SMILES input file is the first CLI argument; without it, a usage
    message is printed on stderr and the exit status is 1.
    Molecules that could not be parsed (None) are reported on stderr
    and skipped.
    """
    argc = len(sys.argv)
    if argc == 1:
        print('usage: ifg.py input.smi', file=sys.stderr)
        # sys.exit rather than the 'exit' helper injected by the site module
        sys.exit(1)
    input_fn = sys.argv[1]
    for name, mol in RobustSmilesMolSupplier(input_fn):
        if mol is None:
            print("ERROR: cannot parse molecule: %s" % name, file=sys.stderr)
            continue
        fgs = identify_functional_groups(mol)
        nb_fun_groups = len(fgs)
        print("%s %d" % (name, nb_fun_groups))

if __name__ == "__main__":
    main()
97 | 


--------------------------------------------------------------------------------
/bin/molenc_iupac.smi:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import sys
 4 | from STOUT import translate_forward, translate_reverse
 5 | 
 6 | # SMILES to IUPAC name translation
 7 | 
 8 | SMILES = sys.argv[1]
 9 | IUPAC_name = translate_forward(SMILES)
10 | print("IUPAC name of "+SMILES+" is: "+IUPAC_name)
11 | 
12 | # IUPAC name to SMILES translation
13 | 
14 | SMILES = translate_reverse(IUPAC_name)
15 | print("SMILES of "+IUPAC_name+" is: "+SMILES)
16 | 


--------------------------------------------------------------------------------
/bin/molenc_lead.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | #
 3 | # Copyright (C) 2022, Francois Berenger
 4 | # Tsuda laboratory, The University of Tokyo,
 5 | # 5-1-5 Kashiwa-no-ha, Kashiwa-shi, Chiba-ken, 277-8561, Japan.
 6 | #
 7 | # lead-like filter: only lead-like molecules will be printed on stdout
 8 | 
 9 | import sys
10 | 
11 | from rdkit import Chem
12 | from rdkit.Chem import Descriptors
13 | 
14 | # Oprea's lead-like filter                                                      
15 | # Hann, M. M., & Oprea, T. I. (2004).                                           
16 | # Pursuing the leadlikeness concept in pharmaceutical research.                 
17 | # Current opinion in chemical biology, 8(3), 255-263.                           
def lead_like(mol):
    """Return True if mol satisfies every lead-likeness criterion below.

    Criteria (all bounds are inclusive):
      MolW <= 460, -4.0 <= LogP <= 4.2, rotB <= 10,
      nRings <= 4 (number of SSSR rings, _not_ aromatic rings),
      HBD <= 5, HBA <= 9.
    The LogSw >= -5 criterion is ignored.
    Descriptors are evaluated lazily, in that order; the first violated
    criterion ends the evaluation.
    """
    def logp_violation():
        log_p = Descriptors.MolLogP(mol)
        return log_p < -4.0 or log_p > 4.2

    violations = (
        lambda: Descriptors.MolWt(mol) > 460,
        logp_violation,
        lambda: Descriptors.NumRotatableBonds(mol) > 10,
        lambda: len(Chem.GetSSSR(mol)) > 4,
        lambda: Descriptors.NumHDonors(mol) > 5,
        lambda: Descriptors.NumHAcceptors(mol) > 9,
    )
    for violated in violations:
        if violated():
            return False
    return True # no criterion violated: lead-like
40 | 
def RobustSmilesMolSupplier(filename):
    """Yield (mol, smile, name) for each parsable line of a SMILES file.

    Each line must be exactly 'SMILES<TAB>name'; any other layout raises
    ValueError (TAB separation is enforced on purpose).
    Lines whose SMILES cannot be parsed are reported on stderr and skipped.
    """
    with open(filename) as f:
        for line in f:
            smile, name = line.strip().split("\t") # enforce TAB-separated
            try:
                mol = Chem.MolFromSmiles(smile)
            except Exception:
                mol = None
            # MolFromSmiles signals a parse failure by returning None,
            # not by raising: without this test, None reached the caller
            if mol is None:
                print("ERROR: cannot parse: %s" % line,
                      file=sys.stderr, end='')
                continue
            yield (mol, smile, name)
51 | 
# the SMILES input file is the first (and only) CLI argument
input_fn = sys.argv[1]

# echo on stdout only the molecules passing the lead-like filter
for mol, smile, name in RobustSmilesMolSupplier(input_fn):
    if not lead_like(mol):
        continue
    print('%s\t%s' % (smile, name))
57 | 


--------------------------------------------------------------------------------
/bin/molenc_linker.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import rdkit, typing
 4 | from rdkit import Chem
 5 | from rdkit.Chem import AllChem
 6 | 
 7 | def create_PEG_chain(length: int):
 8 |     s = ''
 9 |     for i in range(length):
10 |         s += '[CH2R0X4][CH2R0X4][OH0R0X2]' # SMARTS for one PEG unit
11 |     #return s        
12 |     return Chem.MolFromSmarts(s)
13 | 
# assume longest linker is 10 units of PEG;
# a single PEG unit is assumed too short to be a proper linker,
# hence no peg01
peg_10_downto_2 = [create_PEG_chain(n) for n in range(10, 1, -1)]

# individual names for each chain length, longest first
(peg10, peg09, peg08,
 peg07, peg06, peg05,
 peg04, peg03, peg02) = peg_10_downto_2
27 | 
28 | # remove the PEG linker, if any
29 | # if not, the molecule is returned unchanged (either it has no linker, or
30 | # the linker is not PEG)
def cut_PEG_linker(mol):
    """Delete from mol the longest matching PEG pattern, if any.

    Returns (True, new_mol) when a pattern of peg_10_downto_2 matched,
    else (False, mol) with mol itself left untouched.
    """
    # patterns are tried from the longest to the shortest
    longest = next((patt for patt in peg_10_downto_2
                    if mol.HasSubstructMatch(patt)), None)
    if longest is None:
        return (False, mol)
    return (True, AllChem.DeleteSubstructs(mol, longest))
37 | 


--------------------------------------------------------------------------------
/bin/molenc_mol2smi.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # Copyright (C) 2020, Francois Berenger
 4 | # Yamanishi laboratory,
 5 | # Department of Bioscience and Bioinformatics,
 6 | # Faculty of Computer Science and Systems Engineering,
 7 | # Kyushu Institute of Technology,
 8 | # 680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan.
 9 | 
10 | # txt molecule to SMILES
11 | 
12 | import argparse, rdkit, re, sys, time
13 | import molenc_common as common
14 | from rdkit import Chem
15 | 
16 | # create a fake molecule for the corresp. fragment
def read_one_molecule(input):
    """Read the next text-encoded molecule from `input`; return (smi, name).

    `input` is an open text file positioned at the start of a molecule
    record: one atoms header line, one line per atom, one bonds header
    line, then one line per bond. All parsing of these lines is delegated
    to helpers of the `common` module.

    Raises common.End_of_file when the atoms header line is empty, which
    is the case at end of file and also for a blank line.
    A KekulizeException raised by sanitization is reported on stderr and
    a SMILES is still produced from the molecule as it is.
    """
    res_mol = Chem.RWMol()
    atoms_header = input.readline().strip()
    if atoms_header == '':
        raise common.End_of_file # no EOF in Python...
    nb_atoms, name = common.read_atoms_header(atoms_header)
    # atom index as found in the file -> atom index in res_mol
    old2new = {}
    for _i in range(nb_atoms):
        line = input.readline().strip()
        # nb_pi and nb_HA are parsed but not used here
        (index, nb_pi, atomic_num, nb_HA, charge, stereo) = \
          common.read_atom(line)
        # add atom
        a = Chem.Atom(atomic_num)
        a.SetFormalCharge(charge)
        if stereo > 0: # set chirality
            a.SetChiralTag(common.atom_stereo_code_to_chiral_tag(stereo))
        j = res_mol.AddAtom(a)
        # we need to convert atom indexes
        old2new[index] = j
    bonds_header = input.readline().strip()
    nb_bonds = common.read_bonds_header(bonds_header)
    # (bond index, stereo, stereo atom a, stereo atom b), set later on
    stereo_bonds = []
    for i in range(nb_bonds):
        line = input.readline().strip()
        (start_i, bt, stop_i, (stereo, c, d)) = common.read_bond(line)
        start = old2new[start_i]
        stop = old2new[stop_i]
        # add bond
        n = res_mol.AddBond(start, stop, bt)
        if stereo != rdkit.Chem.rdchem.BondStereo.STEREONONE:
            # NOTE(review): presumably AddBond returns the new number of
            # bonds, so n - 1 is the index of the bond just added -- confirm
            bi = n - 1
            # convert stereo bond stereo atoms indexes
            a = old2new[c]
            b = old2new[d]
            stereo_bonds.append((bi, stereo, a, b))
    # all atoms and bonds are here now
    # so stereo bonds info can be set
    for (bi, stereo, a, b) in stereo_bonds:
        bond = res_mol.GetBondWithIdx(bi)
        bond.SetStereo(stereo)
        bond.SetStereoAtoms(a, b)
        # each stereo bond is logged on stderr
        print('%s stereo %s on bond %d (%d, %d)' %
              (name, common.char_of_bond_stereo(stereo), bi, a, b),
              file=sys.stderr)
    try:
        Chem.SanitizeMol(res_mol)
        Chem.AssignStereochemistry(res_mol) # ! MANDATORY; AFTER SanitizeMol !
    except rdkit.Chem.rdchem.KekulizeException:
        # only this exception type is caught; any other one propagates
        print("KekulizeException in %s" % name, file=sys.stderr)
    smi = Chem.MolToSmiles(res_mol)
    return (smi, name)
68 | 
if __name__ == '__main__':
    # Entry point: convert each text-encoded molecule of the input file
    # into one 'SMILES<TAB>name' line of the output file, then report
    # the throughput on stderr.
    before = time.time()
    # CLI options parsing
    parser = argparse.ArgumentParser(description = "txt molecule to smi")
    parser.add_argument("-i", metavar = "input.mols", dest = "input_fn",
                        help = "molecules input file")
    parser.add_argument("-o", metavar = "output.smi", dest = "output_fn",
                        help = "output file")
    # parse CLI
    if len(sys.argv) == 1:
        # show help in case user has no clue of what to do
        parser.print_help(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()
    input_fn = args.input_fn
    count = 0
    # both files are closed by the with statement, even on error
    with open(args.output_fn, 'w') as output, open(input_fn) as mols_in:
        try:
            while True:
                smi, name = read_one_molecule(mols_in)
                count += 1
                print('%s\t%s' % (smi, name), file=output)
        except common.End_of_file:
            pass # regular end of the input
    after = time.time()
    dt = after - before
    # guard against a null elapsed time (coarse clock, tiny input)
    rate = count / dt if dt > 0 else float('inf')
    print("%d molecules at %.2f molecule/s" %
          (count, rate), file=sys.stderr)
99 | 


--------------------------------------------------------------------------------
/bin/molenc_panascan.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # Copyright (C) 2021, Francois Berenger
 4 | # Yamanishi laboratory,
 5 | # Department of Bioscience and Bioinformatics,
 6 | # Faculty of Computer Science and Systems Engineering,
 7 | # Kyushu Institute of Technology,
 8 | # 680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan.
 9 | 
10 | # Implementation of
11 | # "Positional Analogue Scanning: An Effective Strategy for
12 | # Multiparameter Optimization in Drug Design".
13 | # Pennington, L. D., Aquila, B. M., Choi, Y., Valiulin, R. A., & Muegge, I.
14 | # Journal of medicinal chemistry (2020).
15 | # https://doi.org/10.1021/acs.jmedchem.9b02092
16 | 
17 | import argparse
18 | import rdkit
19 | import time
20 | import random
21 | from rdkit import Chem
22 | from rdkit.Chem import AllChem
23 | import sys
24 | 
25 | from molenc_common import RobustSmilesMolSupplier
26 | 
def positional_analog_scan(mol, smarts_patt = '[cH]',
                           smi_substs = ['N','CF','CC','CO',
                                         'CCN','CCl','CC(F)(F)(F)','COC']):
    """Return the SMILES of the analogs of mol, without duplicates.

    For each substituent SMILES of smi_substs, in order, the matches of
    smarts_patt in mol are replaced by that substituent via
    AllChem.ReplaceSubstructs; each resulting molecule is written as
    SMILES. Analogs are listed in order of first appearance.
    """
    patt = Chem.MolFromSmarts(smarts_patt)
    # dict keys: an insertion-ordered set of SMILES strings
    seen = {}
    for smi in smi_substs:
        subst = Chem.MolFromSmiles(smi)
        for analog in AllChem.ReplaceSubstructs(mol, patt, subst):
            # canonicalization; a SMILES already seen is not added again
            seen.setdefault(Chem.MolToSmiles(analog), None)
    return list(seen)
43 | 
if __name__ == '__main__':
    # Entry point: write the positional analogs of each input molecule
    # (all of them, or a single random one with --rand-one), then report
    # the throughput on stderr.
    before = time.time()
    # CLI options
    parser = argparse.ArgumentParser(
        description = "Positional Analog Scanning of each input molecule")
    parser.add_argument("-i", metavar = "input.smi", dest = "input_fn",
                        help = "molecules input file")
    parser.add_argument("-o", metavar = "output.smi", dest = "output_fn",
                        help = "analogs output file")
    parser.add_argument("--rand-one", dest = "rand_one", action = "store_true",
                        default = False,
                        help = "output only one randomly-chosen analog \
                        per input molecule")
    # parse CLI ----------------------------------------------
    if len(sys.argv) == 1:
        # user has no clue of what to do -> usage
        parser.print_help(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()
    input_fn = args.input_fn
    rand_one = args.rand_one
    count = 0
    # work ----------------------------------------------
    # the output file is closed by the with statement, even on error
    with open(args.output_fn, 'w') as output:
        mol_supplier = RobustSmilesMolSupplier(input_fn)
        for name, mol in mol_supplier:
            analogs = positional_analog_scan(mol)
            if rand_one:
                if analogs:
                    ana_smi = random.choice(analogs)
                    print("%s\t%s_ANA%03d" % (ana_smi, name, 0),
                          file=output)
                else:
                    # random.choice raises IndexError on an empty list
                    print("WARN: no analog for %s" % name, file=sys.stderr)
            else: # print them all
                for i, ana_smi in enumerate(analogs):
                    print("%s\t%s_ANA%03d" % (ana_smi, name, i),
                          file=output)
            count += 1
    after = time.time()
    dt = after - before
    print("%d molecules at %.2f mol/s" % (count, count / dt), file=sys.stderr)
85 | 


--------------------------------------------------------------------------------
/bin/molenc_qed.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | #
 3 | # Copyright (C) 2023, Francois Berenger
 4 | # Tsuda laboratory, The University of Tokyo,
 5 | # 5-1-5 Kashiwa-no-ha, Kashiwa-shi, Chiba-ken, 277-8561, Japan.
 6 | #
 7 | # Compute QED for each input SMILES
 8 | # scores about 300 molecule/s on a single core
 9 | #
10 | # requires: installing qed from sources (pip3 package broken as of 23/01/2023)
11 | # cf. https://github.com/silicos-it/qed
12 | # an open-source implementation of "Quantifying the chemical beauty of drugs"
13 | # https://doi.org/10.1038/nchem.1243
14 | 
15 | import argparse, rdkit, sys
16 | from qed import qed
17 | from rdkit import Chem
18 | 
def RobustSmilesMolSupplier(filename):
    """Yield (line_index, mol, name) for each non-blank line of a SMILES file.

    Fields are whitespace-separated: SMILES first, name second.
    line_index is the 0-based index of the line in the file.
    mol is whatever Chem.MolFromSmiles returns for the SMILES (the caller
    is expected to test it against None).
    Blank lines are skipped; for a line without a name, the SMILES itself
    is used as the name (previously: IndexError in both cases).
    """
    with open(filename) as f:
        for i, line in enumerate(f):
            words = line.split()
            if not words:
                continue # blank line
            smile = words[0]
            name = words[1] if len(words) > 1 else smile
            yield (i, Chem.MolFromSmiles(smile), name)
26 | 
def main():
    """Write 'name<TAB>QED score' for each parsable input molecule.

    Molecules that could not be parsed (None) are only counted as errors.
    A summary 'read: N errors: M' is printed on stderr, N being the
    number of molecules read, errors included.
    """
    # CLI options parsing
    parser = argparse.ArgumentParser(
        description = "Compute Quantitative Estimate of Drug-likeness (QED)")
    parser.add_argument("-i", metavar = "input_smi", dest = "input_smi",
                        help = "input SMILES file")
    parser.add_argument("-o", metavar = "output_tsv", dest = "output_tsv",
                        help = "output TSV file")
    # parse CLI
    if len(sys.argv) == 1:
        # show help in case user has no clue of what to do
        parser.print_help(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()
    input_smi = args.input_smi
    output_tsv = args.output_tsv
    read_count = 0 # all molecules read, parsable or not
    error_count = 0
    with open(output_tsv, 'w') as out_file:
        for _i, mol, name in RobustSmilesMolSupplier(input_smi):
            read_count += 1
            if mol is None:
                error_count += 1
                continue
            score = qed.default(mol, False)
            print("%s\t%f" % (name, score), file=out_file)
    print("read: %d errors: %d" % (read_count, error_count),
          file=sys.stderr)

if __name__ == '__main__':
    main()
59 | 


--------------------------------------------------------------------------------
/bin/molenc_rbonds_filter.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | #
 3 | # Copyright (C) 2023, Francois Berenger
 4 | # Tsuda laboratory, Tokyo University,
 5 | # 5-1-5 Kashiwa-no-ha, Kashiwa-shi, Chiba-ken, 277-8561, Japan.
 6 | #
 7 | # Only keep molecules with an acceptable number of rotatable bonds
 8 | 
 9 | import argparse, re, sys, time
10 | from rdkit import Chem
11 | from rdkit.Chem import Descriptors
12 | 
# a single whitespace character; raw string, since '\s' in a regular
# string literal is an invalid escape sequence
regex = re.compile(r'\s')

def find_whitespace(s):
    """Return the index of the first whitespace character of s, or -1 if none."""
    m = regex.search(s)
    if m is None:
        return -1
    return m.start()
21 | 
def parse_smiles_line(line):
    """Return Chem.MolFromSmiles applied to the SMILES field of line.

    The SMILES is everything before the first whitespace character, or
    the whole line if there is none (molecule without a name).
    The name field is not needed here: the caller writes out the
    original line, so the dead code extracting it was removed.
    """
    fst_white = find_whitespace(line)
    if fst_white == -1:
        # no whitespace separator: the whole line is the SMILES
        smi = line
    else:
        smi = line[0:fst_white]
    return Chem.MolFromSmiles(smi)
36 | 
def rbonds_filter(max_rbonds, mol):
    """Return True if mol has at most max_rbonds rotatable bonds.

    A negative max_rbonds (the CLI default is -1) means no limit: every
    molecule is accepted. Previously, that default rejected all molecules.
    """
    if max_rbonds < 0:
        return True # no limit
    return Descriptors.NumRotatableBonds(mol) <= max_rbonds
42 | 
if __name__ == '__main__':
    # Entry point: copy to the output file the input lines whose molecule
    # passes the rotatable bonds filter, then report statistics on stderr.
    before = time.time()
    # CLI options parsing
    parser = argparse.ArgumentParser(
        description = "filter out molecules w/ too many rotatable bonds")
    parser.add_argument("-i", metavar = "input.smi", dest = "input_fn",
                        help = "molecules input file")
    parser.add_argument("-o", metavar = "output.smi", dest = "output_fn",
                        help = "molecules output file")
    parser.add_argument('-r', metavar = "MAX_ROT_BONDS_INT", dest='max_rbonds',
                        default=-1, type=int,
                        help = "maximum number of rotatable bonds allowed (default=NO_LIMIT)")
    # parse CLI ---------------------------------------------------------------
    if len(sys.argv) == 1:
        # user has no clue of what to do -> usage
        parser.print_help(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()
    input_fn = args.input_fn
    output_fn = args.output_fn
    max_rbonds = args.max_rbonds
    # parse CLI end -----------------------------------------------------------
    count = 0
    errors = 0 # number of molecules not written out
    with open(output_fn, 'w') as out, open(input_fn, 'r') as mols_in:
        for line in mols_in:
            mol = parse_smiles_line(line.strip())
            if mol is None:
                # unparsable SMILES: report and drop rather than crashing
                # in the descriptor computation
                print("ERROR: cannot parse: %s" % line,
                      file=sys.stderr, end='')
                errors += 1
            elif rbonds_filter(max_rbonds, mol):
                out.write("%s" % line)
            else:
                errors += 1
            count += 1
    after = time.time()
    dt = after - before
    print("%d molecules @ %.2fHz; removed %d" % (count, count / dt, errors),
          file=sys.stderr)
79 | 


--------------------------------------------------------------------------------
/bin/molenc_regr_stats.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | #
 3 | # output R2 and RMSE regression statistics from a file containing pairs
 4 | # of float values (one blank-separated pair per line; no header)
 5 | 
 6 | import sklearn, sys
 7 | 
 8 | from sklearn.metrics import r2_score, root_mean_squared_error
 9 | 
def read_pair(line):
    """Parse the first two blank-separated fields of a line as floats.

    Return them as an (x, y) tuple; any extra field is ignored.
    """
    fields = line.strip().split()
    return (float(fields[0]), float(fields[1]))
16 | 
# Script entry point: read (x, y) pairs from the file given on the command
# line, then print the R2 and RMSE computed by scikit-learn.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("usage: %s pairs.txt" % sys.argv[0], file=sys.stderr)
        sys.exit(1)
    input_fn = sys.argv[1]
    xs = []
    ys = []
    # the context manager guarantees that the file is closed
    with open(input_fn) as pairs_file:
        for line in pairs_file:
            if line.strip() == '':
                # tolerate blank lines (e.g. a trailing one)
                continue
            x, y = read_pair(line)
            xs.append(x)
            ys.append(y)
    # NOTE(review): r2_score(y_true, y_pred) is not symmetric; the first
    # column is passed as the reference values - confirm this is intended
    r2 = r2_score(xs, ys)
    rmse = root_mean_squared_error(xs, ys)
    print('R2=%.3f RMSE=%.3f' % (r2, rmse))
28 | 


--------------------------------------------------------------------------------
/bin/molenc_scaffold.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | # Copyright (C) 2020, Francois Berenger
  4 | # Yamanishi laboratory,
  5 | # Department of Bioscience and Bioinformatics,
  6 | # Faculty of Computer Science and Systems Engineering,
  7 | # Kyushu Institute of Technology,
  8 | # 680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan.
  9 | #
 10 | # Compute the Bemis-Murcko generic scaffold (framework)
 11 | # of each input molecule.
 12 | #
 13 | # Bemis, G. W., & Murcko, M. A. (1996).
 14 | # "The properties of known drugs. 1. Molecular frameworks."
 15 | # Journal of medicinal chemistry, 39(15), 2887-2893.
 16 | 
 17 | import argparse, rdkit, sys
 18 | from rdkit import Chem
 19 | 
 20 | def RobustSmilesMolSupplier(filename):
 21 |     with open(filename) as f:
 22 |         for line in f:
 23 |             words = line.split()
 24 |             smi = words[0]
 25 |             name = words[1]
 26 |             mol = Chem.MolFromSmiles(smi)
 27 |             yield (smi, name, mol)
 28 | 
 29 | def find_terminal_atoms(mol):
 30 |     res = []
 31 |     for a in mol.GetAtoms():
 32 |         if len(a.GetBonds()) == 1:
 33 |             res.append(a)
 34 |     return res
 35 | 
 36 | def BemisMurckoFramework(mol):
 37 |     # keep only Heavy Atoms (HA)
 38 |     only_HA = rdkit.Chem.rdmolops.RemoveHs(mol)
 39 |     # switch all HA to Carbon
 40 |     rw_mol = Chem.RWMol(only_HA)
 41 |     for i in range(rw_mol.GetNumAtoms()):
 42 |         rw_mol.ReplaceAtom(i, Chem.Atom(6))
 43 |     # switch all non single bonds to single
 44 |     non_single_bonds = []
 45 |     for b in rw_mol.GetBonds():
 46 |         if b.GetBondType() != Chem.BondType.SINGLE:
 47 |             non_single_bonds.append(b)
 48 |     for b in non_single_bonds:
 49 |         j = b.GetBeginAtomIdx()
 50 |         k = b.GetEndAtomIdx()
 51 |         rw_mol.RemoveBond(j, k)
 52 |         rw_mol.AddBond(j, k, Chem.BondType.SINGLE)
 53 |     # as long as there are terminal atoms, remove them
 54 |     terminal_atoms = find_terminal_atoms(rw_mol)
 55 |     while terminal_atoms != []:
 56 |         for a in terminal_atoms:
 57 |             for b in a.GetBonds():
 58 |                 rw_mol.RemoveBond(b.GetBeginAtomIdx(), b.GetEndAtomIdx())
 59 |             rw_mol.RemoveAtom(a.GetIdx())
 60 |         terminal_atoms = find_terminal_atoms(rw_mol)
 61 |     return rw_mol.GetMol()
 62 | 
 63 | def main():
 64 |     # CLI options parsing
 65 |     parser = argparse.ArgumentParser(
 66 |         description = "Append Bemis-Murcko scaffold to each input molecule")
 67 |     parser.add_argument("-i", metavar = "input_smi", dest = "input_smi",
 68 |                         help = "input SMILES file")
 69 |     parser.add_argument("-o", metavar = "output_smi", dest = "output_smi",
 70 |                         help = "output SMILES file")
 71 |     parser.add_argument('--new-line', dest='new_line',
 72 |                         action='store_true', default=False,
 73 |                         help = "insert a newline before the scaffold")
 74 |     # parse CLI
 75 |     if len(sys.argv) == 1:
 76 |         # show help in case user has no clue of what to do
 77 |         parser.print_help(sys.stderr)
 78 |         sys.exit(1)
 79 |     args = parser.parse_args()
 80 |     input_smi = args.input_smi
 81 |     output_smi = args.output_smi
 82 |     new_line = args.new_line
 83 |     out_count = 0
 84 |     error_count = 0
 85 |     with open(output_smi, 'w') as out_file:
 86 |         for smi, name, mol in RobustSmilesMolSupplier(input_smi):
 87 |             if mol is None:
 88 |                 error_count += 1
 89 |             else:
 90 |                 scaff = BemisMurckoFramework(mol)
 91 |                 scaff_smi = Chem.MolToSmiles(scaff)
 92 |                 if new_line:
 93 |                     print("%s\t%s\n%s" % (smi, name, scaff_smi), file=out_file)
 94 |                 else:
 95 |                     print("%s\t%s\t%s" % (smi, name, scaff_smi), file=out_file)
 96 |                 out_count += 1
 97 |     total_count = out_count + error_count
 98 |     print("encoded: %d errors: %d total: %d" %
 99 |           (out_count, error_count, total_count),
100 |           file=sys.stderr)
101 | 
102 | if __name__ == '__main__':
103 |     main()
104 | 


--------------------------------------------------------------------------------
/bin/molenc_scan.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # wildcard atom scan of molecules
 4 | # for each molecule, output all variants where
 5 | # a single heavy atom at a time is switched to the SMILES wildcard atom '*'
 6 | 
 7 | import rdkit, sys, time
 8 | from rdkit import Chem
 9 | # from rdkit.Chem import rdChemReactions
10 | 
def RobustSmilesMolSupplier(filename):
    """Yield (name, smile) for each non-blank line of a SMILES file.

    The first blank-separated field is the SMILES; the name is everything
    after it, with runs of blanks collapsed to a single space (possibly
    the empty string). Blank lines are skipped instead of raising
    IndexError.
    """
    with open(filename) as f:
        for line in f:
            words = line.split()
            if not words:
                continue
            smile = words[0]
            name = " ".join(words[1:]) # everything after the SMILES string
            yield (name, smile)
18 | 
# Script entry point: for each input molecule, print it on stdout followed
# by one variant per atom where that atom is replaced by the wildcard '*'.
if __name__ == '__main__':
    before = time.time()
    argc = len(sys.argv)
    if argc != 2:
        print("usage: %s input.smi" % sys.argv[0])
        sys.exit(1)
    input_fn = sys.argv[1]
    count = 0
    wildcard = Chem.Atom(0)
    for name, orig_smile in RobustSmilesMolSupplier(input_fn):
        mol = Chem.MolFromSmiles(orig_smile)
        if mol is None:
            # unparsable SMILES: report it and go on with the next
            # molecule instead of crashing on mol.GetNumAtoms()
            print("ERROR: cannot parse: %s\t%s" % (orig_smile, name),
                  file=sys.stderr)
            continue
        # output original molecule first
        print("%s\t%s" % (orig_smile, name))
        num_atoms = mol.GetNumAtoms()
        # then output its variants
        for i in range(num_atoms):
            # start again from the unmodified molecule for each variant
            editable = Chem.EditableMol(mol)
            editable.ReplaceAtom(i, wildcard, preserveProps=True)
            edited = editable.GetMol()
            smi = Chem.MolToSmiles(edited)
            print("%s\t%s_%d" % (smi, name, i))
        count += 1
    after = time.time()
    dt = after - before
    print("%d molecules at %.2f mol/s" % (count, count / dt), file=sys.stderr)
44 | 
45 | # # original code by @Iwatobipen
46 | # # replace any aromatic carbon to aromatic nitrogen.
47 | # # TODO: does not compile
48 | # def nitrogen_scan(mol_in):
49 | #     out_mol_list = []
50 | #     used = set()
51 | #     rxn = rdChemReactions.ReactionFromSmarts("[c:1][H]>>[n:1]")
52 | #     products = rxn.RunReactants([mol_in])
53 | #     for p in products:
54 | #         smi = Chem.MolToSmiles(Chem.RemoveHs(p))
55 | #         if smi not in used:
56 | #             used.add(smi)
57 | #             out_mol_list.append(p)
58 | #     return out_mol_list
59 | 


--------------------------------------------------------------------------------
/bin/molenc_scan.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #set -x # DEBUG
 4 | 
 5 | # check params
 6 | if [ "$#" -ne 3 ]; then
 7 |     #            0         1         2          3
 8 |     echo "usage: molenc.sh input.smi output.txt features.dix"
 9 |     exit 1
10 | fi
11 | 
12 | input=$1
13 | output=$2
14 | dico=$3
15 | 
16 | std_log=$input'.std_log'
17 | tmp=`mktemp`
18 | tmp_smi=$tmp'_std.smi'
19 | tmp_scan=$tmp'_scan.smi'
20 | tmp_types=$tmp'_std.types'
21 | tmp_enc=$tmp'_std.enc'
22 | 
23 | # tell user how to install standardiser if not here
24 | which standardiser 2>&1 > /dev/null || \
25 |     echo 'ERROR: type: pip3 install chemo-standardizer'
26 | 
27 | echo standardizing molecules...
28 | (standardiser -i $input -o $tmp_smi 2>&1) > $std_log
29 | echo wildcard scan...
30 | molenc_scan.py $tmp_smi > $tmp_scan
31 | echo typing atoms...
32 | molenc_type_atoms.py $tmp_scan > $tmp_types
33 | echo encoding molecules...
34 | molenc_e -i $tmp_types -r 0:1 -o $tmp_enc
35 | molenc_d -i $tmp_enc -o $output -d $dico
36 | 
37 | # cleanup
38 | rm -f $std_log $tmp $tmp_smi $tmp_scan $tmp_types $tmp_enc
39 | 


--------------------------------------------------------------------------------
/bin/molenc_sdf2smi.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import argparse
 4 | 
 5 | from rdkit import Chem
 6 | from rdkit.Chem.rdmolfiles import SmilesWriter
 7 | 
 8 | parser = argparse.ArgumentParser()
 9 | parser.add_argument('inputfile', help="sdf input file")
10 | parser.add_argument('outputfile', help="smi output file")
11 | args = parser.parse_args()
12 | sdf = Chem.SDMolSupplier(args.inputfile)
13 | writer = SmilesWriter(args.outputfile, delimiter='\t', includeHeader=False)
14 | 
15 | for mol in sdf:
16 |   writer.write(mol)
17 | writer.close()
18 | 


--------------------------------------------------------------------------------
/bin/molenc_sdf_strip.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # remove listed tags from a .sdf file
 4 | # usage: molenc_sdf_strip.py input.sdf "<TAG1>,<TAG2>,..." > stripped.sdf
 5 | 
 6 | import sys
 7 | 
 8 | sdf_fn = sys.argv[1]
 9 | tags = sys.argv[2] # coma-separated list of tags to remove
10 | 
11 | tags_list = tags.split(',')
12 | tags_set = set(tags_list)
13 | 
14 | skip = False
15 | 
def endswith_any(line, tset):
    """Return True if line ends with at least one of the tags in tset."""
    return any(line.endswith(tag) for tag in tset)
21 | 
# Copy the SDF to stdout, dropping each listed tag line and the line that
# follows it; the context manager guarantees that the file is closed.
with open(sdf_fn) as sdf_file:
    for line in sdf_file:
        stripped = line.strip()
        if endswith_any(stripped, tags_set):
            skip = True # the tag line itself
        elif skip:
            skip = False # the line after
        else:
            print(line, end='') # any other line
30 | 


--------------------------------------------------------------------------------
/bin/molenc_smi2cansmi.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # SMILES to RdKit canonical SMILES
 4 | 
 5 | import sys
 6 | 
 7 | from rdkit import Chem
 8 | 
 9 | def RobustSmilesMolSupplier(filename):
10 |     with open(filename) as f:
11 |         for line in f:
12 |             smile, name = line.strip().split("\t") # enforce TAB-separated
13 |             try:
14 |                 mol = Chem.MolFromSmiles(smile)
15 |                 cano_smi = Chem.MolToSmiles(mol)
16 |                 yield (cano_smi, name)
17 |             except Exception:
18 |                 print("ERROR: cannot parse: %s" % line,
19 |                       file=sys.stderr, end='')
20 | 
# print each (canonical SMILES, name) pair as one TAB-separated line
input_fn = sys.argv[1]

for entry in RobustSmilesMolSupplier(input_fn):
    cano_smi, name = entry
    print('%s\t%s' % (cano_smi, name))
25 | 


--------------------------------------------------------------------------------
/bin/molenc_smi2png.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import rdkit, sys
 4 | from rdkit import Chem
 5 | from rdkit.Chem import Draw
 6 | 
 7 | input_smi = sys.argv[1]
 8 | output_png = sys.argv[2]
 9 | 
10 | # WARNING: only read and consider first line of input SMILES file
11 | with open(input_smi, 'r') as input:
12 |     with open(output_png, 'wb') as output:
13 |         line = input.readline()
14 |         line.strip()
15 |         split = line.split()
16 |         smi = split[0]
17 |         name = split[1]
18 |         mol = Chem.MolFromSmiles(smi)
19 |         assert(mol != None)
20 |         d2d = Draw.MolDraw2DCairo(-1,-1)
21 |         Draw.DrawMoleculeACS1996(d2d, mol, legend=name)
22 |         pix = d2d.GetDrawingText()
23 |         output.write(pix)
24 | 


--------------------------------------------------------------------------------
/bin/molenc_stable.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | #
 3 | # Copyright (C) 2024, Francois Berenger
 4 | # Tsuda laboratory, The University of Tokyo,
 5 | # 5-1-5 Kashiwa-no-ha, Kashiwa-shi, Chiba-ken, 277-8561, Japan.
 6 | #
 7 | # stable filter: only non-reactive molecules are printed on stdout
 8 | # input line format: <SMILES:str>\t<NAME:str>
 9 | # output line format: same as input
10 | 
11 | import sys
12 | 
13 | from rdkit import Chem
14 | from rdkit.Chem import Descriptors
15 | 
def RobustSmilesMolSupplier(filename):
    """Yield (mol, line) for each line whose SMILES could be parsed.

    The SMILES is the first TAB-separated field of the line; line is the
    unmodified input line (trailing newline included).
    Unparsable lines are reported on stderr and skipped.
    """
    with open(filename) as f:
        for line in f:
            splits = line.strip().split("\t") # enforce TAB-separated
            smile = splits[0]
            try:
                mol = Chem.MolFromSmiles(smile)
            except Exception:
                mol = None
            # MolFromSmiles signals a parse failure by returning None, not
            # by raising: test it explicitly so that None is never yielded
            if mol is None:
                # line already ends with a newline
                print("ERROR: cannot parse: %s" % line,
                      file=sys.stderr, end='')
                continue
            yield (mol, line)
27 | 
28 | # Lisurek, M., Rupp, B., Wichard, J., Neuenschwander, M., von Kries, J. P.,
29 | # Frank, R., ... & Kühne, R. (2010).
30 | # Design of chemical libraries with potentially bioactive molecules applying
31 | # a maximum common substructure concept. Molecular diversity, 14(2), 401-408.
32 | # SMARTS patterns kindly provided by Michael Lisurek
# substructures flagging a molecule as reactive; one SMARTS pattern each
pat1 = Chem.MolFromSmarts('[C,c]S(=O)(=O)[F,Cl,Br,I]') # sulfonylhalide
pat2 = Chem.MolFromSmarts('[C,c]S(=O)(=O)O[CX4]') # sulfone_ester
pat3 = Chem.MolFromSmarts('C(=O)[F,Cl,Br,I]') # acylhalide
pat4 = Chem.MolFromSmarts('O=COC=O') # acidanhydride
pat5 = Chem.MolFromSmarts('c1([F,Cl,Br,I])ncccn1') # 2-halo_pyrimidine
# aldehyde: the provided pattern was '[H]C=O', but [H] only matches an
# explicit hydrogen atom and molecules read from SMILES carry implicit
# hydrogens only, so it could never match; test the H count of the carbon
pat6 = Chem.MolFromSmarts('[C;!H0]=O') # aldehyde
pat7 = Chem.MolFromSmarts('C(=O)C(=O)') # 1,2-dicarbonyl
pat8 = Chem.MolFromSmarts('C1OC1') # epoxide
pat9 = Chem.MolFromSmarts('C1NC1') # aziridine
pat10 = Chem.MolFromSmarts('C(=O)S') # thioester
pat11 = Chem.MolFromSmarts('[#7]!@[#7]') # hydrazine
pat12 = Chem.MolFromSmarts('C=[CH2]') # ethenes
pat13 = Chem.MolFromSmarts('[H,*,!N][N;!R]=[C;!R]([*,H])[*,H]') # imine
pat14 = Chem.MolFromSmarts('[CX4]I') # alkyl_iodide
pat15 = Chem.MolFromSmarts('[Se]') # selenide
pat16 = Chem.MolFromSmarts('O-O') # peroxide
pat17 = Chem.MolFromSmarts('[NX3]!@[OX2]') # hetero-hetero_single_bond
pat18 = Chem.MolFromSmarts('[NX3]!@[NX3]') # hetero-hetero_single_bond
pat19 = Chem.MolFromSmarts('[NX3]!@[SX2]') # hetero-hetero_single_bond
pat20 = Chem.MolFromSmarts('[SX2]!@[SX2]') # hetero-hetero_single_bond
pat21 = Chem.MolFromSmarts('[SX2]!@[OX2]') # hetero-hetero_single_bond
54 | 
def stable_filter(mol):
    """Return True only if mol matches none of the reactive patterns."""
    reactive_patterns = (pat1, pat2, pat3, pat4, pat5, pat6, pat7,
                         pat8, pat9, pat10, pat11, pat12, pat13, pat14,
                         pat15, pat16, pat17, pat18, pat19, pat20, pat21)
    # patterns are tried in order; the first match settles the answer
    for pattern in reactive_patterns:
        if mol.HasSubstructMatch(pattern):
            return False
    return True
78 | 
input_fn = sys.argv[1]

for mol, line in RobustSmilesMolSupplier(input_fn):
    if not stable_filter(mol):
        continue
    # exact input lines replicated to the output
    print(line, end='')
85 | 


--------------------------------------------------------------------------------
/bin/molenc_thash.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | #
 3 | # Copyright (C) 2024, Francois Berenger
 4 | # Tsuda laboratory, The University of Tokyo,
 5 | # 5-1-5 Kashiwa-no-ha, Kashiwa-shi, Chiba-ken, 277-8561, Japan.
 6 | #
 7 | # append to stdout <TAB>tautomer_hash to lines of a provided SMILES file
 8 | 
 9 | import rdkit, sys, typing
10 | from rdkit import Chem
11 | from rdkit.Chem import RegistrationHash
12 | from rdkit.Chem.RegistrationHash import HashLayer
13 | 
input_fn = sys.argv[1]

def get_rdkit_tautomer_hash(smi: str) -> str:
    """Return the TAUTOMER_HASH layer of RDKit's registration hash.

    Raise ValueError if the SMILES cannot be parsed by RDKit.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        raise ValueError("cannot parse SMILES: %s" % smi)
    layers = RegistrationHash.GetMolLayers(mol)
    return layers[HashLayer.TAUTOMER_HASH]

# the context manager guarantees that the input file is closed
with open(input_fn) as smi_file:
    for line in smi_file:
        stripped = line.strip()
        if stripped == '':
            # tolerate blank lines (e.g. a trailing one)
            continue
        smi = stripped.split()[0]
        try:
            taut_hash = get_rdkit_tautomer_hash(smi)
        except ValueError as err:
            # one bad molecule must not abort the whole run
            print("ERROR: %s" % err, file=sys.stderr)
            continue
        print('%s\t%s' % (stripped, taut_hash))
26 | 


--------------------------------------------------------------------------------
/bin/molenc_type_atoms.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | # type atoms of a molecule a la atom pairs
  4 | # (nb. pi electrons if > 0, elt. symbol, nbHA neighbors)
  5 | # the seminal implementation of atom pairs ignored formal charges;
  6 | # here, a non-zero formal charge is appended to the atom type
  7 | 
  8 | import argparse, molenc_common, os, rdkit, sys, time
  9 | from enum import Enum
 10 | from rdkit import Chem
 11 | from rdkit import RDConfig
 12 | from rdkit.Chem import AllChem, Descriptors
 13 | from rdkit.Chem.AtomPairs import Pairs
 14 | 
 15 | def RobustSmilesMolSupplier(filename):
 16 |     with open(filename) as f:
 17 |         for line in f:
 18 |             words = line.split()
 19 |             smile = words[0]
 20 |             name = " ".join(words[1:]) # everything after the SMILES string
 21 |             yield (name, Chem.MolFromSmiles(smile))
 22 | 
 23 | def SdfMolSupplier(fn):
 24 |     for mol in Chem.SDMolSupplier(fn):
 25 |         if mol:
 26 |             name = mol.GetProp('_Name')
 27 |             yield (name, mol)
 28 | 
 29 | def nb_heavy_atom_neighbors(a):
 30 |     res = 0
 31 |     for neighb in a.GetNeighbors():
 32 |         if neighb.GetAtomicNum() != 1:
 33 |             res += 1
 34 |     return res
 35 | 
# periodic table handle obtained once at import time; type_atom() uses it
# to turn an atomic number into an element symbol
PeriodicTable = Chem.GetPeriodicTable()
 37 | 
 38 | def string_of_charge(charge):
 39 |     if charge == 0: return ""
 40 |     elif charge == -1: return "-"
 41 |     elif charge == 1: return "+"
 42 |     else: return ("%+d" % charge)
 43 | 
 44 | def type_atom(a):
 45 |     res = None
 46 |     nb_pi_electrons = Pairs.Utils.NumPiElectrons(a)
 47 |     symbol = PeriodicTable.GetElementSymbol(a.GetAtomicNum())
 48 |     nbHA = nb_heavy_atom_neighbors(a)
 49 |     formal_charge = string_of_charge(a.GetFormalCharge())
 50 |     if nb_pi_electrons > 0:
 51 |         res = "%d%s%d%s" % (nb_pi_electrons, symbol, nbHA, formal_charge)
 52 |     else:
 53 |         res = "%s%d%s" % (symbol, nbHA, formal_charge)
 54 |     return res
 55 | 
 56 | def encode_molecule(m):
 57 |     return map(type_atom, m.GetAtoms())
 58 | 
 59 | def print_encoded_atoms(out, atoms):
 60 |     for i, a in enumerate(atoms):
 61 |         print("%d %s" % (i, a), file=out)
 62 | 
 63 | if __name__ == '__main__':
 64 |     before = time.time()
 65 |     # CLI options parsing
 66 |     parser = argparse.ArgumentParser(
 67 |         description = "compute atom types and distances")
 68 |     parser.add_argument("-i", metavar = "input.{smi|sdf}", dest = "input_fn",
 69 |                         help = "molecules input file")
 70 |     parser.add_argument("-o", metavar = "output.txt", dest = "output_fn",
 71 |                         help = "output file")
 72 |     parser.add_argument('--3D', dest='three_dimensions', action='store_true',
 73 |                         help = "consider molecules in 3D (requires SDF)")
 74 |     parser.set_defaults(three_dimensions=False)
 75 |     # parse CLI
 76 |     if len(sys.argv) == 1:
 77 |         # show help in case user has no clue of what to do
 78 |         parser.print_help(sys.stderr)
 79 |         sys.exit(1)
 80 |     args = parser.parse_args()
 81 |     input_fn = args.input_fn
 82 |     output = open(args.output_fn, 'w')
 83 |     mol_supplier = None
 84 |     three_dimensions = args.three_dimensions
 85 |     if three_dimensions or input_fn.endswith(".sdf"):
 86 |         mol_supplier = SdfMolSupplier
 87 |     elif input_fn.endswith(".smi"):
 88 |         mol_supplier = RobustSmilesMolSupplier
 89 |     else:
 90 |         print("molenc_type_atoms.py: input file not .smi or .sdf and no --3D",
 91 |               file=sys.stderr)
 92 |         sys.exit(1)
 93 |     count = 0
 94 |     for name, mol in mol_supplier(input_fn):
 95 |         print("#atoms:%d %s" % (mol.GetNumAtoms(), name), file=output)
 96 |         print_encoded_atoms(output, encode_molecule(mol))
 97 |         molenc_common.print_bonds(output, mol)
 98 |         molenc_common.print_distance_matrix(output, mol, three_dimensions)
 99 |         count += 1
100 |     after = time.time()
101 |     dt = after - before
102 |     print("%d molecules at %.2f mol/s" % (count, count / dt), file=sys.stderr)
103 |     output.close()
104 | 


--------------------------------------------------------------------------------
/bin/molenc_uniq.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | #
 3 | # only print on stdout a line if its key was never seen before
 4 | 
 5 | import sys
 6 | 
 7 | input_fn = sys.argv[1]
 8 | sep = sys.argv[2]
 9 | # user-provided field number, as in awk, start at 1
10 | field = int(sys.argv[3]) - 1
11 | 
12 | seen = {}
13 | 
14 | for line in open(input_fn).readlines():
15 |     strip = line.strip()
16 |     toks = strip.split(sep)
17 |     key = toks[field]
18 |     already_seen = seen.get(key, False)
19 |     if not already_seen:
20 |         print(line)
21 |         seen[key] = True
22 | 


--------------------------------------------------------------------------------
/bin/rgb_scale.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # pip3 install colour # to get the required library
 4 | 
 5 | from colour import Color
 6 | 
 7 | red = Color("red")
 8 | colors = list(red.range_to(Color("white"), 101))
 9 | for c in colors:
10 |     (r, g, b) = c.get_rgb()
11 |     print("%.2f %.2f %.2f" % (r, g, b))
12 | 


--------------------------------------------------------------------------------
/bin/smi2png.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # One 2D picture PNG for each SMILES line
 4 | # molecule images are created in a created pix/ directory
 5 | # and named after their corresponding molecule
 6 | 
 7 | import argparse
 8 | import rdkit
 9 | import os
10 | import sys
11 | from rdkit import Chem
12 | from rdkit.Chem import AllChem
13 | from rdkit.Chem.Draw import rdMolDraw2D
14 | 
def RobustMolSupplier(filename):
    """Yield (index, name, mol) for each non-blank line of a SMILES file.

    index counts the molecule lines from 0; the SMILES and the name are the
    first two blank-separated fields. Blank lines are skipped and a molecule
    without name is named after its index, instead of raising IndexError.
    mol is whatever Chem.MolFromSmiles returns (the caller tests for None).
    """
    with open(filename) as f:
        i = 0
        for line in f:
            words = line.split()
            if not words:
                continue
            index = i
            i += 1
            smi = words[0]
            name = words[1] if len(words) > 1 else str(index)
            yield (index, name, Chem.MolFromSmiles(smi))
25 | 
# Script entry point: draw each parsable molecule of the input file into
# pix/<name>.png; the pix directory is created if it does not exist.
if __name__ == '__main__':
    # parse CLI
    # show help in case user has no clue of what to do
    if len(sys.argv) != 2:
        sys.stderr.write("usage: %s input.smi\n" % sys.argv[0])
        sys.exit(1)
    input_smi = sys.argv[1]
    if not (os.path.isdir('pix')):
        os.mkdir('pix')
    for i, name, mol in RobustMolSupplier(input_smi):
        if mol is not None:
            AllChem.Compute2DCoords(mol) # generate 2D conformer
            drawer = rdMolDraw2D.MolDraw2DCairo(300, 300) # PNG output
            # drawer.drawOptions().addAtomIndices = True
            drawer.DrawMolecule(mol, legend = '%d %s' % (i, name))
            drawer.FinishDrawing()
            out_fn = 'pix/%s.png' % name
            print("creating %s" % out_fn)
            with open(out_fn, 'wb') as png_file:
                png_file.write(drawer.GetDrawingText())
48 | 


--------------------------------------------------------------------------------
/bin/smi2svg.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # One 2D picture SVG for each SMILES line
 4 | # molecule images are named after the index of the molecule in the input file
 5 | # they are created in the current directory
 6 | 
 7 | import argparse
 8 | import rdkit
 9 | import sys
10 | from rdkit import Chem
11 | from rdkit.Chem import AllChem
12 | from rdkit.Chem.Draw import rdMolDraw2D
13 | 
def RobustMolSupplier(filename):
    """Yield (index, name, mol) for each non-blank line of a SMILES file.

    index counts the molecule lines from 0; the SMILES and the name are the
    first two blank-separated fields. Blank lines are skipped and a molecule
    without name is named after its index, instead of raising IndexError.
    mol is whatever Chem.MolFromSmiles returns (the caller tests for None).
    """
    with open(filename) as f:
        i = 0
        for line in f:
            words = line.split()
            if not words:
                continue
            index = i
            i += 1
            smi = words[0]
            name = words[1] if len(words) > 1 else str(index)
            yield (index, name, Chem.MolFromSmiles(smi))
24 | 
# Script entry point: draw each parsable molecule of the input file into
# <index>.svg, in the current directory.
if __name__ == '__main__':
    # parse CLI
    # show help in case user has no clue of what to do
    if len(sys.argv) != 2:
        sys.stderr.write("usage: %s input.smi\n" % sys.argv[0])
        sys.exit(1)
    input_smi = sys.argv[1]
    for i, name, mol in RobustMolSupplier(input_smi):
        if mol is not None:
            AllChem.Compute2DCoords(mol) # generate 2D conformer
            drawer = rdMolDraw2D.MolDraw2DSVG(200, 200)
            # drawer.drawOptions().addAtomIndices = True
            drawer.DrawMolecule(mol, legend = '%d %s' % (i, name))
            drawer.FinishDrawing()
            out_fn = '%d.svg' % i
            print('creating %s' % out_fn)
            with open(out_fn, 'w') as svg_file:
                svg_file.write(drawer.GetDrawingText())
45 | 


--------------------------------------------------------------------------------
/data/3.frags:
--------------------------------------------------------------------------------
  1 | #atoms:15 NCGC00261552-01_f00
  2 | 0 0,6,2,0
  3 | 16 0,7,2,0
  4 | 17 0,6,3,0
  5 | 18 1,6,3,0
  6 | 19 1,6,2,0
  7 | 20 1,6,2,0
  8 | 21 1,6,2,0
  9 | 22 1,6,2,0
 10 | 23 1,6,2,0
 11 | 24 1,6,3,0
 12 | 25 1,6,2,0
 13 | 26 1,6,2,0
 14 | 27 1,6,2,0
 15 | 28 1,6,2,0
 16 | 29 1,6,2,0
 17 | #bonds:16
 18 | 0 - 16
 19 | 16 - 17
 20 | 17 - 18
 21 | 18 : 19
 22 | 19 : 20
 23 | 20 : 21
 24 | 21 : 22
 25 | 22 : 23
 26 | 17 - 24
 27 | 24 : 25
 28 | 25 : 26
 29 | 26 : 27
 30 | 27 : 28
 31 | 28 : 29
 32 | 23 : 18
 33 | 29 : 24
 34 | #anchors:1
 35 | 0,6,2,0 0 0,6,2,0
 36 | #atoms:1 NCGC00261552-01_f01
 37 | 1 0,6,2,0
 38 | #bonds:0
 39 | #anchors:2
 40 | 0,6,2,0 1 0,7,2,0
 41 | 0,6,2,0 1 0,6,2,0
 42 | #atoms:6 NCGC00261552-01_f02
 43 | 10 1,6,3,0
 44 | 11 1,6,2,0
 45 | 12 1,6,2,0
 46 | 13 1,6,2,0
 47 | 14 1,6,2,0
 48 | 15 1,6,2,0
 49 | #bonds:6
 50 | 10 : 11
 51 | 11 : 12
 52 | 12 : 13
 53 | 13 : 14
 54 | 14 : 15
 55 | 15 : 10
 56 | #anchors:1
 57 | 1,6,3,0 10 0,6,3,0
 58 | #atoms:8 NCGC00261552-01_f03
 59 | 2 0,7,2,0
 60 | 3 0,6,3,0
 61 | 4 1,6,3,0
 62 | 5 1,6,2,0
 63 | 6 1,6,2,0
 64 | 7 1,6,2,0
 65 | 8 1,6,2,0
 66 | 9 1,6,2,0
 67 | #bonds:8
 68 | 2 - 3
 69 | 3 - 4
 70 | 4 : 5
 71 | 5 : 6
 72 | 6 : 7
 73 | 7 : 8
 74 | 8 : 9
 75 | 9 : 4
 76 | #anchors:2
 77 | 0,6,3,0 3 1,6,3,0
 78 | 0,7,2,0 2 0,6,2,0
 79 | #atoms:6 NCGC00261763-01_f00
 80 | 13 1,6,3,0
 81 | 14 1,6,2,0
 82 | 15 1,6,2,0
 83 | 16 1,6,2,0
 84 | 17 1,6,2,0
 85 | 18 1,6,2,0
 86 | #bonds:6
 87 | 13 : 14
 88 | 14 : 15
 89 | 15 : 16
 90 | 16 : 17
 91 | 17 : 18
 92 | 18 : 13
 93 | #anchors:1
 94 | 1,6,3,0 13 1,7,3,0
 95 | #atoms:6 NCGC00261763-01_f01
 96 | 7 1,6,3,0
 97 | 8 1,6,2,0
 98 | 9 1,6,2,0
 99 | 10 1,6,2,0
100 | 11 1,6,2,0
101 | 12 1,6,2,0
102 | #bonds:6
103 | 7 : 8
104 | 8 : 9
105 | 9 : 10
106 | 10 : 11
107 | 11 : 12
108 | 12 : 7
109 | #anchors:1
110 | 1,6,3,0 7 1,6,3,0
111 | #atoms:7 NCGC00261763-01_f02
112 | 0 0,6,1,0
113 | 1 1,7,2,0
114 | 2 1,6,3,0
115 | 3 1,16,2,0
116 | 4 1,7,3,0
117 | 5 1,6,3,0
118 | 6 1,7,2,0
119 | #bonds:7
120 | 0 - 1
121 | 1 = 2
122 | 2 : 3
123 | 3 : 4
124 | 4 : 5
125 | 5 : 6
126 | 6 : 2
127 | #anchors:2
128 | 1,6,3,0 5 1,6,3,0
129 | 1,7,3,0 4 1,6,3,0
130 | #atoms:1 NCGC00260832-01_f00
131 | 0 0,6,1,0
132 | #bonds:0
133 | #anchors:1
134 | 0,6,1,0 0 0,7,3,0
135 | #atoms:7 NCGC00260832-01_f01
136 | 10 0,7,2,0
137 | 11 1,6,3,0
138 | 12 1,6,2,0
139 | 13 1,6,2,0
140 | 14 1,6,2,0
141 | 15 1,7,2,0
142 | 16 1,6,2,0
143 | #bonds:7
144 | 10 - 11
145 | 11 : 12
146 | 12 : 13
147 | 13 : 14
148 | 14 : 15
149 | 15 : 16
150 | 16 : 11
151 | #anchors:1
152 | 0,7,2,0 10 1,6,3,0
153 | #atoms:12 NCGC00260832-01_f02
154 | 1 0,7,3,0
155 | 2 0,6,2,0
156 | 3 0,6,2,0
157 | 4 1,6,3,0
158 | 5 1,6,2,0
159 | 6 1,6,3,0
160 | 7 0,7,2,0
161 | 8 1,6,3,0
162 | 9 1,8,1,0
163 | 17 1,6,2,0
164 | 18 1,6,2,0
165 | 19 1,6,3,0
166 | #bonds:13
167 | 1 - 2
168 | 2 - 3
169 | 3 - 4
170 | 4 : 5
171 | 5 : 6
172 | 6 - 7
173 | 7 - 8
174 | 8 = 9
175 | 6 : 17
176 | 17 : 18
177 | 18 : 19
178 | 19 - 1
179 | 19 : 4
180 | #anchors:2
181 | 1,6,3,0 8 0,7,2,0
182 | 0,7,3,0 1 0,6,1,0
183 | 


--------------------------------------------------------------------------------
/data/3.smi:
--------------------------------------------------------------------------------
1 | C(CNC(C1=CC=CC=C1)C2=CC=CC=C2)NC(C3=CC=CC=C3)C4=CC=CC=C4	NCGC00261552-01
2 | C\N=C1/SN(C(=N1)C2=CC=CC=C2)C3=CC=CC=C3	NCGC00261763-01
3 | CN1CCC2=CC(NC(=O)NC3=CC=CN=C3)=CC=C12	NCGC00260832-01
4 | 


--------------------------------------------------------------------------------
/data/3.to_frag:
--------------------------------------------------------------------------------
  1 | #atoms:30 NCGC00261552-01
  2 | 0 0,6,2,0,0
  3 | 1 0,6,2,0,0
  4 | 2 0,7,2,0,0
  5 | 3 0,6,3,0,0
  6 | 4 1,6,3,0,0
  7 | 5 1,6,2,0,0
  8 | 6 1,6,2,0,0
  9 | 7 1,6,2,0,0
 10 | 8 1,6,2,0,0
 11 | 9 1,6,2,0,0
 12 | 10 1,6,3,0,0
 13 | 11 1,6,2,0,0
 14 | 12 1,6,2,0,0
 15 | 13 1,6,2,0,0
 16 | 14 1,6,2,0,0
 17 | 15 1,6,2,0,0
 18 | 16 0,7,2,0,0
 19 | 17 0,6,3,0,0
 20 | 18 1,6,3,0,0
 21 | 19 1,6,2,0,0
 22 | 20 1,6,2,0,0
 23 | 21 1,6,2,0,0
 24 | 22 1,6,2,0,0
 25 | 23 1,6,2,0,0
 26 | 24 1,6,3,0,0
 27 | 25 1,6,2,0,0
 28 | 26 1,6,2,0,0
 29 | 27 1,6,2,0,0
 30 | 28 1,6,2,0,0
 31 | 29 1,6,2,0,0
 32 | #bonds:33
 33 | 0 - 1 N
 34 | 1 - 2 N
 35 | 2 - 3 N
 36 | 3 - 4 N
 37 | 4 : 5 N
 38 | 5 : 6 N
 39 | 6 : 7 N
 40 | 7 : 8 N
 41 | 8 : 9 N
 42 | 3 - 10 N
 43 | 10 : 11 N
 44 | 11 : 12 N
 45 | 12 : 13 N
 46 | 13 : 14 N
 47 | 14 : 15 N
 48 | 0 - 16 N
 49 | 16 - 17 N
 50 | 17 - 18 N
 51 | 18 : 19 N
 52 | 19 : 20 N
 53 | 20 : 21 N
 54 | 21 : 22 N
 55 | 22 : 23 N
 56 | 17 - 24 N
 57 | 24 : 25 N
 58 | 25 : 26 N
 59 | 26 : 27 N
 60 | 27 : 28 N
 61 | 28 : 29 N
 62 | 9 : 4 N
 63 | 15 : 10 N
 64 | 23 : 18 N
 65 | 29 : 24 N
 66 | #cut_bonds:9:2
 67 | 0
 68 | 1
 69 | 2
 70 | 3
 71 | 9
 72 | 15
 73 | 16
 74 | 17
 75 | 23
 76 | #atoms:19 NCGC00261763-01
 77 | 0 0,6,1,0,0
 78 | 1 1,7,2,0,0
 79 | 2 1,6,3,0,0
 80 | 3 1,16,2,0,0
 81 | 4 1,7,3,0,0
 82 | 5 1,6,3,0,0
 83 | 6 1,7,2,0,0
 84 | 7 1,6,3,0,0
 85 | 8 1,6,2,0,0
 86 | 9 1,6,2,0,0
 87 | 10 1,6,2,0,0
 88 | 11 1,6,2,0,0
 89 | 12 1,6,2,0,0
 90 | 13 1,6,3,0,0
 91 | 14 1,6,2,0,0
 92 | 15 1,6,2,0,0
 93 | 16 1,6,2,0,0
 94 | 17 1,6,2,0,0
 95 | 18 1,6,2,0,0
 96 | #bonds:21
 97 | 0 - 1 N
 98 | 1 = 2 Z:0:3
 99 | 2 : 3 N
100 | 3 : 4 N
101 | 4 : 5 N
102 | 5 : 6 N
103 | 5 - 7 N
104 | 7 : 8 N
105 | 8 : 9 N
106 | 9 : 10 N
107 | 10 : 11 N
108 | 11 : 12 N
109 | 4 - 13 N
110 | 13 : 14 N
111 | 14 : 15 N
112 | 15 : 16 N
113 | 16 : 17 N
114 | 17 : 18 N
115 | 6 : 2 N
116 | 12 : 7 N
117 | 18 : 13 N
118 | #cut_bonds:2:1
119 | 6
120 | 12
121 | #atoms:20 NCGC00260832-01
122 | 0 0,6,1,0,0
123 | 1 0,7,3,0,0
124 | 2 0,6,2,0,0
125 | 3 0,6,2,0,0
126 | 4 1,6,3,0,0
127 | 5 1,6,2,0,0
128 | 6 1,6,3,0,0
129 | 7 0,7,2,0,0
130 | 8 1,6,3,0,0
131 | 9 1,8,1,0,0
132 | 10 0,7,2,0,0
133 | 11 1,6,3,0,0
134 | 12 1,6,2,0,0
135 | 13 1,6,2,0,0
136 | 14 1,6,2,0,0
137 | 15 1,7,2,0,0
138 | 16 1,6,2,0,0
139 | 17 1,6,2,0,0
140 | 18 1,6,2,0,0
141 | 19 1,6,3,0,0
142 | #bonds:22
143 | 0 - 1 N
144 | 1 - 2 N
145 | 2 - 3 N
146 | 3 - 4 N
147 | 4 : 5 N
148 | 5 : 6 N
149 | 6 - 7 N
150 | 7 - 8 N
151 | 8 = 9 N
152 | 8 - 10 N
153 | 10 - 11 N
154 | 11 : 12 N
155 | 12 : 13 N
156 | 13 : 14 N
157 | 14 : 15 N
158 | 15 : 16 N
159 | 6 : 17 N
160 | 17 : 18 N
161 | 18 : 19 N
162 | 19 - 1 N
163 | 19 : 4 N
164 | 16 : 11 N
165 | #cut_bonds:5:1
166 | 0
167 | 6
168 | 7
169 | 9
170 | 10
171 | 


--------------------------------------------------------------------------------
/data/3_frags.smi:
--------------------------------------------------------------------------------
1 | *CCNC(c1ccccc1)c1ccccc1	NCGC00261552-01_f00
2 | *c1ccccc1	NCGC00261552-01_f01
3 | *NC(*)c1ccccc1	NCGC00261552-01_f02
4 | *c1ccccc1	NCGC00261763-01_f00
5 | *c1n/c(=N/C)sn1-c1ccccc1	NCGC00261763-01_f01
6 | *c1cccnc1	NCGC00260832-01_f00
7 | *NC(=O)Nc1ccc2c(c1)CCN2C	NCGC00260832-01_f01
8 | 


--------------------------------------------------------------------------------
/data/3_frags.txt:
--------------------------------------------------------------------------------
  1 | #atoms:16 NCGC00261552-01_f00
  2 | 0 0,6,2,0,0
  3 | 1 0,6,2,0,0
  4 | 16 0,7,2,0,0
  5 | 17 0,6,3,0,0
  6 | 18 1,6,3,0,0
  7 | 19 1,6,2,0,0
  8 | 20 1,6,2,0,0
  9 | 21 1,6,2,0,0
 10 | 22 1,6,2,0,0
 11 | 23 1,6,2,0,0
 12 | 24 1,6,3,0,0
 13 | 25 1,6,2,0,0
 14 | 26 1,6,2,0,0
 15 | 27 1,6,2,0,0
 16 | 28 1,6,2,0,0
 17 | 29 1,6,2,0,0
 18 | #bonds:17
 19 | 0 - 1 N
 20 | 0 - 16 N
 21 | 16 - 17 N
 22 | 17 - 18 N
 23 | 18 : 19 N
 24 | 19 : 20 N
 25 | 20 : 21 N
 26 | 21 : 22 N
 27 | 22 : 23 N
 28 | 17 - 24 N
 29 | 24 : 25 N
 30 | 25 : 26 N
 31 | 26 : 27 N
 32 | 27 : 28 N
 33 | 28 : 29 N
 34 | 23 : 18 N
 35 | 29 : 24 N
 36 | #anchors:1
 37 | 0,6,2,0,0 1 0,7,2,0,0
 38 | #atoms:6 NCGC00261552-01_f01
 39 | 10 1,6,3,0,0
 40 | 11 1,6,2,0,0
 41 | 12 1,6,2,0,0
 42 | 13 1,6,2,0,0
 43 | 14 1,6,2,0,0
 44 | 15 1,6,2,0,0
 45 | #bonds:6
 46 | 10 : 11 N
 47 | 11 : 12 N
 48 | 12 : 13 N
 49 | 13 : 14 N
 50 | 14 : 15 N
 51 | 15 : 10 N
 52 | #anchors:1
 53 | 1,6,3,0,0 10 0,6,3,0,0
 54 | #atoms:8 NCGC00261552-01_f02
 55 | 2 0,7,2,0,0
 56 | 3 0,6,3,0,0
 57 | 4 1,6,3,0,0
 58 | 5 1,6,2,0,0
 59 | 6 1,6,2,0,0
 60 | 7 1,6,2,0,0
 61 | 8 1,6,2,0,0
 62 | 9 1,6,2,0,0
 63 | #bonds:8
 64 | 2 - 3 N
 65 | 3 - 4 N
 66 | 4 : 5 N
 67 | 5 : 6 N
 68 | 6 : 7 N
 69 | 7 : 8 N
 70 | 8 : 9 N
 71 | 9 : 4 N
 72 | #anchors:2
 73 | 0,6,3,0,0 3 1,6,3,0,0
 74 | 0,7,2,0,0 2 0,6,2,0,0
 75 | #atoms:6 NCGC00261763-01_f00
 76 | 7 1,6,3,0,0
 77 | 8 1,6,2,0,0
 78 | 9 1,6,2,0,0
 79 | 10 1,6,2,0,0
 80 | 11 1,6,2,0,0
 81 | 12 1,6,2,0,0
 82 | #bonds:6
 83 | 7 : 8 N
 84 | 8 : 9 N
 85 | 9 : 10 N
 86 | 10 : 11 N
 87 | 11 : 12 N
 88 | 12 : 7 N
 89 | #anchors:1
 90 | 1,6,3,0,0 7 1,6,3,0,0
 91 | #atoms:13 NCGC00261763-01_f01
 92 | 0 0,6,1,0,0
 93 | 1 1,7,2,0,0
 94 | 2 1,6,3,0,0
 95 | 3 1,16,2,0,0
 96 | 4 1,7,3,0,0
 97 | 5 1,6,3,0,0
 98 | 6 1,7,2,0,0
 99 | 13 1,6,3,0,0
100 | 14 1,6,2,0,0
101 | 15 1,6,2,0,0
102 | 16 1,6,2,0,0
103 | 17 1,6,2,0,0
104 | 18 1,6,2,0,0
105 | #bonds:14
106 | 0 - 1 N
107 | 1 = 2 Z:0:3
108 | 2 : 3 N
109 | 3 : 4 N
110 | 4 : 5 N
111 | 5 : 6 N
112 | 4 - 13 N
113 | 13 : 14 N
114 | 14 : 15 N
115 | 15 : 16 N
116 | 16 : 17 N
117 | 17 : 18 N
118 | 6 : 2 N
119 | 18 : 13 N
120 | #anchors:1
121 | 1,6,3,0,0 5 1,6,3,0,0
122 | #atoms:6 NCGC00260832-01_f00
123 | 11 1,6,3,0,0
124 | 12 1,6,2,0,0
125 | 13 1,6,2,0,0
126 | 14 1,6,2,0,0
127 | 15 1,7,2,0,0
128 | 16 1,6,2,0,0
129 | #bonds:6
130 | 11 : 12 N
131 | 12 : 13 N
132 | 13 : 14 N
133 | 14 : 15 N
134 | 15 : 16 N
135 | 16 : 11 N
136 | #anchors:1
137 | 1,6,3,0,0 11 0,7,2,0,0
138 | #atoms:14 NCGC00260832-01_f01
139 | 0 0,6,1,0,0
140 | 1 0,7,3,0,0
141 | 2 0,6,2,0,0
142 | 3 0,6,2,0,0
143 | 4 1,6,3,0,0
144 | 5 1,6,2,0,0
145 | 6 1,6,3,0,0
146 | 7 0,7,2,0,0
147 | 8 1,6,3,0,0
148 | 9 1,8,1,0,0
149 | 10 0,7,2,0,0
150 | 17 1,6,2,0,0
151 | 18 1,6,2,0,0
152 | 19 1,6,3,0,0
153 | #bonds:15
154 | 0 - 1 N
155 | 1 - 2 N
156 | 2 - 3 N
157 | 3 - 4 N
158 | 4 : 5 N
159 | 5 : 6 N
160 | 6 - 7 N
161 | 7 - 8 N
162 | 8 = 9 N
163 | 8 - 10 N
164 | 6 : 17 N
165 | 17 : 18 N
166 | 18 : 19 N
167 | 19 - 1 N
168 | 19 : 4 N
169 | #anchors:1
170 | 0,7,2,0,0 10 1,6,3,0,0
171 | 


--------------------------------------------------------------------------------
/data/3_genmols.smi:
--------------------------------------------------------------------------------
 1 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1	genmol_000001_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02
 2 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1	genmol_000002_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02
 3 | CN1CCc2cc(NC(=O)Nc3cccnc3)ccc21	genmol_000003_NCGC00260832-01_f00,NCGC00260832-01_f01
 4 | C/N=c1/nc(-c2n/c(=N/C)sn2-c2ccccc2)n(-c2ccccc2)s1	genmol_000004_NCGC00261763-01_f01,NCGC00261763-01_f01
 5 | C/N=c1/nc(-c2ccccc2)n(-c2ccccc2)s1	genmol_000005_NCGC00261763-01_f00,NCGC00261763-01_f01
 6 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1	genmol_000006_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02
 7 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1	genmol_000007_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02
 8 | C/N=c1/nc(-c2n/c(=N/C)sn2-c2ccccc2)n(-c2ccccc2)s1	genmol_000008_NCGC00261763-01_f01,NCGC00261763-01_f01
 9 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1	genmol_000009_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02
10 | CN1CCc2cc(NC(=O)Nc3cccnc3)ccc21	genmol_000010_NCGC00260832-01_f00,NCGC00260832-01_f01
11 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1	genmol_000011_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02
12 | CN1CCc2cc(NC(=O)Nc3cccnc3)ccc21	genmol_000012_NCGC00260832-01_f00,NCGC00260832-01_f01
13 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1	genmol_000013_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02
14 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1	genmol_000014_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02
15 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1	genmol_000015_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02
16 | C/N=c1/nc(-c2ccccc2)n(-c2ccccc2)s1	genmol_000016_NCGC00261763-01_f00,NCGC00261763-01_f01
17 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1	genmol_000017_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02
18 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1	genmol_000018_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02
19 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1	genmol_000019_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02
20 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1	genmol_000020_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02
21 | 


--------------------------------------------------------------------------------
/data/3_genmols_uniq.smi:
--------------------------------------------------------------------------------
1 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1
2 | CN1CCc2cc(NC(=O)Nc3cccnc3)ccc21
3 | C/N=c1/nc(-c2ccccc2)n(-c2ccccc2)s1
4 | C/N=c1/nc(-c2n/c(=N/C)sn2-c2ccccc2)n(-c2ccccc2)s1
5 | 


--------------------------------------------------------------------------------
/data/ALDH1_2conf.ph4:
--------------------------------------------------------------------------------
 1 | 13:22401519_1_1_1_1
 2 | ARO 1.47088 -0.706617 1.86095
 3 | ARO -3.88528 -1.98218 5.7585
 4 | ARO 1.76862 -1.26624 3.89744
 5 | HBD 2.0174 -1.9041 6.2761
 6 | HBA 2.9153 -1.4999 4.0836
 7 | HBA -0.4533 -1.1965 5.0622
 8 | HBA -7.202 -2.5145 6.7169
 9 | HBA -6.6826 -3.8445 5.0495
10 | POS -6.3641 -2.9407 5.8728
11 | NEG -7.202 -2.5145 6.7169
12 | HYD 1.47088 -0.706617 1.86095
13 | HYD -3.88528 -1.98218 5.7585
14 | HYD 1.76862 -1.26624 3.89744
15 | 13:22401519_1_1_1_2
16 | ARO 1.47088 -0.706617 1.86095
17 | ARO -3.47526 0.19586 6.40746
18 | ARO 1.76862 -1.26624 3.89744
19 | HBD 2.0174 -1.9041 6.2761
20 | HBA 2.9153 -1.4999 4.0836
21 | HBA -0.4533 -1.1965 5.0622
22 | HBA -6.4947 1.2154 7.8379
23 | HBA -5.574 2.8916 6.76
24 | POS -5.5543 1.6862 7.1378
25 | NEG -6.4947 1.2154 7.8379
26 | HYD 1.47088 -0.706617 1.86095
27 | HYD -3.47526 0.19586 6.40746
28 | HYD 1.76862 -1.26624 3.89744
29 | 14:4256362_1_1_1_1
30 | ARO 6.11932 -0.416467 -0.548583
31 | ARO 2.15405 0.669883 -1.00792
32 | ARO -3.07 3.7901 -3.20373
33 | ARO 4.28834 0.50898 -1.18316
34 | HBD -0.2095 1.8844 -1.8496
35 | HBA -3.4513 4.3099 -4.4494
36 | HBA 0.5397 3.3853 -3.4661
37 | HBA -0.2324 -0.0904 0.1262
38 | HYD 6.11932 -0.416467 -0.548583
39 | HYD 2.15405 0.669883 -1.00792
40 | HYD -3.07 3.7901 -3.20373
41 | HYD 4.28834 0.50898 -1.18316
42 | HYD -0.6502 0.6217 1.2873
43 | HYD -5.2726 3.6343 -0.7889
44 | 14:4256362_1_1_1_2
45 | ARO 6.11932 -0.416467 -0.548583
46 | ARO 2.15405 0.669883 -1.00792
47 | ARO -3.08318 3.77257 -3.14745
48 | ARO 4.28834 0.50898 -1.18316
49 | HBD -0.2095 1.8844 -1.8496
50 | HBA -3.9734 3.675 -2.068
51 | HBA 0.5397 3.3853 -3.4661
52 | HBA -0.2324 -0.0904 0.1262
53 | HYD 6.11932 -0.416467 -0.548583
54 | HYD 2.15405 0.669883 -1.00792
55 | HYD -3.08318 3.77257 -3.14745
56 | HYD 4.28834 0.50898 -1.18316
57 | HYD -0.6502 0.6217 1.2873
58 | HYD -4.1364 5.0094 -5.9878
59 | 


--------------------------------------------------------------------------------
/data/AP_test.smi:
--------------------------------------------------------------------------------
1 | On1cccc1	mol_1
2 | 


--------------------------------------------------------------------------------
/data/AP_test.smi.dix.ref:
--------------------------------------------------------------------------------
 1 | #atom_pairs
 2 | 0 1C2-0-1C2 4
 3 | 1 1C2-2-1C2 3
 4 | 2 1C2-1-1C2 3
 5 | 3 1C2-3-O1 2
 6 | 4 1C2-2-O1 2
 7 | 5 1C2-2-1N3 2
 8 | 6 1C2-1-1N3 2
 9 | 7 O1-0-O1 1
10 | 8 1N3-1-O1 1
11 | 9 1N3-0-1N3 1
12 | 


--------------------------------------------------------------------------------
/data/AP_test.txt.ref:
--------------------------------------------------------------------------------
1 | mol_1,0.0,[0:4;1:3;2:3;3:2;4:2;5:2;6:2;7:1;8:1;9:1]
2 | 


--------------------------------------------------------------------------------
/data/alcools.AD.ref:
--------------------------------------------------------------------------------
1 | (6,1,3,1,0)-0-(6,1,3,1,0) 1
2 | (6,1,3,1,0)-1-(6,2,2,2,0) 1
3 | (6,1,3,1,0)-1-(8,1,1,1,0) 1
4 | (6,1,3,1,0)-2-(8,1,1,1,0) 1
5 | (6,2,2,2,0)-0-(6,2,2,2,0) 1
6 | (6,2,2,2,0)-1-(8,1,1,1,0) 1
7 | (8,1,1,1,0)-0-(8,1,1,1,0) 1
8 | 


--------------------------------------------------------------------------------
/data/alcools.smi:
--------------------------------------------------------------------------------
1 | CO	methanol
2 | CCO	ethanol
3 | 


--------------------------------------------------------------------------------
/data/caff_coca.sdf:
--------------------------------------------------------------------------------
 1 | caffeine
 2 |  OpenBabel10151815402D
 3 | 
 4 |  14 15  0  0  0  0  0  0  0  0999 V2000
 5 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
 6 |     0.0000    0.0000    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
 7 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
 8 |     0.0000    0.0000    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
 9 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
10 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
11 |     0.0000    0.0000    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
12 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
13 |     0.0000    0.0000    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
14 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
15 |     0.0000    0.0000    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
16 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
17 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
18 |     0.0000    0.0000    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
19 |   1  2  1  0  0  0  0
20 |   2 13  1  0  0  0  0
21 |   2  3  1  0  0  0  0
22 |   3  4  2  0  0  0  0
23 |   3  5  1  0  0  0  0
24 |   5  9  1  0  0  0  0
25 |   5  6  2  0  0  0  0
26 |   6  7  1  0  0  0  0
27 |   6 11  1  0  0  0  0
28 |   7  8  2  0  0  0  0
29 |   8  9  1  0  0  0  0
30 |   9 10  1  0  0  0  0
31 |  11 12  1  0  0  0  0
32 |  11 13  1  0  0  0  0
33 |  13 14  2  0  0  0  0
34 | M  END
35 | $$$$
36 | cocaine
37 |  OpenBabel10151815402D
38 | 
39 |  22 24  0  0  1  0  0  0  0  0999 V2000
40 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
41 |     0.0000    0.0000    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
42 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
43 |     0.0000    0.0000    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
44 |     0.0000    0.0000    0.0000 C   0  0  1  0  0  0  0  0  0  0  0  0
45 |     0.0000    0.0000    0.0000 C   0  0  1  0  0  0  0  0  0  0  0  0
46 |     0.0000    0.0000    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
47 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
48 |     0.0000    0.0000    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
49 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
50 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
51 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
52 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
53 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
54 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
55 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
56 |     0.0000    0.0000    0.0000 C   0  0  2  0  0  0  0  0  0  0  0  0
57 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
58 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
59 |     0.0000    0.0000    0.0000 C   0  0  1  0  0  0  0  0  0  0  0  0
60 |     0.0000    0.0000    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
61 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
62 |   1  2  1  0  0  0  0
63 |   2  3  1  0  0  0  0
64 |   3  4  2  0  0  0  0
65 |   3  5  1  0  0  0  0
66 |   5  6  1  0  0  0  0
67 |   5 20  1  0  0  0  0
68 |   6  7  1  0  0  0  0
69 |   6 16  1  0  0  0  0
70 |   7  8  1  0  0  0  0
71 |   8  9  2  0  0  0  0
72 |   8 10  1  0  0  0  0
73 |  10 15  2  0  0  0  0
74 |  10 11  1  0  0  0  0
75 |  11 12  2  0  0  0  0
76 |  12 13  1  0  0  0  0
77 |  13 14  2  0  0  0  0
78 |  14 15  1  0  0  0  0
79 |  16 17  1  0  0  0  0
80 |  17 18  1  0  0  0  0
81 |  17 21  1  0  0  0  0
82 |  18 19  1  0  0  0  0
83 |  19 20  1  0  0  0  0
84 |  20 21  1  0  0  0  0
85 |  21 22  1  0  0  0  0
86 | M  END
87 | $$$$
88 | 


--------------------------------------------------------------------------------
/data/caff_coca.smi:
--------------------------------------------------------------------------------
1 | Cn1c(=O)c2c(ncn2C)n(C)c1=O	caffeine
2 | COC(=O)[C@H]1[C@@H](OC(=O)c2ccccc2)C[C@@H]2CC[C@H]1N2C	cocaine
3 | 


--------------------------------------------------------------------------------
/data/caff_coca_feats.ref:
--------------------------------------------------------------------------------
  1 | #atoms:14 caffeine
  2 | 0 _
  3 | 1 a
  4 | 2 a
  5 | 3 A
  6 | 4 P a
  7 | 5 P a
  8 | 6 A P a
  9 | 7 P a
 10 | 8 P a
 11 | 9 _
 12 | 10 a
 13 | 11 _
 14 | 12 a
 15 | 13 A
 16 | #bonds:15
 17 | 0 1
 18 | 1 2
 19 | 1 12
 20 | 2 3
 21 | 2 4
 22 | 4 5
 23 | 4 8
 24 | 5 6
 25 | 5 10
 26 | 6 7
 27 | 7 8
 28 | 8 9
 29 | 10 11
 30 | 10 12
 31 | 12 13
 32 | #diameter:6
 33 | 0 1 2 3 3 4 5 5 4 5 3 4 2 3
 34 | 1 0 1 2 2 3 4 4 3 4 2 3 1 2
 35 | 2 1 0 1 1 2 3 3 2 3 3 4 2 3
 36 | 3 2 1 0 2 3 4 4 3 4 4 5 3 4
 37 | 3 2 1 2 0 1 2 2 1 2 2 3 3 4
 38 | 4 3 2 3 1 0 1 2 2 3 1 2 2 3
 39 | 5 4 3 4 2 1 0 1 2 3 2 3 3 4
 40 | 5 4 3 4 2 2 1 0 1 2 3 4 4 5
 41 | 4 3 2 3 1 2 2 1 0 1 3 4 4 5
 42 | 5 4 3 4 2 3 3 2 1 0 4 5 5 6
 43 | 3 2 3 4 2 1 2 3 3 4 0 1 1 2
 44 | 4 3 4 5 3 2 3 4 4 5 1 0 2 3
 45 | 2 1 2 3 3 2 3 4 4 5 1 2 0 1
 46 | 3 2 3 4 4 3 4 5 5 6 2 3 1 0
 47 | #atoms:22 cocaine
 48 | 0 _
 49 | 1 A
 50 | 2 _
 51 | 3 A
 52 | 4 H
 53 | 5 _
 54 | 6 A
 55 | 7 _
 56 | 8 A
 57 | 9 H a h
 58 | 10 a h
 59 | 11 a h
 60 | 12 a h
 61 | 13 a h
 62 | 14 a h
 63 | 15 _
 64 | 16 _
 65 | 17 _
 66 | 18 _
 67 | 19 _
 68 | 20 D P
 69 | 21 _
 70 | #bonds:24
 71 | 0 1
 72 | 1 2
 73 | 2 3
 74 | 2 4
 75 | 4 5
 76 | 4 19
 77 | 5 6
 78 | 5 15
 79 | 6 7
 80 | 7 8
 81 | 7 9
 82 | 9 10
 83 | 9 14
 84 | 10 11
 85 | 11 12
 86 | 12 13
 87 | 13 14
 88 | 15 16
 89 | 16 17
 90 | 16 20
 91 | 17 18
 92 | 18 19
 93 | 19 20
 94 | 20 21
 95 | #diameter:10
 96 | 0 1 2 3 3 4 5 6 7 7 8 9 10 9 8 5 6 6 5 4 5 6
 97 | 1 0 1 2 2 3 4 5 6 6 7 8 9 8 7 4 5 5 4 3 4 5
 98 | 2 1 0 1 1 2 3 4 5 5 6 7 8 7 6 3 4 4 3 2 3 4
 99 | 3 2 1 0 2 3 4 5 6 6 7 8 9 8 7 4 5 5 4 3 4 5
100 | 3 2 1 2 0 1 2 3 4 4 5 6 7 6 5 2 3 3 2 1 2 3
101 | 4 3 2 3 1 0 1 2 3 3 4 5 6 5 4 1 2 3 3 2 3 4
102 | 5 4 3 4 2 1 0 1 2 2 3 4 5 4 3 2 3 4 4 3 4 5
103 | 6 5 4 5 3 2 1 0 1 1 2 3 4 3 2 3 4 5 5 4 5 6
104 | 7 6 5 6 4 3 2 1 0 2 3 4 5 4 3 4 5 6 6 5 6 7
105 | 7 6 5 6 4 3 2 1 2 0 1 2 3 2 1 4 5 6 6 5 6 7
106 | 8 7 6 7 5 4 3 2 3 1 0 1 2 3 2 5 6 7 7 6 7 8
107 | 9 8 7 8 6 5 4 3 4 2 1 0 1 2 3 6 7 8 8 7 8 9
108 | 10 9 8 9 7 6 5 4 5 3 2 1 0 1 2 7 8 9 9 8 9 10
109 | 9 8 7 8 6 5 4 3 4 2 3 2 1 0 1 6 7 8 8 7 8 9
110 | 8 7 6 7 5 4 3 2 3 1 2 3 2 1 0 5 6 7 7 6 7 8
111 | 5 4 3 4 2 1 2 3 4 4 5 6 7 6 5 0 1 2 3 3 2 3
112 | 6 5 4 5 3 2 3 4 5 5 6 7 8 7 6 1 0 1 2 2 1 2
113 | 6 5 4 5 3 3 4 5 6 6 7 8 9 8 7 2 1 0 1 2 2 3
114 | 5 4 3 4 2 3 4 5 6 6 7 8 9 8 7 3 2 1 0 1 2 3
115 | 4 3 2 3 1 2 3 4 5 5 6 7 8 7 6 3 2 2 1 0 1 2
116 | 5 4 3 4 2 3 4 5 6 6 7 8 9 8 7 2 1 2 2 1 0 1
117 | 6 5 4 5 3 4 5 6 7 7 8 9 10 9 8 3 2 3 3 2 1 0
118 | 


--------------------------------------------------------------------------------
/data/caff_coca_types.ref:
--------------------------------------------------------------------------------
  1 | #atoms:14 caffeine
  2 | 0 C1
  3 | 1 1N3
  4 | 2 1C3
  5 | 3 1O1
  6 | 4 1C3
  7 | 5 1C3
  8 | 6 1N2
  9 | 7 1C2
 10 | 8 1N3
 11 | 9 C1
 12 | 10 1N3
 13 | 11 C1
 14 | 12 1C3
 15 | 13 1O1
 16 | #bonds:15
 17 | 0 1
 18 | 1 2
 19 | 1 12
 20 | 2 3
 21 | 2 4
 22 | 4 5
 23 | 4 8
 24 | 5 6
 25 | 5 10
 26 | 6 7
 27 | 7 8
 28 | 8 9
 29 | 10 11
 30 | 10 12
 31 | 12 13
 32 | #diameter:6
 33 | 0 1 2 3 3 4 5 5 4 5 3 4 2 3
 34 | 1 0 1 2 2 3 4 4 3 4 2 3 1 2
 35 | 2 1 0 1 1 2 3 3 2 3 3 4 2 3
 36 | 3 2 1 0 2 3 4 4 3 4 4 5 3 4
 37 | 3 2 1 2 0 1 2 2 1 2 2 3 3 4
 38 | 4 3 2 3 1 0 1 2 2 3 1 2 2 3
 39 | 5 4 3 4 2 1 0 1 2 3 2 3 3 4
 40 | 5 4 3 4 2 2 1 0 1 2 3 4 4 5
 41 | 4 3 2 3 1 2 2 1 0 1 3 4 4 5
 42 | 5 4 3 4 2 3 3 2 1 0 4 5 5 6
 43 | 3 2 3 4 2 1 2 3 3 4 0 1 1 2
 44 | 4 3 4 5 3 2 3 4 4 5 1 0 2 3
 45 | 2 1 2 3 3 2 3 4 4 5 1 2 0 1
 46 | 3 2 3 4 4 3 4 5 5 6 2 3 1 0
 47 | #atoms:22 cocaine
 48 | 0 C1
 49 | 1 O2
 50 | 2 1C3
 51 | 3 1O1
 52 | 4 C3
 53 | 5 C3
 54 | 6 O2
 55 | 7 1C3
 56 | 8 1O1
 57 | 9 1C3
 58 | 10 1C2
 59 | 11 1C2
 60 | 12 1C2
 61 | 13 1C2
 62 | 14 1C2
 63 | 15 C2
 64 | 16 C3
 65 | 17 C2
 66 | 18 C2
 67 | 19 C3
 68 | 20 N3
 69 | 21 C1
 70 | #bonds:24
 71 | 0 1
 72 | 1 2
 73 | 2 3
 74 | 2 4
 75 | 4 5
 76 | 4 19
 77 | 5 6
 78 | 5 15
 79 | 6 7
 80 | 7 8
 81 | 7 9
 82 | 9 10
 83 | 9 14
 84 | 10 11
 85 | 11 12
 86 | 12 13
 87 | 13 14
 88 | 15 16
 89 | 16 17
 90 | 16 20
 91 | 17 18
 92 | 18 19
 93 | 19 20
 94 | 20 21
 95 | #diameter:10
 96 | 0 1 2 3 3 4 5 6 7 7 8 9 10 9 8 5 6 6 5 4 5 6
 97 | 1 0 1 2 2 3 4 5 6 6 7 8 9 8 7 4 5 5 4 3 4 5
 98 | 2 1 0 1 1 2 3 4 5 5 6 7 8 7 6 3 4 4 3 2 3 4
 99 | 3 2 1 0 2 3 4 5 6 6 7 8 9 8 7 4 5 5 4 3 4 5
100 | 3 2 1 2 0 1 2 3 4 4 5 6 7 6 5 2 3 3 2 1 2 3
101 | 4 3 2 3 1 0 1 2 3 3 4 5 6 5 4 1 2 3 3 2 3 4
102 | 5 4 3 4 2 1 0 1 2 2 3 4 5 4 3 2 3 4 4 3 4 5
103 | 6 5 4 5 3 2 1 0 1 1 2 3 4 3 2 3 4 5 5 4 5 6
104 | 7 6 5 6 4 3 2 1 0 2 3 4 5 4 3 4 5 6 6 5 6 7
105 | 7 6 5 6 4 3 2 1 2 0 1 2 3 2 1 4 5 6 6 5 6 7
106 | 8 7 6 7 5 4 3 2 3 1 0 1 2 3 2 5 6 7 7 6 7 8
107 | 9 8 7 8 6 5 4 3 4 2 1 0 1 2 3 6 7 8 8 7 8 9
108 | 10 9 8 9 7 6 5 4 5 3 2 1 0 1 2 7 8 9 9 8 9 10
109 | 9 8 7 8 6 5 4 3 4 2 3 2 1 0 1 6 7 8 8 7 8 9
110 | 8 7 6 7 5 4 3 2 3 1 2 3 2 1 0 5 6 7 7 6 7 8
111 | 5 4 3 4 2 1 2 3 4 4 5 6 7 6 5 0 1 2 3 3 2 3
112 | 6 5 4 5 3 2 3 4 5 5 6 7 8 7 6 1 0 1 2 2 1 2
113 | 6 5 4 5 3 3 4 5 6 6 7 8 9 8 7 2 1 0 1 2 2 3
114 | 5 4 3 4 2 3 4 5 6 6 7 8 9 8 7 3 2 1 0 1 2 3
115 | 4 3 2 3 1 2 3 4 5 5 6 7 8 7 6 3 2 2 1 0 1 2
116 | 5 4 3 4 2 3 4 5 6 6 7 8 9 8 7 2 1 2 2 1 0 1
117 | 6 5 4 5 3 4 5 6 7 7 8 9 10 9 8 3 2 3 3 2 1 0
118 | 


--------------------------------------------------------------------------------
/data/caffeine.sdf:
--------------------------------------------------------------------------------
 1 | caffeine
 2 |  OpenBabel10151815402D
 3 | 
 4 |  14 15  0  0  0  0  0  0  0  0999 V2000
 5 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
 6 |     0.0000    0.0000    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
 7 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
 8 |     0.0000    0.0000    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
 9 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
10 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
11 |     0.0000    0.0000    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
12 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
13 |     0.0000    0.0000    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
14 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
15 |     0.0000    0.0000    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
16 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
17 |     0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
18 |     0.0000    0.0000    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
19 |   1  2  1  0  0  0  0
20 |   2 13  1  0  0  0  0
21 |   2  3  1  0  0  0  0
22 |   3  4  2  0  0  0  0
23 |   3  5  1  0  0  0  0
24 |   5  9  1  0  0  0  0
25 |   5  6  2  0  0  0  0
26 |   6  7  1  0  0  0  0
27 |   6 11  1  0  0  0  0
28 |   7  8  2  0  0  0  0
29 |   8  9  1  0  0  0  0
30 |   9 10  1  0  0  0  0
31 |  11 12  1  0  0  0  0
32 |  11 13  1  0  0  0  0
33 |  13 14  2  0  0  0  0
34 | M  END
35 | $$$$
36 | 


--------------------------------------------------------------------------------
/data/caffeine.smi:
--------------------------------------------------------------------------------
1 | Cn1c(=O)c2c(ncn2C)n(C)c1=O	caffeine
2 | 


--------------------------------------------------------------------------------
/data/caffeine_3d.sdf:
--------------------------------------------------------------------------------
 1 | caffeine
 2 |  OpenBabel10171811233D
 3 | 
 4 |  24 25  0  0  0  0  0  0  0  0999 V2000
 5 |    -1.4537    2.7848    0.2699 C   0  0  0  0  0  0  0  0  0  0  0  0
 6 |    -1.0108    1.4083    0.1062 N   0  0  0  0  0  0  0  0  0  0  0  0
 7 |     0.3015    1.1323    0.0489 C   0  0  0  0  0  0  0  0  0  0  0  0
 8 |     1.1081    2.0920    0.1407 O   0  0  0  0  0  0  0  0  0  0  0  0
 9 |     0.8161   -0.1286   -0.1033 C   0  0  0  0  0  0  0  0  0  0  0  0
10 |    -0.0929   -1.1771   -0.2031 C   0  0  0  0  0  0  0  0  0  0  0  0
11 |     0.6111   -2.3242   -0.3462 N   0  0  0  0  0  0  0  0  0  0  0  0
12 |     1.9386   -2.0269   -0.3392 C   0  0  0  0  0  0  0  0  0  0  0  0
13 |     2.0299   -0.6962   -0.1913 N   0  0  0  0  0  0  0  0  0  0  0  0
14 |     3.2729    0.0261   -0.1349 C   0  0  0  0  0  0  0  0  0  0  0  0
15 |    -1.4004   -0.8770   -0.1432 N   0  0  0  0  0  0  0  0  0  0  0  0
16 |    -2.3540   -1.9596   -0.2459 C   0  0  0  0  0  0  0  0  0  0  0  0
17 |    -1.8697    0.3771    0.0073 C   0  0  0  0  0  0  0  0  0  0  0  0
18 |    -3.0974    0.6510    0.0627 O   0  0  0  0  0  0  0  0  0  0  0  0
19 |    -0.6884    3.3191    0.8569 H   0  0  0  0  0  0  0  0  0  0  0  0
20 |    -1.5024    3.2204   -0.7549 H   0  0  0  0  0  0  0  0  0  0  0  0
21 |    -2.4690    2.8350    0.7286 H   0  0  0  0  0  0  0  0  0  0  0  0
22 |     2.7299   -2.7636   -0.4379 H   0  0  0  0  0  0  0  0  0  0  0  0
23 |     3.4783    0.4186    0.8888 H   0  0  0  0  0  0  0  0  0  0  0  0
24 |     4.1200   -0.5981   -0.4606 H   0  0  0  0  0  0  0  0  0  0  0  0
25 |     3.2700    0.9110   -0.8337 H   0  0  0  0  0  0  0  0  0  0  0  0
26 |    -1.8812   -2.8834    0.1466 H   0  0  0  0  0  0  0  0  0  0  0  0
27 |    -2.6277   -2.0396   -1.3222 H   0  0  0  0  0  0  0  0  0  0  0  0
28 |    -3.2286   -1.7014    0.3855 H   0  0  0  0  0  0  0  0  0  0  0  0
29 |   1  2  1  0  0  0  0
30 |   1 15  1  0  0  0  0
31 |   1 16  1  0  0  0  0
32 |   1 17  1  0  0  0  0
33 |   2  3  1  0  0  0  0
34 |   3  4  2  0  0  0  0
35 |   3  5  1  0  0  0  0
36 |   5  6  2  0  0  0  0
37 |   6  7  1  0  0  0  0
38 |   6 11  1  0  0  0  0
39 |   7  8  2  0  0  0  0
40 |   8  9  1  0  0  0  0
41 |   8 18  1  0  0  0  0
42 |   9 10  1  0  0  0  0
43 |   9  5  1  0  0  0  0
44 |  10 19  1  0  0  0  0
45 |  10 20  1  0  0  0  0
46 |  10 21  1  0  0  0  0
47 |  11 12  1  0  0  0  0
48 |  11 13  1  0  0  0  0
49 |  12 22  1  0  0  0  0
50 |  12 23  1  0  0  0  0
51 |  12 24  1  0  0  0  0
52 |  13 14  2  0  0  0  0
53 |  13  2  1  0  0  0  0
54 | M  END
55 | $$$$
56 | 


--------------------------------------------------------------------------------
/data/chemical_formulas.txt:
--------------------------------------------------------------------------------
  1 | Preliminary data related to chemical formulas
  2 | 
  3 | #all digits plus all elements (128 symbols)
  4 | 0
  5 | 1
  6 | 2
  7 | 3
  8 | 4
  9 | 5
 10 | 6
 11 | 7
 12 | 8
 13 | 9
 14 | Ac
 15 | Ag
 16 | Al
 17 | Am
 18 | Ar
 19 | As
 20 | At
 21 | Au
 22 | B
 23 | Ba
 24 | Be
 25 | Bh
 26 | Bi
 27 | Bk
 28 | Br
 29 | C
 30 | Ca
 31 | Cd
 32 | Ce
 33 | Cf
 34 | Cl
 35 | Cm
 36 | Cn
 37 | Co
 38 | Cr
 39 | Cs
 40 | Cu
 41 | Db
 42 | Ds
 43 | Dy
 44 | Er
 45 | Es
 46 | Eu
 47 | F
 48 | Fe
 49 | Fl
 50 | Fm
 51 | Fr
 52 | Ga
 53 | Gd
 54 | Ge
 55 | H
 56 | He
 57 | Hf
 58 | Hg
 59 | Ho
 60 | Hs
 61 | I
 62 | In
 63 | Ir
 64 | K
 65 | Kr
 66 | La
 67 | Li
 68 | Lr
 69 | Lu
 70 | Lv
 71 | Mc
 72 | Md
 73 | Mg
 74 | Mn
 75 | Mo
 76 | Mt
 77 | N
 78 | Na
 79 | Nb
 80 | Nd
 81 | Ne
 82 | Nh
 83 | Ni
 84 | No
 85 | Np
 86 | O
 87 | Og
 88 | Os
 89 | P
 90 | Pa
 91 | Pb
 92 | Pd
 93 | Pm
 94 | Po
 95 | Pr
 96 | Pt
 97 | Pu
 98 | Ra
 99 | Rb
100 | Re
101 | Rf
102 | Rg
103 | Rh
104 | Rn
105 | Ru
106 | S
107 | Sb
108 | Sc
109 | Se
110 | Sg
111 | Si
112 | Sm
113 | Sn
114 | Sr
115 | Ta
116 | Tb
117 | Tc
118 | Te
119 | Th
120 | Ti
121 | Tl
122 | Tm
123 | Ts
124 | U
125 | V
126 | W
127 | Xe
128 | Y
129 | Yb
130 | Zn
131 | Zr
132 | 
133 | #all digits plus all characters (55 symbols)
134 | 0
135 | 1
136 | 2
137 | 3
138 | 4
139 | 5
140 | 6
141 | 7
142 | 8
143 | 9
144 | A
145 | B
146 | C
147 | D
148 | E
149 | F
150 | G
151 | H
152 | I
153 | K
154 | L
155 | M
156 | N
157 | O
158 | P
159 | R
160 | S
161 | T
162 | U
163 | V
164 | W
165 | X
166 | Y
167 | Z
168 | a
169 | b
170 | c
171 | d
172 | e
173 | f
174 | g
175 | h
176 | i
177 | k
178 | l
179 | m
180 | n
181 | o
182 | p
183 | r
184 | s
185 | t
186 | u
187 | v
188 | y
189 | 


--------------------------------------------------------------------------------
/data/cisapride.smi:
--------------------------------------------------------------------------------
1 | COc1cc(N)c(Cl)cc1C(=O)N[C@@H]1CCN(CCCOc2ccc(F)cc2)C[C@@H]1OC	Cisapride
2 | 


--------------------------------------------------------------------------------
/data/co_1conf.sdf:
--------------------------------------------------------------------------------
 1 | carbon_monoxide
 2 |  OpenBabel06242216463D
 3 | 
 4 |   2  1  0  0  0  0  0  0  0  0999 V2000
 5 |     1.1581   -0.0369   -0.0512 C   0  5  0  0  0  0  0  0  0  0  0  0
 6 |     2.2151   -0.0369   -0.0512 O   0  3  0  0  0  0  0  0  0  0  0  0
 7 |   1  2  3  0  0  0  0
 8 | M  CHG  2   1  -1   2   1
 9 | M  END
10 | $$$$
11 | 


--------------------------------------------------------------------------------
/data/cocaine.smi:
--------------------------------------------------------------------------------
1 | COC(=O)[C@H]1[C@@H](OC(=O)c2ccccc2)C[C@@H]2CC[C@H]1N2C	cocaine
2 | 


--------------------------------------------------------------------------------
/data/ethanol.smi:
--------------------------------------------------------------------------------
1 | CCO	ethanol
2 | 


--------------------------------------------------------------------------------
/data/ethanol.uhd.dix.ref:
--------------------------------------------------------------------------------
 1 | UHD-1.0.0
 2 | C	1
 3 | C,CH2O	2
 4 | C,CH2O,H4	3
 5 | C,CH3	4
 6 | C,CH3,H2O	5
 7 | C,CH3,H2O,H	6
 8 | H	7
 9 | H,C	8
10 | H,C,CH2	9
11 | H,C,CH2,H2O	10
12 | H,C,CH2,H2O,H	11
13 | H,C,CHO	12
14 | H,C,CHO,H4	13
15 | H,O	14
16 | H,O,C	15
17 | H,O,C,CH2	16
18 | H,O,C,CH2,H3	17
19 | O	18
20 | O,CH	19
21 | O,CH,CH2	20
22 | O,CH,CH2,H3	21
23 | 


--------------------------------------------------------------------------------
/data/ethanol.uhd.ref:
--------------------------------------------------------------------------------
1 | ethanol,0.0,[1:2;2:1;3:1;4:1;5:1;6:1;7:6;8:5;9:3;10:3;11:3;12:2;13:2;14:1;15:1;16:1;17:1;18:1;19:1;20:1;21:1]
2 | 


--------------------------------------------------------------------------------
/data/gen_mols.txt:
--------------------------------------------------------------------------------
 1 | #atoms:13 genmol_000001_NCGC00260832-01_f01,NCGC00261763-01_f01
 2 | 0 1,6,3,0
 3 | 1 1,6,2,0
 4 | 2 1,6,2,0
 5 | 3 1,6,2,0
 6 | 4 1,6,2,0
 7 | 5 1,6,2,0
 8 | 6 0,7,2,0
 9 | 7 1,6,3,0
10 | 8 1,6,2,0
11 | 9 1,6,2,0
12 | 10 1,6,2,0
13 | 11 1,7,2,0
14 | 12 1,6,2,0
15 | #bonds:13
16 | 0 ~ 1
17 | 1 ~ 2
18 | 2 ~ 3
19 | 3 ~ 4
20 | 4 ~ 5
21 | 5 ~ 0
22 | 6 - 7
23 | 7 ~ 8
24 | 8 ~ 9
25 | 9 ~ 10
26 | 10 ~ 11
27 | 11 ~ 12
28 | 12 ~ 7
29 | #atoms:12 genmol_000002_NCGC00261552-01_f02,NCGC00261552-01_f02
30 | 0 1,6,3,0
31 | 1 1,6,2,0
32 | 2 1,6,2,0
33 | 3 1,6,2,0
34 | 4 1,6,2,0
35 | 5 1,6,2,0
36 | 6 1,6,3,0
37 | 7 1,6,2,0
38 | 8 1,6,2,0
39 | 9 1,6,2,0
40 | 10 1,6,2,0
41 | 11 1,6,2,0
42 | #bonds:12
43 | 0 ~ 1
44 | 1 ~ 2
45 | 2 ~ 3
46 | 3 ~ 4
47 | 4 ~ 5
48 | 5 ~ 0
49 | 6 ~ 7
50 | 7 ~ 8
51 | 8 ~ 9
52 | 9 ~ 10
53 | 10 ~ 11
54 | 11 ~ 6
55 | #atoms:2 genmol_000003_NCGC00260832-01_f00,NCGC00260832-01_f00
56 | 0 0,6,1,0
57 | 1 0,6,1,0
58 | #bonds:0
59 | 


--------------------------------------------------------------------------------
/data/h2o_1conf.sdf:
--------------------------------------------------------------------------------
 1 | water
 2 |  OpenBabel07042215033D
 3 | 
 4 |   3  2  0  0  0  0  0  0  0  0999 V2000
 5 |     0.9794    0.0672    0.0986 H   0  0  0  0  0  0  0  0  0  0  0  0
 6 |     1.9473    0.0539    0.0541 O   0  0  0  0  0  0  0  0  0  0  0  0
 7 |     2.2261    0.3184    0.9436 H   0  0  0  0  0  0  0  0  0  0  0  0
 8 |   1  2  1  0  0  0  0
 9 |   2  3  1  0  0  0  0
10 | M  END
11 | $$$$
12 | 


--------------------------------------------------------------------------------
/data/merge.txt:
--------------------------------------------------------------------------------
1 | n1 1 -3
2 | n2 2 -2
3 | n3 3 -1
4 | n4 4 1
5 | n5 5 2
6 | n6 6 3
7 | 


--------------------------------------------------------------------------------
/data/ptable.txt:
--------------------------------------------------------------------------------
  1 | #anum	Symbol	prime
  2 | 1	H	2
  3 | 2	He	3
  4 | 3	Li	5
  5 | 4	Be	7
  6 | 5	B	11
  7 | 6	C	13
  8 | 7	N	17
  9 | 8	O	19
 10 | 9	F	23
 11 | 10	Ne	29
 12 | 11	Na	31
 13 | 12	Mg	37
 14 | 13	Al	41
 15 | 14	Si	43
 16 | 15	P	47
 17 | 16	S	53
 18 | 17	Cl	59
 19 | 18	Ar	61
 20 | 19	K	67
 21 | 20	Ca	71
 22 | 21	Sc	73
 23 | 22	Ti	79
 24 | 23	V	83
 25 | 24	Cr	89
 26 | 25	Mn	97
 27 | 26	Fe	101
 28 | 27	Co	103
 29 | 28	Ni	107
 30 | 29	Cu	109
 31 | 30	Zn	113
 32 | 31	Ga	127
 33 | 32	Ge	131
 34 | 33	As	137
 35 | 34	Se	139
 36 | 35	Br	149
 37 | 36	Kr	151
 38 | 37	Rb	157
 39 | 38	Sr	163
 40 | 39	Y	167
 41 | 40	Zr	173
 42 | 41	Nb	179
 43 | 42	Mo	181
 44 | 43	Tc	191
 45 | 44	Ru	193
 46 | 45	Rh	197
 47 | 46	Pd	199
 48 | 47	Ag	211
 49 | 48	Cd	223
 50 | 49	In	227
 51 | 50	Sn	229
 52 | 51	Sb	233
 53 | 52	Te	239
 54 | 53	I	241
 55 | 54	Xe	251
 56 | 55	Cs	257
 57 | 56	Ba	263
 58 | 57	La	269
 59 | 58	Ce	271
 60 | 59	Pr	277
 61 | 60	Nd	281
 62 | 61	Pm	283
 63 | 62	Sm	293
 64 | 63	Eu	307
 65 | 64	Gd	311
 66 | 65	Tb	313
 67 | 66	Dy	317
 68 | 67	Ho	331
 69 | 68	Er	337
 70 | 69	Tm	347
 71 | 70	Yb	349
 72 | 71	Lu	353
 73 | 72	Hf	359
 74 | 73	Ta	367
 75 | 74	W	373
 76 | 75	Re	379
 77 | 76	Os	383
 78 | 77	Ir	389
 79 | 78	Pt	397
 80 | 79	Au	401
 81 | 80	Hg	409
 82 | 81	Tl	419
 83 | 82	Pb	421
 84 | 83	Bi	431
 85 | 84	Po	433
 86 | 85	At	439
 87 | 86	Rn	443
 88 | 87	Fr	449
 89 | 88	Ra	457
 90 | 89	Ac	461
 91 | 90	Th	463
 92 | 91	Pa	467
 93 | 92	U	479
 94 | 93	Np	487
 95 | 94	Pu	491
 96 | 95	Am	499
 97 | 96	Cm	503
 98 | 97	Bk	509
 99 | 98	Cf	521
100 | 99	Es	523
101 | 100	Fm	541
102 | 101	Md	547
103 | 102	No	557
104 | 103	Lr	563
105 | 104	Rf	569
106 | 105	Db	571
107 | 106	Sg	577
108 | 107	Bh	587
109 | 108	Hs	593
110 | 109	Mt	599
111 | 110	Ds	601
112 | 111	Rg	607
113 | 112	Cn	613
114 | 113	Nh	617
115 | 114	Fl	619
116 | 115	Mc	631
117 | 116	Lv	641
118 | 117	Ts	643
119 | 118	Og	647
120 | 


--------------------------------------------------------------------------------
/data/test_HYD_group.sdf:
--------------------------------------------------------------------------------
 1 | test
 2 |  OpenBabel08222215263D
 3 | 
 4 |  17 16  0  0  0  0  0  0  0  0999 V2000
 5 |     0.9307   -0.0311   -0.0651 C   0  0  0  0  0  0  0  0  0  0  0  0
 6 |     0.4214   -0.4462   -1.4445 C   0  0  0  0  0  0  0  0  0  0  0  0
 7 |     0.4214   -1.0181    0.9841 C   0  0  0  0  0  0  0  0  0  0  0  0
 8 |     0.4214    1.3711    0.2651 C   0  0  0  0  0  0  0  0  0  0  0  0
 9 |     2.4586   -0.0311   -0.0651 C   0  0  0  0  0  0  0  0  0  0  0  0
10 |    -0.6743   -0.4546   -1.4725 H   0  0  0  0  0  0  0  0  0  0  0  0
11 |     0.7728   -1.4507   -1.7071 H   0  0  0  0  0  0  0  0  0  0  0  0
12 |     0.7728    0.2465   -2.2179 H   0  0  0  0  0  0  0  0  0  0  0  0
13 |    -0.6743   -1.0382    1.0055 H   0  0  0  0  0  0  0  0  0  0  0  0
14 |     0.7728   -0.7433    1.9853 H   0  0  0  0  0  0  0  0  0  0  0  0
15 |     0.7728   -2.0342    0.7709 H   0  0  0  0  0  0  0  0  0  0  0  0
16 |    -0.6743    1.3996    0.2718 H   0  0  0  0  0  0  0  0  0  0  0  0
17 |     0.7728    2.1008   -0.4735 H   0  0  0  0  0  0  0  0  0  0  0  0
18 |     0.7728    1.6945    1.2517 H   0  0  0  0  0  0  0  0  0  0  0  0
19 |     2.8515   -1.0271   -0.2996 H   0  0  0  0  0  0  0  0  0  0  0  0
20 |     2.8515    0.2638    0.9148 H   0  0  0  0  0  0  0  0  0  0  0  0
21 |     2.8515    0.6701   -0.8104 H   0  0  0  0  0  0  0  0  0  0  0  0
22 |   1  2  1  0  0  0  0
23 |   1  3  1  0  0  0  0
24 |   1  4  1  0  0  0  0
25 |   1  5  1  0  0  0  0
26 |   2  6  1  0  0  0  0
27 |   2  7  1  0  0  0  0
28 |   2  8  1  0  0  0  0
29 |   3  9  1  0  0  0  0
30 |   3 10  1  0  0  0  0
31 |   3 11  1  0  0  0  0
32 |   4 12  1  0  0  0  0
33 |   4 13  1  0  0  0  0
34 |   4 14  1  0  0  0  0
35 |   5 15  1  0  0  0  0
36 |   5 16  1  0  0  0  0
37 |   5 17  1  0  0  0  0
38 | M  END
39 | $$$$
40 | 


--------------------------------------------------------------------------------
/data/test_HYD_group.smi:
--------------------------------------------------------------------------------
1 | C(C)(C)(C)(C)	test
2 | 


--------------------------------------------------------------------------------
/data/test_mols.txt:
--------------------------------------------------------------------------------
1 | 1,0.0,[0:1;2:1;3:5]
2 | 2,0.0,[0:3;1:3;2:3;3:3]
3 | 3,0.0,[2:4]
4 | 4,0.0,[0:5]
5 | 5,0.0,[1:5]
6 | 6,0.0,[2:5]
7 | 7,0.0,[3:5]
8 | 


--------------------------------------------------------------------------------
/deepsmi_test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #set -x
 4 | 
 5 | head ~/src/FMGO/data/TCM_20k.smi > input.smi
 6 | dos2unix input.smi
 7 | ./bin/molenc_deepsmi.py --no-rings    -i input.smi   -o output.dsmi
 8 | ./bin/molenc_deepsmi.py --no-rings -d -i output.dsmi -o output.smi
 9 | diff input.smi output.smi
10 | rm -f output.{dsmi,smi}
11 | 
12 | ./bin/molenc_deepsmi.py --no-branches    -i input.smi   -o output.dsmi
13 | ./bin/molenc_deepsmi.py --no-branches -d -i output.dsmi -o output.smi
14 | diff input.smi output.smi
15 | rm -f output.{dsmi,smi}
16 | 
17 | ./bin/molenc_deepsmi.py --no-branches --no-rings    -i input.smi   -o output.dsmi
18 | ./bin/molenc_deepsmi.py --no-branches --no-rings -d -i output.dsmi -o output.smi
19 | diff input.smi output.smi
20 | rm -f output.{dsmi,smi}
21 | 


--------------------------------------------------------------------------------
/doc/Ester_KDD_1996_DBSCANclustering.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UnixJunkie/molenc/edc27db8206e6cbca4409b962426c94f3d14e18d/doc/Ester_KDD_1996_DBSCANclustering.pdf


--------------------------------------------------------------------------------
/doc/Shrivastava_2016_ExactWeightedMinwiseHashing.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UnixJunkie/molenc/edc27db8206e6cbca4409b962426c94f3d14e18d/doc/Shrivastava_2016_ExactWeightedMinwiseHashing.pdf


--------------------------------------------------------------------------------
/dune-project:
--------------------------------------------------------------------------------
1 | (lang dune 1.11)
2 | (name molenc)
3 | 


--------------------------------------------------------------------------------
/fcodec:
--------------------------------------------------------------------------------
1 | _build/default/src/molenc_fcodec.exe


--------------------------------------------------------------------------------
/histo.gpl:
--------------------------------------------------------------------------------
 1 | 
# Fit one Gaussian per histogram column of the data file, then plot both
# histograms together with their fitted curves.
# Data file layout, as used below: column 1 is the x value (labelled 'score'),
# columns 2 and 3 are the two histograms ('smaller' and 'bigger' sample).
set xlabel 'score'
set ylabel 'frequency'

# gauss1(x) = a1 / (sigma1*sqrt(2.*pi)) * exp(-(x-mu1)**2. / (2.*sigma1**2))
# gauss2(x) = a2 / (sigma2*sqrt(2.*pi)) * exp(-(x-mu2)**2. / (2.*sigma2**2))

# scaled Gaussians: a = amplitude factor, sigma = std. dev., mean = center
gauss1(x) = a1/(sqrt(2*pi)*sigma1)*exp(-(x-mean1)**2/(2*sigma1**2))
gauss2(x) = a2/(sqrt(2*pi)*sigma2)*exp(-(x-mean2)**2/(2*sigma2**2))

# FBR: we need to init the mean with a good value so that the optim
#      will converge
mean1 = -8
mean2 = -8

# NOTE(review): the data file name is hard-coded; it looks like a generated
# temporary file name -- confirm which script is supposed to produce it
fit gauss1(x) '/tmp/lean_histo_abe414.txt' u 1:2 via a1,sigma1,mean1
fit gauss2(x) '/tmp/lean_histo_abe414.txt' u 1:3 via a2,sigma2,mean2

# the second histogram is drawn with x shifted by 0.01 (presumably so that
# overlapping curves remain distinguishable)
plot '/tmp/lean_histo_abe414.txt' u 1:2         w l t 'smaller sample', \
     ''                           u ($1+0.01):3 w l t 'bigger sample', \
     gauss1(x) t 'smaller fit', \
     gauss2(x) t 'bigger fit'
23 | 


--------------------------------------------------------------------------------
/kb_test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -x
 4 | 
 5 | ~/src/molenc/kbe -i all_uniq_std.txt -k 64 > test_64_1xCPU.txt
 6 | sort test_64_1xCPU.txt -o test_64_1xCPU.txt
 7 | 
 8 | ~/src/molenc/kbe -np 16 -i all_uniq_std.txt -k 64 > test_64_16xCPU.txt
 9 | sort test_64_16xCPU.txt -o test_64_16xCPU.txt
10 | 
11 | diff test_64_1xCPU.txt test_64_16xCPU.txt
12 | 


--------------------------------------------------------------------------------
/mol_frag_test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # set -x #DEBUG
 4 | 
 5 | # --view --> call mview on .smi files
 6 | MVIEW=""
 7 | if [ "$1" == "--view" ] || [ "$2" == "--view" ]; then
 8 |     MVIEW="1"
 9 | fi
10 | 
11 | # --big --> work on the "big" dataset
12 | BIG=""
13 | if [ "$1" == "--big" ] || [ "$2" == "--big" ]; then
14 |     BIG="1"
15 | fi
16 | 
17 | if [ "$BIG" == "" ]; then
18 |     # clean
19 |     rm -f data/3.to_frag data/3_frags.txt data/3_frags.smi \
20 |        data/3_genmols.txt data/3_genmols.smi
21 |     # regen
22 |     ./bin/molenc_frag.py -i data/3.smi -o data/3.to_frag
23 |     [ "$MVIEW" == "1" ] && mview data/3.smi &
24 |     ./molenc_frag -im data/3.to_frag -of data/3_frags.txt -s 1234
25 |     ./bin/molenc_frag2smi.py -i data/3_frags.txt -o data/3_frags.smi
26 |     [ "$MVIEW" == "1" ] && mview data/3_frags.smi &
27 |     ./molenc_frag -if data/3_frags.txt -om data/3_genmols.txt -s 1234 -n 20
28 |     ./bin/molenc_mol2smi.py -i data/3_genmols.txt -o data/3_genmols.smi
29 |     cut -f1 data/3_genmols.smi | sort -u > data/3_genmols_uniq.smi
30 |     [ "$MVIEW" == "1" ] && mview data/3_genmols_uniq.smi &
31 | else
32 |     IN=data/chembl_antivirals
33 |     ./bin/molenc_frag.py -i $IN.smi -o $IN.to_frag --draw
34 |     ./molenc_frag -im $IN.to_frag -of $IN.frags -s 1234
35 |     ./bin/molenc_frag2smi.py -i $IN.frags -o $IN.frags.smi
36 |     ./molenc_frag -if $IN.frags -om $IN.mols -s 1234 -n 50
37 |     ./bin/molenc_mol2smi.py -i $IN.mols -o $IN.mols.smi
38 | fi
39 | 


--------------------------------------------------------------------------------
/molenc_frag:
--------------------------------------------------------------------------------
1 | ./_build/install/default/bin/molenc_frag


--------------------------------------------------------------------------------
/rfp:
--------------------------------------------------------------------------------
1 | _build/default/src/molenc_RFP.exe


--------------------------------------------------------------------------------
/smisur_test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -x #DEBUG
 4 | 
 5 | # clean
 6 | rm -f data/chembl_antivirals.frags.smi data/chembl_antivirals.genmols.txt
 7 | # fragment
 8 | ./bin/molenc_smisur.py --seed 1234 \
 9 |                        -i data/chembl_antivirals.smi \
10 |                        -o data/chembl_antivirals.frags.smi
11 | # assemble
12 | ./bin/molenc_smisur.py --seed 1234 --assemble \
13 |                        -i data/chembl_antivirals.frags.smi \
14 |                        -o data/chembl_antivirals.genmols.txt
15 | 


--------------------------------------------------------------------------------
/src/MSE_mol.ml:
--------------------------------------------------------------------------------
  1 | (* Copyright (C) 2020, Francois Berenger
  2 | 
  3 |    Yamanishi laboratory,
  4 |    Department of Bioscience and Bioinformatics,
  5 |    Faculty of Computer Science and Systems Engineering,
  6 |    Kyushu Institute of Technology,
  7 |    680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)
  8 | 
  9 | (* Multi-Scale-Encoded molecule *)
 10 | 
 11 | open Printf
 12 | 
 13 | module L = MyList
 14 | module Log = Dolog.Log
 15 | module String = BatString
 16 | module StringMap = BatMap.String
 17 | 
 18 | type t = { name: string; map: int StringMap.t }
 19 | 
 20 | let create name map =
 21 |   { name; map }
 22 | 
 23 | let get_name x =
 24 |   x.name
 25 | 
 26 | let get_map x =
 27 |   x.map
 28 | 
 29 | let feat_count_of_string s =
 30 |   try Scanf.sscanf s "%s %d" (fun s d -> (s, d))
 31 |   with exn -> (eprintf "MSE_mol.feat_count_of_string: cannot parse: %s" s;
 32 |                raise exn)
 33 | 
 34 | (* to construct one molecules with all its constituent lines
 35 |    already read from the input file *)
 36 | let read_one = function
 37 |   | [] -> failwith "MSE_mol.read_one: empty list"
 38 |   | name_line :: feat_count_strs ->
 39 |     (* molecule separator is a line starting with a '#' char *)
 40 |     assert(String.get name_line 0 = '#');
 41 |     let name = String.lchop name_line in (* remove it *)
 42 |     let map =
 43 |       List.fold_left (fun acc line ->
 44 |           let feat, count = feat_count_of_string line in
 45 |           (* feature cannot already be here; otherwise,
 46 |              there was a problem during encoding of the molecule *)
 47 |           if StringMap.mem feat acc then
 48 |             Log.warn "mol: %s dup feat: %s" name feat;
 49 |           StringMap.add feat count acc
 50 |         ) StringMap.empty feat_count_strs in
 51 |     create name map
 52 | 
 53 | let previous_name = ref ""
 54 | 
 55 | exception Break
 56 | 
 57 | (* get lines for just one molecule (i.e. for one call to read_one after) *)
 58 | let get_lines input =
 59 |   let acc = ref [] in
 60 |   if !previous_name = "" then
 61 |     begin
 62 |       let line = input_line input in
 63 |       assert(BatString.starts_with line "#"); (* enforce name line *)
 64 |       previous_name := line
 65 |     end;
 66 |   acc := [!previous_name];
 67 |   try
 68 |     while true do
 69 |       let line' = input_line input in
 70 |       if BatString.starts_with line' "#" then
 71 |         (* this is the start of another molecule *)
 72 |         begin
 73 |           previous_name := line';
 74 |           raise Break
 75 |         end
 76 |       else
 77 |         acc := line' :: !acc
 78 |     done;
 79 |     assert(false) (* for typing: should never be reached at exec *)
 80 |   with Break -> L.rev !acc
 81 |      | End_of_file ->
 82 |        begin
 83 |          previous_name := "";
 84 |          L.rev !acc
 85 |        end
 86 | 
(* Build all the molecules found in [lines], in input order.
   [lines] must be a sequence of records, each made of one name line
   (starting with '#') followed by its "feature count" lines.
   NOTE(review): this relies on [L.fold_while] (from MyList) returning the
   accumulated prefix together with the not yet consumed rest of the list;
   confirm against MyList. *)
let of_lines lines =
  let rec loop acc ls =
    match ls with
    | [] -> L.rev acc
    | _ ->
      (* take the leading run of '#' lines: exactly one is expected *)
      let name_l, rest =
        L.fold_while
          (fun _acc l -> String.starts_with l "#")
          (fun acc x -> x :: acc) [] ls in
      (match name_l with
       | [name] ->
         (* then take the lines up to (excluding) the next '#' line;
            they are accumulated in reverse order, which is harmless since
            read_one stores them in a map *)
         (let feat_counts, remaining_mols =
            L.fold_while
              (fun _acc l -> not (String.starts_with l "#"))
              (fun acc x -> x :: acc) [] rest in
          let mol = read_one (name :: feat_counts) in
          loop (mol :: acc) remaining_mols)
       (* no name line, or several consecutive ones (e.g. a molecule
          without any feature line), end up here *)
       | _ -> assert(false)) in
  loop [] lines
106 | 


--------------------------------------------------------------------------------
/src/ap_types.ml:
--------------------------------------------------------------------------------
 1 | (* Copyright (C) 2020, Francois Berenger
 2 | 
 3 |    Yamanishi laboratory,
 4 |    Department of Bioscience and Bioinformatics,
 5 |    Faculty of Computer Science and Systems Engineering,
 6 |    Kyushu Institute of Technology,
 7 |    680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)
 8 | 
 9 | (* read files output by ./bin/type_atoms.py *)
10 | 
11 | module A = Array
12 | module IntSet = BatSet.Int
13 | module L = BatList
14 | 
(* Read the next molecule from [input] and return it as a Mini_mol.
   Expected layout, in this order:
   - "#atoms:<nb_atoms> <name>", then nb_atoms lines "<index> <type>"
   - "#bonds:<nb_bonds>", then nb_bonds lines "<start> <stop>"
   - "#diameter:<d>", then nb_atoms lines of space-separated distances
   [counter] is incremented once the molecule was completely read.
   Exceptions from the channel (e.g. End_of_file) and from Scanf are not
   caught here. *)
let read_one counter input =
  (* "#atoms:14 caffeine" *)
  let atoms_header = input_line input in
  let nb_atoms, mol_name =
    Scanf.sscanf atoms_header "#atoms:%d %s"
      (fun nb_atoms name -> (nb_atoms, name)) in
  (* read atoms *)
  let atom_lines = Utls.read_n_lines nb_atoms input in
  let atoms =
    L.map (fun l ->
        (* the atom index is ignored: atoms are taken in file order *)
        Scanf.sscanf l "%d %s"
          (fun _index typ -> PiEltHA.of_string typ)
      ) atom_lines in
  let atom_types = A.of_list atoms in
  (* read bonds header; like "#bonds:15" *)
  let bonds_header = input_line input in
  let nb_bonds = Scanf.sscanf bonds_header "#bonds:%d" (fun n -> n) in
  (* read bonds *)
  let bond_lines = Utls.read_n_lines nb_bonds input in
  (* succs_table.(i): set of the atom indexes bonded to atom i *)
  let succs_table = A.make nb_atoms IntSet.empty in
  L.iter (fun bond_line ->
      Scanf.sscanf bond_line "%d %d" (fun start stop ->
          (* an atom cannot be bonded to itself *)
          assert(start <> stop);
          (* we need to add the bond two times, because the molecular
             graph is undirected *)
          (* start -> stop *)
          succs_table.(start) <- IntSet.add stop succs_table.(start);
          (* stop -> start *)
          succs_table.(stop) <- IntSet.add start succs_table.(stop)
        )
    ) bond_lines;
  (* read distance matrix *)
  (* matrix header line *)
  let matrix_header = input_line input in
  let diameter = Scanf.sscanf matrix_header "#diameter:%d" (fun n -> n) in
  (* matrix' content *)
  let matrix_lines = Utls.read_n_lines nb_atoms input in
  let matrix = Array.make_matrix nb_atoms nb_atoms 0 in
  L.iteri (fun i line ->
      (* one matrix row per line; a row with more than nb_atoms values
         triggers an out of bounds access *)
      let dist_strings = BatString.split_on_string line ~by:" " in
      L.iteri (fun j str ->
          let d = int_of_string str in
          matrix.(i).(j) <- d
        ) dist_strings
    ) matrix_lines;
  (* one node per atom: its type plus the set of its neighbors *)
  let nodes =
    A.mapi (fun i typ ->
        Node.create typ succs_table.(i)
      ) atom_types in
  incr counter;
  Mini_mol.create mol_name nodes diameter matrix
66 | 


--------------------------------------------------------------------------------
/src/atom_env.ml:
--------------------------------------------------------------------------------
 1 | (* Copyright (C) 2020, Francois Berenger
 2 | 
 3 |    Yamanishi laboratory,
 4 |    Department of Bioscience and Bioinformatics,
 5 |    Faculty of Computer Science and Systems Engineering,
 6 |    Kyushu Institute of Technology,
 7 |    680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)
 8 | 
 9 | (* atom environment *)
10 | 
11 | open Printf
12 | 
13 | module L = BatList
14 | module Log = Dolog.Log
15 | 
16 | (*   layer = (depth, counted-atoms) *)
17 | type layer = int * ((PiEltHA.t * int) list)
18 | (*       center-atom layers *)
19 | type t = layer list
20 | 
(* Serialize counted atom types as "type:count" items joined by ','.
   The empty list gives the empty string. *)
let counted_types_to_string (l: (PiEltHA.t * int) list): string =
  let render (typ, count) = sprintf "%s:%d" (PiEltHA.to_string typ) count in
  String.concat "," (L.map render l)
28 | 
(* Inverse of counted_types_to_string: parse "type:count" items separated
   by ','. The empty string is parsed as the empty list, since this is what
   counted_types_to_string produces for [].
   The format uses the "@:" scanning indication: a plain "%s" only stops at
   whitespace or at the end of the input, so "%s:%d" would swallow the whole
   item and then fail to match the ':'.
   NOTE(review): assumes that a type string never contains ':' or ',' --
   confirm against PiEltHA.to_string.
   Raises whatever Scanf.sscanf raises on a malformed item. *)
let counted_types_of_string (s: string): (PiEltHA.t * int) list =
  if s = "" then
    []
  else
    let strings = BatString.split_on_string s ~by:"," in
    L.map (fun str -> Scanf.sscanf str "%s@:%d" Utls.make_pair) strings
32 | 
(* Serialize one layer as "<depth>_<counted types>". *)
let layer_to_string ((depth, counted_types): layer): string =
  string_of_int depth ^ "_" ^ counted_types_to_string counted_types
35 | 
(* Inverse of layer_to_string: parse "<depth>_<counted types>".
   The depth is read with "%[0-9]" rather than "%d": Scanf's "%d" accepts
   '_' characters inside numbers, so it would consume the separator and the
   literal '_' of the format could then never match.
   Raises Scanf.Scan_failure if the depth or the separator is missing;
   exceptions from counted_types_of_string are not caught. *)
let layer_of_string (str: string): layer =
  Scanf.sscanf str "%[0-9]_%s" (fun digits types ->
      if digits = "" then
        raise (Scanf.Scan_failure
                 ("Atom_env.layer_of_string: missing depth: " ^ str));
      (int_of_string digits, counted_types_of_string types)
    )
40 | 
(* Serialize an atom environment: its layers, in order, joined by ';'. *)
let to_string (layers: t): string =
  String.concat ";" (L.map layer_to_string layers)
48 | 
(* Parse an atom environment: ';'-separated layers. *)
let of_string (s: string): t =
  s
  |> BatString.split_on_string ~by:";"
  |> L.map layer_of_string
52 | 
(* Parse the 1st line of a .idx file.
   Returns (radius, remaining lines), or (-1, []) if the file has no
   comment header. *)
let parse_index_comment fn =
  match Utls.maybe_extract_comment_header fn with
  | (None, _) -> (-1, [])
  | (Some comment, index_lines) ->
    (Scanf.sscanf comment "#radius=%d" (fun radius -> radius), index_lines)
61 | 
(* Parse the 1st line of a .mop2d file.
   Returns (radius, index filename, molecule lines); radius is -1 and the
   index filename is "/dev/null" if the file has no comment header. *)
let parse_molecules_comment fn =
  match Utls.maybe_extract_comment_header fn with
  | (None, mol_lines) -> (-1, "/dev/null", mol_lines)
  | (Some comment, mol_lines) ->
    Scanf.sscanf comment "#radius=%d;index=%s"
      (fun radius index_fn -> (radius, index_fn, mol_lines))
72 | 
(* Parse the 1st line of an already opened .mop2d file
   (and advance the file pointer: the line is consumed even if it is not
   a valid header).
   Returns (radius, index filename) for a "#radius=<int>;index=<filename>"
   line, and (-1, "/dev/null") for any other line.
   Raises End_of_file only if there is no line left to read in [input]. *)
let parse_comment input =
  (* read outside of the handler: End_of_file here means "no more input"
     and must reach the caller *)
  let first_line = input_line input in
  try (* we are parsing a valid .mop2d file *)
    Scanf.sscanf first_line
      "#radius=%d;index=%s" (fun r fn -> (r, fn))
  with (* we are not *)
  (* sscanf signals a line shorter than the format with End_of_file and
     an integer it cannot convert with Failure *)
  | Scanf.Scan_failure _ | End_of_file | Failure _ -> (-1, "/dev/null")
81 | 
(* Load a .idx file into a hash table mapping each atom environment to its
   rank (0-based) among the index lines.
   Returns (radius, table); a duplicated environment fails an assertion. *)
let restore_mop2d_index fn =
  let radius, index_lines = parse_index_comment fn in
  (* parse all lines first, then fill the table *)
  let envs = L.map of_string index_lines in
  let env2index = Hashtbl.create 11 in
  let register i env =
    assert(not (Hashtbl.mem env2index env));
    Hashtbl.add env2index env i in
  L.iteri register envs;
  Log.info "index size: %d" (Hashtbl.length env2index);
  (radius, env2index)
94 | 


--------------------------------------------------------------------------------
/src/atom_pair.ml:
--------------------------------------------------------------------------------
 1 | (* Copyright (C) 2020, Francois Berenger
 2 | 
 3 |    Yamanishi laboratory,
 4 |    Department of Bioscience and Bioinformatics,
 5 |    Faculty of Computer Science and Systems Engineering,
 6 |    Kyushu Institute of Technology,
 7 |    680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan.
 8 | 
 9 |    An atom pair *)
10 | 
11 | (* FBR: TO DROP *)
12 | 
type t = { src: PiEltHA.t; (* source atom *)
           dst: PiEltHA.t; (* destination atom *)
           dist: int } (* distance between them in bonds *)

(* canonical form: the smaller of the two types (w.r.t. PiEltHA.compare)
   is stored as [src], the other one as [dst] *)
let create x y dist =
  let src, dst =
    if PiEltHA.compare x y <= 0 then (x, y) else (y, x) in
  { src; dst; dist }

(* "<src>-<dist>-<dst>" *)
let to_string pair =
  Printf.sprintf "%s-%d-%s" pair.src pair.dist pair.dst

(* distance accessor *)
let dist { dist; _ } = dist
29 | 


--------------------------------------------------------------------------------
/src/bloom.ml:
--------------------------------------------------------------------------------
 1 | 
 2 | (* A counted Bloom filter *)
 3 | 
 4 | module A = BatArray
 5 | module Fp = Fingerprint
 6 | module L = BatList
 7 | module Log = Dolog.Log
 8 | 
 9 | type t = int array array (* input feature index (0..N-1) to output feature
10 |                             indexes mapping (0..M-1) *)
11 | 
(* Draw [n] distinct integers from [0, bound) using [rng].
   The result lists them in reverse order of drawing.
   Raises Invalid_argument if [n] < 0 or [n] > [bound]: there are not enough
   distinct values then, and the rejection loop below would never end. *)
let distinct_rands rng n bound =
  if n < 0 || n > bound then
    invalid_arg
      (Printf.sprintf
         "Bloom.distinct_rands: cannot draw %d distinct values in [0,%d)"
         n bound);
  let rec loop acc count =
    if count = n then
      acc
    else
      let cand = Random.State.int rng bound in
      if List.mem cand acc then
        loop acc count (* already drawn: retry *)
      else
        loop (cand :: acc) (count + 1) in
  loop [] 0
23 | 
(* Create the feature mappings of a counted Bloom filter.
   n: input vector dimension
   k: number of "hash" functions;
      number of output features "turned ON" by a single input feature
   m: output vector dimension
   Returns (n, k, m, mappings) where mappings.(i) holds the k distinct output
   indexes (in [0, m)) of input feature i.
   The RNG seed is fixed: the same (n, k, m) always gives the same mappings.
   NOTE(review): the returned value is a 4-tuple, not a value of type [t]
   (which only describes the mappings). *)
let init n k m =
  let res = Array.make_matrix n k 0 in
  let rng = Random.State.make [|3141596|] in
  for i = 0 to n - 1 do
    let rands = distinct_rands rng k m in
    L.iteri (fun j rand ->
        res.(i).(j) <- rand
      ) rands
  done;
  (* log the number of collisions
     (different input features mapping to the same set of output features) *)
  let collisions = ref 0 in
  (* Compare rows as sets: each row is copied and sorted, so that two rows
     holding the same indexes in a different order are detected too.
     Working on copies leaves [res] untouched. *)
  let sorted =
    A.map (fun row ->
        let canonical = A.copy row in
        A.sort compare canonical;
        canonical
      ) res in
  (* after sorting the rows themselves, equal rows are adjacent *)
  A.sort compare sorted;
  for i = 1 to n - 1 do
    if sorted.(i - 1) = sorted.(i) then
      incr collisions
  done;
  (if !collisions > 0 then
     Log.warn "Bloom.init(%d,%d,%d): %d collisions" n k m !collisions
  );
  (n, k, m, res)
50 | 
(* Encode the sparse fingerprint [fp] as a dense vector of size m.
   The first argument is the tuple returned by [init].
   The value of each input feature is added to each of the k output
   features it maps to. *)
let encode (_n, k, m, mappings) fp =
  let kvs = Fp.key_value_pairs fp in (* sparse input vector *)
  let dense = A.make m 0 in (* dense output vector *)
  let spread (feat, count) =
    let targets = mappings.(feat) in
    (* increment all corresponding output features *)
    for i = 0 to k - 1 do
      let out = targets.(i) in
      dense.(out) <- dense.(out) + count
    done in
  L.iter spread kvs;
  dense
63 | 


--------------------------------------------------------------------------------
/src/bond.ml:
--------------------------------------------------------------------------------
 1 | (* Copyright (C) 2020, Francois Berenger
 2 | 
 3 |    Yamanishi laboratory,
 4 |    Department of Bioscience and Bioinformatics,
 5 |    Faculty of Computer Science and Systems Engineering,
 6 |    Kyushu Institute of Technology,
 7 |    680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)
 8 | 
 9 | (* very small bond module to help compute atom environments (a la molprint2d)
10 |    from MOL2 files *)
11 | 
12 | open Printf
13 | 
type t = { idx: int ;
           src: int ;
           dst: int }

(* indexes start at 1 in the MOL2 file format; they are stored 0-based *)
let create (idx: int) (src: int) (dst: int): t =
  let zero_based i = i - 1 in
  { idx = zero_based idx;
    src = zero_based src;
    dst = zero_based dst }

(* placeholder bond; since it goes through [create], all its fields are -2 *)
let dummy = create (-1) (-1) (-1)

(* example line (output of OpenBabel):
   "     1     1    11   ar"
   i.e. bond index, source atom, destination atom, then one more token
   which is ignored. Raises Failure if the line cannot be parsed. *)
let of_mol2_line l =
  let build idx src dst _ignored = create idx src dst in
  try Scanf.sscanf l " %d %d %d %s" build
  with _ -> failwith ("Bond.of_mol2_line: could not parse: " ^ l)

(* "<idx> <src> <dst>", using the stored (0-based) values *)
let to_string (a: t): string =
  String.concat " " (List.map string_of_int [a.idx; a.src; a.dst])
33 | 


--------------------------------------------------------------------------------
/src/dune:
--------------------------------------------------------------------------------
 1 | 
;; the molenc library: it is listed in the (libraries ...) field of both
;; executables stanzas of this file
(library
  (name molenc)
  (public_name molenc)
  (modules ap_types atom_env fingerprint fpMol atom_pair mini_mol MSE_mol
           myList node piEltHA scale utls WMH norm bloom index rdkit sdf_3D gram ptable)
  (libraries batteries dolog bst parany pyml vector3 line_oriented str))
 8 | 
 9 | ;; installed executables / public targets
10 | (executables
11 |   (names encoder decoder filter butina pubchem_decoder uniq to_dense get_mol
12 |          rank ap_encoder prune merge MST lig_box shannon fragmentable_mol
13 |          indexer pareto finder AP_BBAD split sdf_read dsmi BBAD molenc_AP molenc_UHD shuf)
14 |   (public_names molenc_e molenc_d molenc_filter molenc_cluster
15 |                 molenc_pubchem_decoder molenc_uniq molenc_dense molenc_get
16 |                 molenc_rank molenc_ap molenc_prune molenc_merge molenc_mst
17 |                 molenc_ligbox molenc_shannon molenc_frag molenc_indexer
18 |                 molenc_pareto molenc_finder molenc_apbbad molenc_split
19 |                 molenc_sdf_read molenc_dsmi molenc_bbad molenc_AP molenc_UHD molenc_shuf)
20 |   (modules encoder decoder filter butina pubchem_decoder uniq to_dense get_mol
21 |            sybyl syb_atom mol2 sdf smi ph4 rank ap_encoder prune merge MST
22 |            palette gnuplot lig_box shannon fragmentable_mol indexer pareto
23 |            finder AP_BBAD split sdf_read dsmi BBAD molenc_AP molenc_UHD shuf)
24 |   (libraries molenc bst batteries dolog minicli parany cpm dokeysto
25 |              ocamlgraph vector3 line_oriented pyml))
26 | 
27 | ;; never installed executables
28 | (executables
29 |   (names fp_test wmh_test wmh_bench wmh_unit_test test_RS)
30 |   (modules fp_test wmh_test wmh_bench wmh_unit_test test_RS)
31 |   (libraries molenc batteries dolog minicli))
32 | 


--------------------------------------------------------------------------------
/src/encoder.ml:
--------------------------------------------------------------------------------
  1 | (* Copyright (C) 2020, Francois Berenger
  2 | 
  3 |    Yamanishi laboratory,
  4 |    Department of Bioscience and Bioinformatics,
  5 |    Faculty of Computer Science and Systems Engineering,
  6 |    Kyushu Institute of Technology,
  7 |    680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)
  8 | 
  9 | (* molecular encoder: a molecule is a list of atom environments.
 10 |    canonicalization is done by sorting (atoms in an environment
 11 |    as well as the list of environments that constitutes
 12 |    the entirely encoded molecule) *)
 13 | 
 14 | open Printf
 15 | 
 16 | module Ap_types = Molenc.Ap_types
 17 | module Atom_env = Molenc.Atom_env
 18 | module CLI = Minicli.CLI
 19 | module L = BatList
 20 | module Log = Dolog.Log
 21 | module LO = Line_oriented
 22 | module Mini_mol = Molenc.Mini_mol
 23 | module Ht = BatHashtbl
 24 | module Scale = Molenc.Scale
 25 | module StringSet = BatSet.String
 26 | module Utls = Molenc.Utls
 27 | 
 28 | let read_one counter input () =
 29 |   try
 30 |     let m = Ap_types.read_one counter input in
 31 |     if !counter mod 1000 = 0 then
 32 |       (* user feedback *)
 33 |       eprintf "read %d\r%!" !counter;
 34 |     m
 35 |   with End_of_file ->
 36 |     begin
 37 |       Log.info "read %d" !counter;
 38 |       raise Parany.End_of_input
 39 |     end
 40 | 
 41 | let process_one radii m =
 42 |   let buff = Buffer.create 1024 in
 43 |   let name = Mini_mol.get_name m in
 44 |   bprintf buff "#%s\n" name;
 45 |   let seen_envs = Ht.create 1000 in
 46 |   L.iter (fun radius ->
 47 |       let envs = Mini_mol.encode radius m in
 48 |       L.iter (fun (env, count) ->
 49 |           (* only output envs that were not already encountered
 50 |              at lower radius *)
 51 |           if not (Ht.mem seen_envs env) then
 52 |             begin
 53 |               bprintf buff "%s %d\n" (Atom_env.to_string env) count;
 54 |               Ht.add seen_envs env ()
 55 |             end
 56 |         ) envs
 57 |     ) radii;
 58 |   Buffer.contents buff
 59 | 
 60 | let write_one output str =
 61 |   fprintf output "%s" str
 62 | 
(* Program entry point: parse the command line, then encode all molecules
   of the input file into the output file, possibly in parallel.
   The encoding scale (radius or radii range) comes from -r and/or from the
   header of the dictionary given with -d; at least one of them is needed. *)
let main () =
  Log.(set_log_level INFO);
  Log.color_on ();
  let argc, args = CLI.init () in
  (* no argument at all: print usage and exit *)
  if argc = 1 then
    (eprintf "usage:\n  \
              %s -i molecules.{types|ph4} -r {radius|srad:frad} -o out.idx\n  \
              -i <filename>: where to read molecules from\n  \
              -r {<int>|<int>:<int>}: encoding radius or radii range\n  \
              -d <filename>: read feature dico from file\n  \
              -o <filename>: where to write encoded molecules\n  \
              [-n <int>]: max jobs in parallel\n"
       Sys.argv.(0);
     exit 1);
  let input_fn = CLI.get_string ["-i"] args in
  let output_fn = CLI.get_string ["-o"] args in
  (* sequential (one process) by default *)
  let nprocs = CLI.get_int_def ["-n"] args 1 in
  let scale =
    if L.mem "-r" args && L.mem "-d" args then
      (* enforce that radius ranges are equal *)
      let r_scale = Scale.of_string (CLI.get_string ["-r"] args) in
      let d_scale = Scale.of_dictionary_header (CLI.get_string ["-d"] args) in
      Utls.enforce (r_scale = d_scale)
        (sprintf "Encoder: -r and -d don't agree: r_scale=%s d_scale=%s"
           (Scale.to_string r_scale) (Scale.to_string d_scale));
      r_scale
    else
      (* only one of them was given: -r has priority, -d is the fallback *)
      match CLI.get_string_opt ["-r"] args with
      | Some r_str -> Scale.of_string r_str
      | None ->
        let dico_fn = CLI.get_string ["-d"] args in
        Scale.of_dictionary_header dico_fn in
  let radii = Scale.to_list scale in
  LO.with_infile_outfile input_fn output_fn (fun input output ->
      (* format header *)
      fprintf output "#radius=%s\n%!" (Scale.to_string scale);
      (* read -> encode -> write pipeline; ~preserve:true is passed,
         presumably to keep the output in input order (see Parany's doc) *)
      Parany.run ~preserve:true ~csize:1 nprocs
        ~demux:(read_one (ref 0) input)
        ~work:(process_one radii)
        ~mux:(write_one output)
    )
104 | 
105 | let () = main ()
106 | 


--------------------------------------------------------------------------------
/src/finder.ml:
--------------------------------------------------------------------------------
 1 | (* Copyright (C) 2021, Francois Berenger
 2 |    Tsuda laboratory,
 3 |    Graduate School of Frontier Sciences,
 4 |    The University of Tokyo,
 5 |    5-1-5 Kashiwa-no-ha,
 6 |    Kashiwa, Chiba 277-8561, Japan.
 7 | 
 8 |    Find name, Tanimoto-score and SMILES of nearest neighbor molecules.
 9 |    Output a valid SMILES file. *)
10 | 
11 | open Printf
12 | 
13 | module A = BatArray
14 | module Bstree = Molenc.Index.Bstree
15 | module CLI = Minicli.CLI
16 | module FpMol = Molenc.FpMol
17 | module Ht = Hashtbl
18 | module L = BatList
19 | module LO = Line_oriented
20 | module Log = Dolog.Log
21 | module MolIndex = Molenc.Index
22 | module S = BatString
23 | module Utls = Molenc.Utls
24 | 
25 | (* Expect a '\t' separated SMILES line: "SMILES\tname" -> (SMILES, name) *)
26 | let split_smiles_line l =
27 |   S.split l ~by:"\t"
28 | 
29 | (* bind [name] to [smi]; a [name] which is already bound is fatal *)
30 | let ht_insert_name_if_not_there name2smi name smi =
31 |   if not (Ht.mem name2smi name) then Ht.add name2smi name smi
32 |   else
33 |     (Log.fatal "Finder.ht_insert_name_if_not_there: \
34 |                 already seen molecule name: %s" name;
35 |      exit 1)
36 | 
37 | (* Load SMILES file [fn] into the [name2smi] and [name2activity] tables *)
38 | let process_smiles_file name2smi name2activity fn =
39 |   let register line_num line =
40 |     let smi, name = split_smiles_line line in
41 |     ht_insert_name_if_not_there name2smi name smi;
42 |     (if S.contains name '_' then
43 |        (* also index the molecule under its "raw" name (what precedes
44 |           the first '_'); what follows is stored as its pIC50 *)
45 |        let raw_name, pIC50 = S.split name ~by:"_" in
46 |        ht_insert_name_if_not_there name2smi raw_name smi;
47 |        ht_insert_name_if_not_there name2activity raw_name pIC50);
48 |     (* user feedback, once every 1000 lines *)
49 |     if line_num mod 1000 = 0 then
50 |       printf "Loaded molecules: %d\r%!" (Ht.length name2smi) in
51 |   LO.iteri fn register
52 | 
53 | let main () =
54 |   Log.(set_log_level INFO);
55 |   Log.color_on ();
56 |   let argc, args = CLI.init () in
57 |   if argc = 1 then begin
58 |     (* every option read below is listed here, mandatory -o included *)
59 |     eprintf "usage:\n\
60 |              %s\n  \
61 |              -i <filename.AP>: encoded molecules input file\n  \
62 |              -o <filename>: output file\n  \
63 |              --bst-fns <fn1[,fn2[,fn3...]]>: list of BST index files\n  \
64 |              --smi-fns <fn1[,fn2[,fn3...]]>: list of SMILES files\n  \
65 |              -np <int>: nprocs (default=1)\n"
66 |       Sys.argv.(0);
67 |     exit 1
68 |   end;
69 |   let input_fn = CLI.get_string ["-i"] args in
70 |   let output_fn = CLI.get_string ["-o"] args in
71 |   let bst_fns = S.split_on_char ',' (CLI.get_string ["--bst-fns"] args) in
72 |   let smi_fns = S.split_on_char ',' (CLI.get_string ["--smi-fns"] args) in
73 |   let nprocs = CLI.get_int_def ["-np"] args 1 in
74 |   CLI.finalize (); (* ------------------------------------------------------ *)
75 |   (* populate the name to SMILES and name to activity LUTs *)
76 |   let name2smi = Ht.create 1_000_000 in
77 |   let name2activity = Ht.create 1_000_000 in
78 |   L.iter (process_smiles_file name2smi name2activity) smi_fns;
79 |   let encoded_molecules_in = A.of_list (FpMol.molecules_of_file input_fn) in
80 |   LO.with_out_file output_fn (fun out ->
81 |       let fp_name_dists =
82 |         MolIndex.nearest_neighbor_names_a
83 |           nprocs bst_fns encoded_molecules_in in
84 |       A.iter (fun (_fp, name, dist) ->
85 |           let smi, pIC50 =
86 |             try (Ht.find name2smi name, Ht.find name2activity name)
87 |             with Not_found ->
88 |               let () = Log.fatal "Finder.main: not in Ht: %s" name in
89 |               exit 1 in
90 |           let tani = 1.0 -. dist in
91 |           fprintf out "%s\t%s_T=%.2f_pIC50=%s\n" smi name tani pIC50
92 |         ) fp_name_dists
93 |     )
94 |   
95 | let () = main ()
96 | 


--------------------------------------------------------------------------------
/src/formula.ml:
--------------------------------------------------------------------------------
 1 | 
 2 | module A = BatArray
 3 | module L = BatList
 4 | module Log = Dolog.Log
 5 | module SMap = BatMap.String
 6 | 
 7 | open Printf
 8 | 
 9 | type formula_item = Element of string
10 |                   | Count of int
11 | 
12 | (* Parse the list of tokens of a formula into (symbol, count) pairs.
13 |    An element without an explicit count counts once; a count which
14 |    does not directly follow an element is a programming error. *)
15 | let rec count_elements = function
16 |   | [] -> []
17 |   | Element symb :: Count c :: rest -> (symb, c) :: count_elements rest
18 |   | Element symb :: rest -> (symb, 1) :: count_elements rest
19 |   | Count _ :: _ ->
20 |     (* a count is always consumed along with the element before it *)
21 |     assert(false)
22 | 
23 | let parse_int s =
24 |   (* log the offending string, then let the conversion error propagate *)
25 |   match int_of_string s with
26 |   | n -> n
27 |   | exception exn -> Log.fatal "Formula.parse_int: cannot parse: %s" s; raise exn
28 | 
29 | (* formula -> int (Godel number of the formula); [_debug] is unused *)
30 | let encode _debug f =
31 |   let element_counts = A.make 119 0 in
32 |   (* lexer: tokenize chemical elements starting from two chars ones *)
33 |   let element_counts_0 =
34 |     Str.bounded_full_split Ptable.elements_regexp f 1024 in
35 |   let element_counts_1 =
36 |     L.map (function Str.Delim symbol -> Element symbol
37 |                   | Str.Text count -> Count (parse_int count)
38 |       ) element_counts_0 in
39 |   let element_counts_2 = count_elements element_counts_1 in
40 |   L.iter (fun (symb, count) ->
41 |       let anum = Ptable.anum_of_symbol symb in
42 |       (* counts are accumulated per element: robust even to extended
43 |          formulas like CH3CH2OH instead of the proper C2H6O *)
44 |       element_counts.(anum) <- element_counts.(anum) + count
45 |     ) element_counts_2;
46 |   (* potentially too large a number to fit an OCaml native integer *)
47 |   let big_int =
48 |     A.fold_lefti (fun acc anum count ->
49 |         if count > 0 then
50 |           let p = Z.of_int (Ptable.prime_for_anum anum) in
51 |           (* product of powers of primes; also called "Godel numbering" *)
52 |           Z.mul acc (Z.pow p count)
53 |         else
54 |           acc
55 |       ) Z.one element_counts in
56 |   Z.to_int big_int
57 | 
58 | let z_2 = Z.of_int 2
59 | let z_3 = Z.of_int 3
60 | let z_5 = Z.of_int 5
61 | let z_7 = Z.of_int 7
62 | 
63 | (* Godel numbering for radius up to three bonds:
64 |    2^e0 * 3^e1 * 5^e2 * 7^e3, converted to a native integer *)
65 | let encode_envs e0 e1 e2 e3 =
66 |   let open Z in
67 |   let low = pow z_2 e0 * pow z_3 e1 in
68 |   to_int (low * pow z_5 e2 * pow z_7 e3)
69 |   
70 | (* exponent of [prime] in [composite]; 0 for a null [composite] *)
71 | let find_exponent composite prime =
72 |   let rec loop acc x =
73 |     (* x <> 0: 0 is divisible by anything and would never end the loop *)
74 |     if x <> 0 && x mod prime = 0 then loop (acc + 1) (x / prime)
75 |     else acc in
76 |   loop 0 composite
77 | 
78 | (* int -> formula: each element whose prime divides [code] is printed
79 |    as its symbol followed by the exponent of that prime; symbols come
80 |    out in the key order of the string map *)
81 | let decode (code: int): string =
82 |   let symb2count =
83 |     A.fold_left (fun acc prime ->
84 |         match find_exponent code prime with
85 |         | 0 -> acc (* [prime] does not divide [code] *)
86 |         | count -> SMap.add (Ptable.symbol_for_prime prime) count acc
87 |       ) SMap.empty Ptable.all_primes in
88 |   (* get back formula as a string *)
89 |   let buff = Buffer.create 128 in
90 |   SMap.iter (fun symb count ->
91 |       bprintf buff "%s%d" symb count
92 |     ) symb2count;
93 |   Buffer.contents buff
94 | 


--------------------------------------------------------------------------------
/src/fpMol.ml:
--------------------------------------------------------------------------------
 1 | (* Copyright (C) 2020, Francois Berenger
 2 | 
 3 |    Yamanishi laboratory,
 4 |    Department of Bioscience and Bioinformatics,
 5 |    Faculty of Computer Science and Systems Engineering,
 6 |    Kyushu Institute of Technology,
 7 |    680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)
 8 | 
 9 | (* A fingerprint-encoded molecule *)
10 | 
11 | module A = Array
12 | module Fp = Fingerprint
13 | module Ht = Hashtbl
14 | module L = MyList
15 | module LO = Line_oriented
16 | 
17 | open Printf
18 | 
19 | type t = { name: string;
20 |            index: int; (* position in input file *)
21 |            value: float;
22 |            fp: Fp.t }
23 | (* build a molecule; [bitstring] is turned into its fingerprint by Fp.of_string *)
24 | let create name index value bitstring =
25 |   { name; index; value; fp = Fp.of_string bitstring }
26 | 
27 | (* read one molecule from an FP file; line format: name,value,bitstring *)
28 | let read_one_mol line =
29 |   let fail msg =
30 |     failwith ("FpMol.read_one_mol: fmt: %s@,%f,%s err: " ^ msg ^
31 |               " line: " ^ line) in
32 |   try Scanf.sscanf line "%s@,%f,%s" (fun name value bits -> (name, value, bits))
33 |   with Scanf.Scan_failure msg -> fail msg
34 |      (* a truncated line must not be mistaken for the end of the input *)
35 |      | End_of_file -> fail "unexpected end of line"
36 | 
37 | (* line of an FP file -> molecule; [index] is stored in the molecule *)
38 | let parse_one index line =
39 |   let (name, value, bits) = read_one_mol line in create name index value bits
40 | 
41 | (* molecule -> "name,value,[fingerprint]" line *)
42 | let to_string (m: t): string =
43 |   let fp_str = Fp.to_string m.fp in
44 |   sprintf "%s,%g,[%s]" m.name m.value fp_str
45 | 
46 | (* write the line for [m], newline terminated, to [out] *)
47 | let to_out out m =
48 |   fprintf out "%s\n" (to_string m)
49 | 
50 | (* apply [parse_one] over file [fn], through LO.mapi *)
51 | let molecules_of_file fn =
52 |   LO.mapi fn parse_one
53 | 
54 | (* Fp.distance between the fingerprints of two molecules *)
55 | let dist m1 m2 = Fp.distance m1.fp m2.fp
56 | 
57 | (* Fp.tanimoto between the fingerprints of two molecules *)
58 | let tani m1 m2 = Fp.tanimoto m1.fp m2.fp
59 | 
60 | (* field accessor: name *)
61 | let get_name x = x.name
62 | 
63 | (* field accessor: value *)
64 | let get_value x = x.value
65 | 
66 | (* field accessor: index (position in input file) *)
67 | let get_index x = x.index
68 | 
69 | (* field accessor: fingerprint *)
70 | let get_fp x = x.fp
71 | 
72 | (* Fp.nb_features of the fingerprint *)
73 | let nb_features x = Fp.nb_features x.fp
74 | 
75 | (* does string [line] start with "active"? *)
76 | let mol_is_active line = BatString.starts_with line "active"
77 | 
78 | (* a molecule is active if its name starts with "active" *)
79 | let is_active x = mol_is_active x.name
80 | 
81 | (* same molecule, with [to_drop] passed to Fp.drop_features on its fingerprint *)
82 | let drop_features to_drop x = { x with fp = Fp.drop_features to_drop x.fp }
83 | 


--------------------------------------------------------------------------------
/src/fp_test.ml:
--------------------------------------------------------------------------------
 1 | (* Copyright (C) 2020, Francois Berenger
 2 | 
 3 |    Yamanishi laboratory,
 4 |    Department of Bioscience and Bioinformatics,
 5 |    Faculty of Computer Science and Systems Engineering,
 6 |    Kyushu Institute of Technology,
 7 |    680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)
 8 | 
 9 | (* regression tests for Fingerprint.tanimoto *)
10 | 
11 | module Fp = Molenc.Fingerprint
12 | module Log = Dolog.Log
13 | 
14 | let () =
15 |   Log.color_on ();
16 |   Log.set_log_level Log.INFO;
17 |   let tani = Fp.tanimoto in
18 |   let fp1 = Fp.of_string "[0:1;1:2;5:1;11:4]" in
19 |   let fp2 = Fp.of_string "[1:1;3:2;11:3]" in
20 |   let fp3 = Fp.of_string "[]" in (* empty fingerprint *)
21 |   let fp4 = Fp.of_string "[1:2;5:1;11:2]" in
22 |   assert(tani fp1 fp1 = 1.0); (* a non empty fingerprint against itself *)
23 |   assert(tani fp2 fp2 = 1.0);
24 |   assert(tani fp3 fp3 = 0.0); (* the empty fingerprint scores 0.0, even against itself *)
25 |   assert(tani fp4 fp4 = 1.0);
26 |   assert(tani fp1 fp2 = 4.0 /. 10.0); (* 4 = sum of min. counts, 10 = sum of max. counts *)
27 |   assert(tani fp1 fp3 = 0.0);
28 |   assert(tani fp1 fp4 = 5.0 /. 8.0); (* 5 = sum of min. counts, 8 = sum of max. counts *)
29 |   Log.info "OK"
30 | 


--------------------------------------------------------------------------------
/src/gen_bindings.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e # stop at the first failing command: do not format a broken rdkit.ml
 3 | # pyml_bindgen rdkit_wrapper_specs.txt rdkit_wrapper Rdkit \
 4 | #              --caml-module=Rdkit --of-pyo-ret-type=no_check > rdkit.ml
 5 | 
 6 | pyml_bindgen rdkit_wrapper_specs.txt rdkit_wrapper Rdkit \
 7 |              --embed-python-source rdkit_wrapper.py \
 8 |              --caml-module=Rdkit --of-pyo-ret-type=no_check > rdkit.ml
 9 | 
10 | # format the generated code
11 | ocamlformat --inplace --enable-outside-detected-project rdkit.ml
12 | 


--------------------------------------------------------------------------------
/src/gnuplot.ml:
--------------------------------------------------------------------------------
 1 | 
 2 | module Fn = Filename
 3 | module Log = Dolog.Log
 4 | module Utls = Molenc.Utls
 5 | 
 6 | open Printf
 7 | 
 8 | (* WARNING: leaks tmp files (the generated gnuplot script and its log).
 9 |    we require the means because the Gaussian fitting may not converge *)
10 | let plot_histograms histo_data_fn mean1 mean2 =
11 |   let gnuplot_script_fn = Fn.temp_file "gnuplot_" ".gpl" in
12 |   let gnuplot_log_fn = Fn.temp_file "gnuplot_" ".log" in
13 |   Utls.string_list_to_file gnuplot_script_fn
14 |     ["set xlabel 'score'";
15 |      "set ylabel 'frequency'";
16 |      "gauss1(x) = a1/(sqrt(2*pi)*sigma1)*exp(-(x-mean1)**2/(2*sigma1**2))";
17 |      "gauss2(x) = a2/(sqrt(2*pi)*sigma2)*exp(-(x-mean2)**2/(2*sigma2**2))";
18 |      sprintf "mean1 = %f" mean1; (* the given means are set before the fits *)
19 |      sprintf "mean2 = %f" mean2;
20 |      sprintf "fit gauss1(x) '%s' u 1:2 via a1,sigma1,mean1" histo_data_fn;
21 |      sprintf "fit gauss2(x) '%s' u 1:3 via a2,sigma2,mean2" histo_data_fn;
22 |      sprintf "plot '%s' u 1:2 w l t 'smaller sample', \\" histo_data_fn;
23 |      "''        u 1:3 w l t 'bigger sample', \\";
24 |      "gauss1(x) t 'smaller fit', \\";
25 |      "gauss2(x) t 'bigger fit'"];
26 |   Log.info "gnuplot script: %s log: %s" gnuplot_script_fn gnuplot_log_fn;
27 |   Utls.run_command (sprintf "(gnuplot -persist %s 2>&1) > %s"
28 |                       gnuplot_script_fn gnuplot_log_fn)
29 | 


--------------------------------------------------------------------------------
/src/gram.ml:
--------------------------------------------------------------------------------
 1 | open Printf
 2 | 
 3 | module A = BatArray
 4 | module L = BatList
 5 | 
 6 | (* Parallel Gram matrix initialization *)
 7 | let emit_one (i: int ref) (n: int) ((): unit): int =
 8 |   (* demux: each call hands out the current value of [i], then increments it;
 9 |      Parany.End_of_input is raised once [i] has reached [n] *)
10 |   let curr = !i in
11 |   if curr < n then (incr i; curr)
12 |   else raise Parany.End_of_input
13 | 
14 | let process_one (dist: 'a -> 'a -> float) (samples: 'a array) (n: int) (i: int):
15 |   (int * float array) =
16 |   (* work: distances from sample [i] to samples [i], [i+1], ..., [n-1];
17 |      i.e. row [i] of the matrix, from its diagonal to its right end *)
18 |   let si = samples.(i) in
19 |   let from_diagonal k = dist si samples.(i + k) in
20 |   let row = A.init (n - i) from_diagonal in
21 |   (i, row)
22 | 
23 | let gather_one (res: float array array) ((i, xs): (int * float array)): unit =
24 |   for k = 0 to A.length xs - 1 do
25 |     let j = i + k in (* xs.(k) goes to (i, j) and, by symmetry, to (j, i) *)
26 |     res.(i).(j) <- xs.(k);
27 |     res.(j).(i) <- xs.(k)
28 |   done
29 | 
30 | (* Fill matrix [res] (at least n x n, n = number of samples) with the
31 |    pairwise distances of [samples]: only cells (i, j) with j >= i are
32 |    computed by [dist], cells (j, i) are copies. [csize] is only used
33 |    in parallel, i.e. when [ncores] is not 1. *)
34 | let initialize_matrix dist ncores csize samples res =
35 |   let n = A.length samples in
36 |   assert(n > 0);
37 |   assert(ncores >= 1);
38 |   if ncores = 1 then (* sequential *)
39 |     begin
40 |       (* WARNING: the diagonal is computed too *)
41 |       for i = 0 to n - 1 do
42 |         (* one row at a time, with the helpers of the parallel case *)
43 |         gather_one res (process_one dist samples n i);
44 |         printf "done: %d/%d\r%!" (i + 1) n
45 |       done;
46 |       printf "\n%!"
47 |     end
48 |   else (* parallel *)
49 |     Parany.run ~csize ncores
50 |       ~demux:(emit_one (ref 0) n)
51 |       ~work:(process_one dist samples n)
52 |       ~mux:(gather_one res)
53 | 
54 | (* Partial display: print the first three and last three rows and columns
55 |    only; each skipped run of rows or columns is shown once, as "...".
56 |    An empty matrix prints nothing. *)
57 | let print_corners mat =
58 |   let m = Array.length mat in
59 |   (* the width is taken from the first row, when there is one *)
60 |   let n = if m = 0 then 0 else Array.length mat.(0) in
61 |   let idots = ref false in
62 |   for i = 0 to m - 1 do
63 |     if i < 3 || i > m - 4 then begin
64 |       let jdots = ref false in
65 |       for j = 0 to n - 1 do
66 |         if j < 3 || j > n - 4 then
67 |           printf (if j <> 0 then "\t%6.2f" else "%6.2f") mat.(i).(j)
68 |         else if not !jdots then
69 |           (printf "\t..."; jdots := true)
70 |       done;
71 |       printf "\n"
72 |     end else if not !idots then
73 |       (printf "\t\t\t...\n"; idots := true)
74 |   done;
75 |   flush stdout
76 | 


--------------------------------------------------------------------------------
/src/index.ml:
--------------------------------------------------------------------------------
  1 | 
  2 | module A = BatArray
  3 | module L = BatList
  4 | module Log = Dolog.Log
  5 | 
  6 | module Bstree = struct
  7 |   (* bisector tree over FpMol.t values *)
  8 |   include Bst.Bisec_tree.Make (FpMol)
  9 |   (* NOTE(review): presumably bucket size 1 + Two_bands heuristic; confirm *)
 10 |   let of_molecules l =
 11 |     create 1 Two_bands (A.of_list l)
 12 | end
 13 | 
 14 | (* For each molecule, find its nearest neighbor name and distance,
 15 |    over all Bsts; parallelized over molecules. *)
 16 | let nearest_neighbor_names ncores bst_fns mols =
 17 |   (* load one bst from file [fn] *)
 18 |   let load_bst fn: Bstree.t =
 19 |     Log.info "loading %s..." fn;
 20 |     Utls.restore fn in
 21 |   match bst_fns with
 22 |   | [] -> []
 23 |   | first_fn :: other_fns ->
 24 |     (* the first Bst initializes the annotations *)
 25 |     let init =
 26 |       let bst = load_bst first_fn in
 27 |       Parany.Parmap.parmap ncores (fun mol ->
 28 |           let nn, dist = Bstree.nearest_neighbor mol bst in
 29 |           (mol, FpMol.get_name nn, dist)
 30 |         ) mols in
 31 |     (* fold on the other BSTs: an annotation is only replaced
 32 |        by a strictly nearer neighbor *)
 33 |     L.fold_left (fun annotated bst_fn ->
 34 |         let bst = load_bst bst_fn in
 35 |         Parany.Parmap.parmap ncores (fun ((mol, _name, dist) as prev) ->
 36 |             if dist = 0.0 then
 37 |               prev (* already nearest *)
 38 |             else
 39 |               let curr_nn, curr_dist = Bstree.nearest_neighbor mol bst in
 40 |               if curr_dist < dist then
 41 |                 (mol, FpMol.get_name curr_nn, curr_dist)
 42 |               else
 43 |                 prev
 44 |           ) annotated
 45 |       ) init other_fns
 46 | 
 47 | (* name of, and distance to, the nearest neighbor of [mol] in [bst] *)
 48 | let bst_nearest_name_dist bst mol =
 49 |   let nn, dist = Bstree.nearest_neighbor mol bst in (FpMol.get_name nn, dist)
 50 | 
 51 | (* For each molecule, find its nearest neighbor name and distance,
 52 |    over all Bsts; parallelized over bisector trees (indexed chunks) *)
 53 | let nearest_neighbor_names_a
 54 |     (ncores: int) (bst_fns: string list) (mols_a: FpMol.t array)
 55 |   : (FpMol.t * string * float) array =
 56 |   match bst_fns with
 57 |   | [] -> [||]
 58 |   | _ when A.length mols_a = 0 -> [||] (* no query: mols_a.(0) would raise *)
 59 |   | fn :: fns' ->
 60 |     (* init accumulator, for the muxer process.
 61 |        This is the only calculation parallelized over molecules.
 62 |        Remaining calculations will be parallelized over Bsts. *)
 63 |     let annot_mols =
 64 |       Log.info "loading %s..." fn;
 65 |       let (bst: Bstree.t) = Utls.restore fn in
 66 |       Parany.Parmap.array_parmap ncores
 67 |         (fun mol ->
 68 |            let name, dist = bst_nearest_name_dist bst mol in
 69 |            (mol, name, dist))
 70 |         (mols_a.(0), "", 1.0)
 71 |         mols_a in
 72 |     let fns = A.of_list fns' in
 73 |     let () =
 74 |       Parany.run ncores
 75 |         ~demux:(
 76 |           (* [fns] only holds the Bsts which remain to be searched (the
 77 |              first one was consumed above): indexes start at 0, else
 78 |              the second Bst file would never be searched *)
 79 |           let i = ref 0 in
 80 |           let n = A.length fns in
 81 |           fun () ->
 82 |             if !i >= n then raise Parany.End_of_input
 83 |             else (let res = !i in incr i; res)
 84 |         )
 85 |         ~work:(fun i ->
 86 |             Log.info "loading %s..." fns.(i);
 87 |             let (bst: Bstree.t) = Utls.restore fns.(i) in
 88 |             A.map (bst_nearest_name_dist bst) mols_a
 89 |           )
 90 |         ~mux:(
 91 |           let m = A.length mols_a in
 92 |           fun nearest_name_dists ->
 93 |             assert(A.length nearest_name_dists = m);
 94 |             for i = 0 to m - 1 do
 95 |               let mol, _prev_nearest_name, prev_dist =
 96 |                 A.unsafe_get annot_mols i in
 97 |               if prev_dist = 0.0 then
 98 |                 (* already nearest *) ()
 99 |               else
100 |                 let curr_nearest_name, curr_dist =
101 |                   A.unsafe_get nearest_name_dists i in
102 |                 if curr_dist < prev_dist then (* update acc *)
103 |                   A.unsafe_set annot_mols i (mol, curr_nearest_name, curr_dist)
104 |             done
105 |         ) in
106 |     annot_mols
107 | 


--------------------------------------------------------------------------------
/src/indexer.ml:
--------------------------------------------------------------------------------
 1 | (* Copyright (C) 2021, Francois Berenger
 2 |    Tsuda laboratory, Tokyo university, Japan.
 3 | 
 4 |    Indexing of fingerprint encoded molecules into bisector trees
 5 |    stored on disk. *)
 6 | 
 7 | open Printf
 8 | 
 9 | module A = BatArray
10 | module Bstree = Molenc.Index.Bstree
11 | module CLI = Minicli.CLI
12 | module FpMol = Molenc.FpMol
13 | module Ht = Hashtbl
14 | module L = BatList
15 | module LO = Line_oriented
16 | module Log = Dolog.Log
17 | module Utls = Molenc.Utls
18 | 
19 | let verbose = ref false
20 | 
21 | (* Parany demux: read up to [csize] lines from [input]; return them, most
22 |    recent first and tagged with their global rank, along with a chunk
23 |    index. Parany.End_of_input is raised once nothing is left to read. *)
24 | let read_one_chunk input_fn in_mol_count chunk_index csize input () =
25 |   let rec loop acc todo =
26 |     if todo <= 0 then (acc, false)
27 |     else
28 |       match input_line input with
29 |       | exception End_of_file -> (acc, true)
30 |       | line ->
31 |         let mol = (!in_mol_count, line) in
32 |         incr in_mol_count;
33 |         loop (mol :: acc) (todo - 1) in
34 |   match loop [] csize with
35 |   | ([], true) ->
36 |     Log.info "read %d from %s" !in_mol_count input_fn;
37 |     raise Parany.End_of_input
38 |   | (mols, true) -> (!chunk_index, mols) (* last chunk, maybe not full *)
39 |   | (mols, false) -> let idx = !chunk_index in incr chunk_index; (idx, mols)
40 | 
41 | let index_one_chunk input_fn (i, chunk') =
42 |   let chunk = L.rev_map (fun (i, line) -> FpMol.parse_one i line) chunk' in
43 |   assert(i <= 9999);
44 |   let output_fn = sprintf "%s.%04d.bst" input_fn i in
45 |   Log.info "creating %s" output_fn;
46 |   LO.save output_fn (Bstree.of_molecules chunk);
47 |   (* gzip the index; quoted, since [input_fn] may hold spaces or shell chars *)
48 |   Utls.run_command (sprintf "gzip -f %s" (Filename.quote output_fn))
49 | 
50 | let main () =
51 |   Log.(set_log_level INFO);
52 |   Log.color_on ();
53 |   let argc, args = CLI.init () in
54 |   let default_block_size = 50_000 in
55 |   if argc = 1 then begin
56 |     eprintf "usage:\n\
57 |              %s\n  \
58 |              -i <filename>: molecules input file\n  \
59 |              -ifs <filename>: file containing a list of files\n  \
60 |              -np <int>: nprocs (default=1)\n  \
61 |              -c <int>: chunk size (molecules/bloc; default=%d)\n  \
62 |              [-v]: verbose mode\n"
63 |       Sys.argv.(0) default_block_size;
64 |     exit 1
65 |   end;
66 |   (* exactly one of -i (one file) or -ifs (a file listing files) *)
67 |   let input_fns =
68 |     let several = CLI.get_string_opt ["-ifs"] args in
69 |     let single = CLI.get_string_opt ["-i"] args in
70 |     match (single, several) with
71 |     | (Some fn, None) -> [fn]
72 |     | (None, Some fn) -> LO.lines_of_file fn
73 |     | _ -> failwith "provide either -i or -ifs" in
74 |   let nprocs = CLI.get_int_def ["-np"] args 1 in
75 |   let csize = CLI.get_int_def ["-c"] args default_block_size in
76 |   if CLI.get_set_bool ["-v"] args then verbose := true;
77 |   CLI.finalize (); (* ------------------------------------------------------ *)
78 |   let chunk_count = ref 0 in
79 |   let in_mol_count = ref 0 in
80 |   (* both counters are shared by all the input files *)
81 |   let index_file input_fn =
82 |     Log.info "%d molecules in %s" (LO.length input_fn) input_fn;
83 |     LO.with_in_file input_fn (fun input ->
84 |         Parany.run nprocs
85 |           ~demux:(read_one_chunk input_fn in_mol_count chunk_count csize input)
86 |           ~work:(index_one_chunk input_fn)
87 |           ~mux:(fun () -> ())
88 |       ) in
89 |   L.iter index_file input_fns
90 | 
91 | let () = main ()
92 | 


--------------------------------------------------------------------------------
/src/intSet.ml:
--------------------------------------------------------------------------------
 1 | (* Copyright (C) 2020, Francois Berenger
 2 | 
 3 |    Yamanishi laboratory,
 4 |    Department of Bioscience and Bioinformatics,
 5 |    Faculty of Computer Science and Systems Engineering,
 6 |    Kyushu Institute of Technology,
 7 |    680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)
 8 | 
 9 | open Printf
10 | 
11 | include BatSet.Int
12 | 
13 | (* "[e1;e2;...]": elements in the iteration order of the set; "[]" if empty *)
14 | let to_string s =
15 |   let buff = Buffer.create 11 in
16 |   Buffer.add_char buff '[';
17 |   let first = ref true in
18 |   iter (fun x ->
19 |       (if !first then first := false else Buffer.add_char buff ';');
20 |       Buffer.add_string buff (sprintf "%d" x)
21 |     ) s;
22 |   Buffer.add_char buff ']';
23 |   Buffer.contents buff
24 | 


--------------------------------------------------------------------------------
/src/mini_mol.ml:
--------------------------------------------------------------------------------
 1 | (* Copyright (C) 2020, Francois Berenger
 2 | 
 3 |    Yamanishi laboratory,
 4 |    Department of Bioscience and Bioinformatics,
 5 |    Faculty of Computer Science and Systems Engineering,
 6 |    Kyushu Institute of Technology,
 7 |    680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)
 8 | 
 9 | (* mini molecule module *)
10 | 
11 | module A = BatArray
12 | module IntSet = BatSet.Int
13 | module Ht = BatHashtbl
14 | module L = BatList
15 | module StringMap = BatMap.String
16 | 
17 | type t = { name: string;
18 |            graph: Node.t array;
19 |            diameter: int;
20 |            matrix: int array array }
21 | 
22 | let get_name m = m.name
23 | 
24 | let get_graph m = m.graph
25 | 
26 | (* number of nodes in the graph of the molecule *)
27 | let nb_atoms m = A.length m.graph
28 | 
29 | (* NOTE: the arguments are stored as is; nothing is checked *)
30 | let create name graph diameter matrix = { name; graph; diameter; matrix }
31 | 
32 | (* type of node [i]; WARNING: [i] is not bounds-checked *)
33 | let get_typ (m: t) (i: int) = Node.get_typ (A.unsafe_get m.graph i)
34 | 
35 | (* Node.get_succs of node [i]; WARNING: [i] is not bounds-checked *)
36 | let get_succs (m: t) (i: int) = Node.get_succs (A.unsafe_get m.graph i)
37 | 
38 | (* list (sorted-uniq-counted) atom types of all atoms
39 |    at given distance from center atom *)
40 | let types_at_distance (center: int) (curr_height: int) (mol: t) =
41 |   (* line [center] of the matrix: one distance per atom of the molecule *)
42 |   let dists = mol.matrix.(center) in
43 |   let typs = ref [] in
44 |   A.iteri (fun i d ->
45 |       (* keep the type of each atom found in the wanted layer *)
46 |       if d = curr_height then
47 |         typs := (get_typ mol i) :: !typs
48 |     ) dists;
49 |   (* layer at 'curr_height' *)
50 |   (curr_height, Utls.list_uniq_count !typs)
51 | 
52 | (* Atom environments of [mol], passed through Utls.list_uniq_count.
53 |    The env. of an atom is the list of its non empty layers, for
54 |    heights 0 to min(max_height, diameter of the molecule); cf.
55 |    [types_at_distance] for the content of one layer. *)
56 | let encode (max_height: int) (mol: t): (Atom_env.t * int) list =
57 |   (* we cannot go deeper than 'maxi' on this molecule *)
58 |   let maxi = min max_height mol.diameter in
59 |   let encode_atom (n_i: int): Atom_env.t =
60 |     L.range 0 `To maxi
61 |     |> L.map (fun height -> types_at_distance n_i height mol)
62 |     (* non empty layers *)
63 |     |> L.filter (fun (_h, typs) -> typs <> []) in
64 |   let nb_atoms = A.length mol.graph in
65 |   (* canonicalize the encoding of the molecule by sorting its atom envs
66 |      and counting duplicates *)
67 |   let atom_envs =
68 |     L.range 0 `To (nb_atoms - 1)
69 |     |> L.map encode_atom in
70 |   Utls.list_uniq_count atom_envs
71 | 
72 | (* encode the molecule to counted atom pairs; the result is unordered *)
73 | let atom_pairs (mol: t): (Atom_pair.t * int) list =
74 |   let n = nb_atoms mol in
75 |   assert(n >= 1); (* at least one heavy atom *)
76 |   let max_nb_pairs = max 1 (n * (n - 1) / 2) in (* only a size hint *)
77 |   let pair2count = Ht.create max_nb_pairs in
78 |   for i = 0 to n - 1 do
79 |     let type_i = get_typ mol i in
80 |     for j = i to n - 1 do (* j starts at i: an atom is also paired with itself *)
81 |       let type_j = get_typ mol j in
82 |       let dist = A.unsafe_get (A.unsafe_get mol.matrix i) j in
83 |       let pair = Atom_pair.create type_i type_j dist in
84 |       let prev_count = Ht.find_default pair2count pair 0 in
85 |       Ht.replace pair2count pair (prev_count + 1)
86 |     done;
87 |   done;
88 |   (* canonicalization will be done later; when the features (string) are
89 |    * converted to feature ids (int) *)
90 |   Ht.bindings pair2count
91 | 


--------------------------------------------------------------------------------
/src/mol2.ml:
--------------------------------------------------------------------------------
 1 | (* Copyright (C) 2020, Francois Berenger
 2 | 
 3 |    Yamanishi laboratory,
 4 |    Department of Bioscience and Bioinformatics,
 5 |    Faculty of Computer Science and Systems Engineering,
 6 |    Kyushu Institute of Technology,
 7 |    680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)
 8 | 
 9 | module At = Syb_atom
10 | module IntSet = BatSet.Int
11 | module S = BatString
12 | 
13 | let molecule_header = "@<TRIPOS>MOLECULE"
14 | let atoms_header = "@<TRIPOS>ATOM"
15 | let bonds_header = "@<TRIPOS>BOND"
16 | 
17 | (* parse line just after the molecule name line: five integers expected *)
18 | let read_header l =
19 |   let keep_counts nb_atoms nb_bonds _ _ _ = (nb_atoms, nb_bonds) in
20 |   try Scanf.sscanf l " %d %d %d %d %d " keep_counts
21 |   with _ -> failwith ("read_header: could not parse: " ^ l)
22 | 
23 | exception Read_one
24 | 
25 | let buff = Buffer.create 10240
26 | 
27 | (* read one molecule from a MOL2 file; stateful: uses the global [buff] *)
28 | let read_one_raw (input: in_channel): string =
29 |   try
30 |     while true do
31 |       let line = input_line input in
32 |       if line = molecule_header && Buffer.length buff <> 0 then
33 |         (* just finished reading one *)
34 |         raise Read_one
35 |       else
36 |         (Buffer.add_string buff line;
37 |          Buffer.add_char buff '\n')
38 |     done;
39 |     assert(false) (* the loop above can only be left by an exception *)
40 |   with
41 |   | Read_one ->
42 |     let res = Buffer.contents buff in
43 |     Buffer.reset buff;
44 |     Buffer.add_string buff molecule_header; (* put in buffer next mol's header *)
45 |     Buffer.add_char buff '\n';
46 |     res
47 |   | End_of_file ->
48 |     if Buffer.length buff = 0 then (* nothing pending: propagate *)
49 |       raise End_of_file
50 |     else (* return what was accumulated: the last molecule *)
51 |       let res = Buffer.contents buff in
52 |       Buffer.reset buff;
53 |       res
54 | 
55 | (* name of a molecule: the second line of its MOL2 text *)
56 | let get_name mol_lines =
57 |   let after_header = snd (S.split mol_lines ~by:"\n") in
58 |   fst (S.split after_header ~by:"\n")
59 | 


--------------------------------------------------------------------------------
/src/molenc_fcodec.ml:
--------------------------------------------------------------------------------
 1 | (* Copyright (C) 2024, Francois Berenger
 2 |  * Tsuda laboratory, Tokyo University,
 3 |  * 5-1-5 Kashiwa-no-ha, Kashiwa-shi, Chiba-ken, 277-8561, Japan.
 4 |  *
 5 |  * chemical formula integer encoder/decoder *)
 6 | 
 7 | open Printf
 8 | 
 9 | module A = BatArray
10 | module CLI = Minicli.CLI
11 | module Formula = Molenc.Formula
12 | module SMap = BatMap.String
13 | module L = BatList
14 | module LO = Line_oriented
15 | module Log = Dolog.Log
16 | module Rdkit = Molenc.Rdkit.Rdkit
17 | module S = BatString
18 | 
19 | (* because the Rdkit module uses Pyml *)
20 | let () = Py.initialize ~version:3 ()
21 | 
(* Build a chemical formula string from the element symbol of each atom:
   every distinct symbol is followed by its number of occurrences
   (e.g. [|"C";"H";"H"|] gives "C1H2"); symbols come out in the iteration
   order of the string map. *)
let formula_of_elements (elts: string array): string =
  let count_one counts symbol =
    let seen = SMap.find_default 0 symbol counts in
    SMap.add symbol (seen + 1) counts in
  let counts = A.fold count_one SMap.empty elts in
  let out = Buffer.create 128 in
  SMap.iter (fun symbol n -> bprintf out "%s%d" symbol n) counts;
  Buffer.contents out
31 | 
(* outcome of checking one input line *)
type work_result = OK of string (* encode then decode gave back the formula *)
                 | Overflow of string (* Z.Overflow was raised meanwhile *)

(* For each line of the input SMILES file (format: SMILES, tab, name):
   compute the chemical formula with explicit hydrogens, encode it to an
   integer and check that decoding gives the same formula back.
   Lines which pass go to stdout; lines which overflow go to stderr. *)
let main () =
  Log.(set_log_level INFO);
  Log.color_on ();
  Log.(set_prefix_builder short_prefix_builder);
  let argc, args = CLI.init () in
  if argc = 1 then
    begin
      eprintf "usage:\n  \
               %s -i in.smi\n  \
               -i <input.smi>: input molecules\n  \
               [-np <int>]: parallelize on NCORES (default=1)\n  \
               [-c <int>]: chunk size (default=50)\n"
        Sys.argv.(0);
      exit 1
    end;
  let smiles_fn = CLI.get_string ["-i"] args in
  let nprocs = CLI.get_int_def ["-np"] args 1 in
  let csize = CLI.get_int_def ["-c"] args 50 in
  CLI.finalize (); (* ------------------------------------------------------ *)
  (* producer: one line of the input file per call *)
  let next_line input () =
    try input_line input
    with End_of_file -> raise Parany.End_of_input in
  (* worker: formula round trip for a single molecule *)
  let check_line line =
    let smi, _name = S.split ~by:"\t" line in
    let mol = Rdkit.__init__ ~smi () in
    let mol_H = Rdkit.add_hydrogens mol () in
    let formula = formula_of_elements (Rdkit.get_elements mol_H ()) in
    try
      let decoded = Formula.decode (Formula.encode false formula) in
      assert (formula = decoded);
      OK line
    with Z.Overflow ->
      Overflow line in
  (* consumer: valid lines on stdout, overflowing ones on stderr *)
  let report = function
    | OK line -> printf "%s\n%!" line
    | Overflow line -> eprintf "%s\n%!" line in
  LO.with_in_file smiles_fn (fun input ->
      Parany.run nprocs ~csize
        ~demux:(next_line input)
        ~work:check_line
        ~mux:report
    )

let () = main ()
88 | 


--------------------------------------------------------------------------------
/src/myList.ml:
--------------------------------------------------------------------------------
  1 | (* Copyright (C) 2020, Francois Berenger
  2 | 
  3 |    Yamanishi laboratory,
  4 |    Department of Bioscience and Bioinformatics,
  5 |    Faculty of Computer Science and Systems Engineering,
  6 |    Kyushu Institute of Technology,
  7 |    680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)
  8 | 
  9 | include BatList
 10 | 
 11 | let to_string to_str l =
 12 |   let buff = Buffer.create 80 in
 13 |   Buffer.add_char buff '[';
 14 |   iteri (fun i x ->
 15 |       if i > 0 then Buffer.add_char buff ';';
 16 |       Buffer.add_string buff (to_str x);
 17 |     ) l;
 18 |   Buffer.add_char buff ']';
 19 |   Buffer.contents buff
 20 | 
 21 | let of_string of_str s =
 22 |   let s' = BatString.chop ~l:1 ~r:1 s in
 23 |   if s' = "" then
 24 |     (* the empty list case was not handled in the past *)
 25 |     []
 26 |   else
 27 |     begin
 28 |       if String.contains s' ']' then
 29 |         failwith ("MyList.of_string: sub lists inside: " ^ s);
 30 |       map of_str (BatString.split_on_string s' ~by:";")
 31 |     end
 32 | 
 33 | (* only map 'f' on elements satisfying 'p' *)
 34 | let filter_map p f l =
 35 |   let res =
 36 |     fold_left (fun acc x ->
 37 |         if p x then (f x) :: acc
 38 |         else acc
 39 |       ) [] l in
 40 |   rev res
 41 | 
 42 | (* split a list into n parts (the last part might have
 43 |    a different number of elements) *)
 44 | let nparts n l =
 45 |   let len = length l in
 46 |   let res = ref [] in
 47 |   let curr = ref l in
 48 |   let m = BatFloat.round_to_int (float len /. float n) in
 49 |   for _ = 1 to n - 1 do
 50 |     let xs, ys = takedrop m !curr in
 51 |     curr := ys;
 52 |     res := xs :: !res
 53 |   done;
 54 |   rev (!curr :: !res)
 55 | 
 56 | (* create folds of cross validation; each fold consists in (train, test) *)
 57 | let cv_folds n l =
 58 |   let test_sets = nparts n l in
 59 |   let rec loop acc prev curr =
 60 |     match curr with
 61 |     | [] -> acc
 62 |     | x :: xs ->
 63 |       let before_after = flatten (rev_append prev xs) in
 64 |       let prev' = x :: prev in
 65 |       let train_test = (before_after, x) in
 66 |       let acc' = train_test :: acc in
 67 |       loop acc' prev' xs in
 68 |   loop [] [] test_sets
 69 | 
 70 | (* List.combine for 4 lists *)
 71 | let combine4 l1 l2 l3 l4 =
 72 |   let rec loop acc = function
 73 |     | ([], [], [], []) -> rev acc
 74 |     | (w :: ws, x :: xs, y :: ys, z :: zs) ->
 75 |       loop ((w, x, y, z) :: acc) (ws, xs, ys, zs)
 76 |     | _ -> raise (Invalid_argument "MyList.combine4: list lengths differ")
 77 |   in
 78 |   loop [] (l1, l2, l3, l4)
 79 | 
 80 | let really_take n l =
 81 |   let res = take n l in
 82 |   assert(length res = n);
 83 |   res
 84 | 
 85 | (* non reproducible randomization of a list *)
 86 | let random_shuffle l =
 87 |   let rng = BatRandom.State.make_self_init () in
 88 |   shuffle ~state:rng l
 89 | 
 90 | let rev_combine l1 l2 =
 91 |   let rec loop acc l r =
 92 |     match (l, r) with
 93 |     | ([], []) -> acc
 94 |     | (x :: xs, y :: ys) -> loop ((x, y) :: acc) xs ys
 95 |     | _ -> raise (Invalid_argument "MyList.rev_combine: list lengths differ")
 96 |   in
 97 |   loop [] l1 l2
 98 | 
 99 | (* filter using bit-mask [m] *)
100 | let filter_mask m l =
101 |   let rec loop acc = function
102 |     | [] -> acc
103 |     | (p, x) :: rest -> loop (if p then x :: acc else acc) rest
104 |   in
105 |   loop [] (rev_combine m l)
106 | 


--------------------------------------------------------------------------------
/src/node.ml:
--------------------------------------------------------------------------------
 1 | (* Copyright (C) 2020, Francois Berenger
 2 | 
 3 |    Yamanishi laboratory,
 4 |    Department of Bioscience and Bioinformatics,
 5 |    Faculty of Computer Science and Systems Engineering,
 6 |    Kyushu Institute of Technology,
 7 |    680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)
 8 | 
 9 | module IntSet = BatSet.Int
10 | 
(* one node of the molecular graph *)
type t = { typ: PiEltHA.t ; (* atom type *)
           succs: IntSet.t } (* indexes of its direct successors
                                in the molecular graph (it is bonded to them) *)

(* node of type [typ] with successors [succs] *)
let create typ succs =
  { typ; succs }

(* placeholder node: dummy atom type and no successor *)
let dummy = create PiEltHA.dummy IntSet.empty

(* copy of [n] with [succ] added to its successors *)
let add_succ (n: t) (succ: int): t =
  { n with succs = IntSet.add succ n.succs }

(* indexes of the successors of a node *)
let get_succs ({ succs; _ }: t): IntSet.t =
  succs

(* atom type of a node *)
let get_typ ({ typ; _ }: t): PiEltHA.t =
  typ
28 | 


--------------------------------------------------------------------------------
/src/norm.ml:
--------------------------------------------------------------------------------
 1 | 
 2 | open Printf
 3 | 
 4 | module IntMap = BatMap.Int
 5 | 
 6 | (* max norm is probably to be preferred if we are going to minwise hash
 7 |  * the fingerprints later on *)
 8 | type norm = Max_norm (* max feature value in current instance *)
 9 |           | L1_norm (* Manhatan distance *)
10 | 
11 | let of_string = function
12 |   | "l1" -> L1_norm
13 |   | "max" -> Max_norm
14 |   | other -> failwith (sprintf "Decoder: unknown norm: %s" other)
15 | 
(* norm (as a float) of the integer values stored in [map]:
   their sum for [L1_norm], their maximum for [Max_norm].
   Both folds start from 0, so an empty map gives 0.0. *)
let map_norm style map =
  let combine =
    match style with
    | L1_norm -> (+)
    | Max_norm -> max in
  float (IntMap.fold (fun _k v acc -> combine v acc) map 0)
21 | 


--------------------------------------------------------------------------------
/src/ph4.ml:
--------------------------------------------------------------------------------
 1 | (* Copyright (C) 2022, Francois Berenger
 2 | 
 3 |    Tsuda Laboratory, Graduate School of Frontier Sciences,
 4 |    The University of Tokyo, Japan.
 5 | 
 6 |    Support .ph4 files. *)
 7 | 
 8 | (* A .ph4 files has format:
 9 | ---
10 | <num_feats:int>:<mol_name:string>
11 | ARO 1.47088 -0.706617 1.86095
12 | .
13 | .
14 | .
15 | --- *)
16 | 
17 | module S = BatString
18 | 
(* parse a "<num_feats:int>:<mol_name:string>" header line
   into (num_feats, mol_name) *)
let parse_header_line line =
  let count_str, mol_name = S.split ~by:":" line in
  let count = int_of_string count_str in
  (count, mol_name)

(* raised inside [read_one] once a whole molecule has been read *)
exception Read_one

(* Read one molecule from a .ph4 file: the header line followed by as many
   feature lines as the header announces; each line is terminated by '\n'.
   If the file ends in the middle of a molecule, the lines read so far
   are returned. Raises [End_of_file] if nothing could be read. *)
let read_one (input: in_channel): string =
  let acc = Buffer.create 2048 in
  let keep l =
    Buffer.add_string acc l;
    Buffer.add_char acc '\n' in
  try
    let header = input_line input in
    let num_feats, _name = parse_header_line header in
    keep header;
    for _i = 1 to num_feats do
      keep (input_line input)
    done;
    raise Read_one
  with End_of_file | Read_one ->
    match Buffer.contents acc with
    | "" -> raise End_of_file
    | mol -> mol

(* molecule name, taken from the header (first) line of [ph4_lines] *)
let get_name ph4_lines =
  let first_line = fst (S.split ph4_lines ~by:"\n") in
  snd (parse_header_line first_line)
49 | 


--------------------------------------------------------------------------------
/src/ph4_atom.ml:
--------------------------------------------------------------------------------
 1 | (* Copyright (C) 2020, Francois Berenger
 2 | 
 3 |    Yamanishi laboratory,
 4 |    Department of Bioscience and Bioinformatics,
 5 |    Faculty of Computer Science and Systems Engineering,
 6 |    Kyushu Institute of Technology,
 7 |    680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)
 8 | 
 9 | (* small module to help compute atom environments from *.ph4 files *)
10 | 
11 | open Printf
12 | 
(* an atom: its index and its pharmacophore type *)
type t = { idx: int;
           typ: Ph4.t }

(* the stored index is [idx - 1]
   (original note: indexes start at 1 in MOL2 files) *)
let create (idx: int) (typ: Ph4.t): t =
  { idx = pred idx; typ }

(* placeholder atom; because of the shift done by [create],
   its stored index is -2 *)
let dummy = create (-1) Ph4.Non

(* parse a line made of an integer index followed by a one-character type;
   a scanning failure is turned into [Failure] with the offending line *)
let of_ph4_line (l: string): t =
  let build idx c = create idx (Ph4.of_char c) in
  try Scanf.sscanf l "%d %c" build
  with Scanf.Scan_failure msg ->
    failwith (sprintf "Ph4_atom.of_ph4_line: could not parse: %s: %s" l msg)

(* "<stored index> <type>" *)
let to_string ({ idx; typ }: t): string =
  sprintf "%d %s" idx (Ph4.to_string typ)
30 | 


--------------------------------------------------------------------------------
/src/piEltHA.ml:
--------------------------------------------------------------------------------
 1 | (* Copyright (C) 2020, Francois Berenger
 2 | 
 3 |    Yamanishi laboratory,
 4 |    Department of Bioscience and Bioinformatics,
 5 |    Faculty of Computer Science and Systems Engineering,
 6 |    Kyushu Institute of Technology,
 7 |    680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)
 8 | 
 9 | (* format: (#pi elec.,elt,#HA) cf. bin/type_atoms.py for details *)
10 | 
11 | type t = string
12 | 
13 | let dummy = ""
14 | 
15 | let to_string x =
16 |   x
17 | 
18 | let of_string s =
19 |   s
20 | 
21 | let compare x y =
22 |   String.compare x y
23 | 


--------------------------------------------------------------------------------
/src/prune.ml:
--------------------------------------------------------------------------------
 1 | 
 2 | (* prune features from a dictionary file, provided a list of feature indexes
 3 |  * to remove *)
 4 | 
 5 | module CLI = Minicli.CLI
 6 | module Ht = Hashtbl
 7 | module L = BatList
 8 | module Log = Dolog.Log
 9 | module LO = Line_oriented
10 | module S = BatString
11 | module Utls = Molenc.Utls
12 | 
13 | open Printf
14 | 
(* Copy dictionary file [in_dico_fn] to [out_dico_fn] without the features
   whose id is listed in [features_to_drop].
   - lines starting with '#' are copied unchanged;
   - any other line must scan as "<id:int> <feature:string> <count:int>";
     kept features are renumbered consecutively, starting from 0.
   [features_to_drop] must not be empty (checked with [Utls.enforce]).
   A line which cannot be parsed is logged, then the exception is re-raised. *)
let prune_dico features_to_drop in_dico_fn out_dico_fn =
  let n = L.length features_to_drop in
  Utls.enforce (n > 0) "Model.prune_dico: |features_to_drop| = 0";
  let to_drop = Ht.create n in
  (* [Ht.replace], not [Ht.add]: a feature id listed several times must be
     bound only once, else [Ht.length] below over-counts *)
  L.iter (fun i ->
      Ht.replace to_drop i ()
    ) features_to_drop;
  Log.info "pruning %d features" (Ht.length to_drop);
  LO.with_out_file out_dico_fn (fun out ->
      (* id that the next kept feature will receive *)
      let new_feat_id = ref 0 in
      LO.iter in_dico_fn (fun line ->
          if S.starts_with line "#" then
            (* preserve comments *)
            fprintf out "%s\n" line
          else
            try
              (* this is the format for an Atom Pairs dictionary *)
              Scanf.sscanf line "%d %s@ %d" (fun featId featStr featCount ->
                  (* drop some features and update feature ids *)
                  if not (Ht.mem to_drop featId) then
                    (fprintf out "%d %s %d\n" !new_feat_id featStr featCount;
                     incr new_feat_id)
                )
            with exn -> (Log.fatal "dico %s: cannot parse line %s"
                           in_dico_fn line;
                         raise exn)
        )
    )
43 | 
(* CLI entry point: read the feature ids to drop (one integer per line)
   from the -f file, then prune the -i dictionary into the -o one.
   Without any argument, or with -h/--help: print usage and exit 1. *)
let main () =
  Log.(set_log_level INFO);
  Log.color_on ();
  let argc, args = CLI.init () in
  let show_help = CLI.get_set_bool ["-h";"--help"] args in
  if argc = 1 || show_help then
    begin
      eprintf "usage:\n  \
               %s -i input.dix -o output.dix -f features.txt\n  \
               -i <filename>: input AP dictionary\n  \
               -o <filename>: output AP dictionary\n  \
               -f <filename>: file with list of features to drop\n  \
               (format: one integer per line)\n"
        Sys.argv.(0);
      exit 1
    end;
  let dico_in = CLI.get_string ["-i"] args in
  let dico_out = CLI.get_string ["-o"] args in
  let to_drop_fn = CLI.get_string ["-f"] args in
  CLI.finalize();
  prune_dico (LO.map to_drop_fn int_of_string) dico_in dico_out

let () = main ()
67 | 


--------------------------------------------------------------------------------
/src/pubchem_decoder.ml:
--------------------------------------------------------------------------------
 1 | (* Copyright (C) 2020, Francois Berenger
 2 | 
 3 |    Yamanishi laboratory,
 4 |    Department of Bioscience and Bioinformatics,
 5 |    Faculty of Computer Science and Systems Engineering,
 6 |    Kyushu Institute of Technology,
 7 |    680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)
 8 | 
 9 | (* decode pubchem FPs (881 bits) to liblinear format
10 |    Here is a pubchem FP example line:
11 | ---
12 | 4036230,0.0,0001010110...01010100001
13 | ---
14 |  *)
15 | 
16 | open Printf
17 | 
18 | module CLI = Minicli.CLI
19 | module Log = Dolog.Log
20 | module LO = Line_oriented
21 | module String = BatString
22 | module Utls = Molenc.Utls
23 | 
(* Convert one "name,IC50,bitstring" line to liblinear format:
   the label is "+1" if the name starts with "active", else "-1"; it is
   followed by " k:1" for each bit set, k being the 1-based bit position.
   The bitstring length must be 881, 2048 or 16384 (asserted).
   Fails if the line does not hold exactly three comma-separated fields. *)
let liblinear_line_of_pubchem_line line =
  match String.split_on_char ',' line with
  | [name; _IC50; bitstring] ->
    let label = if String.starts_with name "active" then "+1" else "-1" in
    let nb_bits = String.length bitstring in
    assert(nb_bits = 881 || nb_bits = 2048 || nb_bits = 16384);
    let out = Buffer.create 1024 in
    Buffer.add_string out label;
    String.iteri (fun i bit ->
        match bit with
        | '1' ->
          (* in liblinear: feature indexes start at 1 *)
          Printf.bprintf out " %d:1" (i + 1)
        | _ -> ()
      ) bitstring;
    Buffer.contents out
  | _ -> failwith ("Pubchem_decoder: invalide line: " ^ line)
39 | 
(* CLI entry point: convert each line of the -i file to liblinear format
   into the -o file; progress is reported on stderr every 1000 lines
   and the final number of lines read is printed at the end. *)
let main () =
  Log.(set_log_level INFO);
  Log.color_on ();
  let argc, args = CLI.init () in
  if argc = 1 then
    begin
      eprintf "usage: %s\n  \
               -i <filename>: encoded molecules\n  \
               -o <filename>: decoded molecules for liblinear\n"
        Sys.argv.(0);
      exit 1
    end;
  let input_fn = CLI.get_string ["-i"] args in
  let output_fn = CLI.get_string ["-o"] args in
  CLI.finalize ();
  let nb_read = ref 0 in
  LO.with_infile_outfile input_fn output_fn (fun input output ->
      let convert in_line =
        incr nb_read;
        if !nb_read mod 1000 = 0 then
          eprintf "read: %d\r%!" !nb_read;
        fprintf output "%s\n" (liblinear_line_of_pubchem_line in_line) in
      try
        while true do
          convert (input_line input)
        done
      with End_of_file -> ()
    );
  eprintf "read: %d\n" !nb_read

let () = main ()
69 | 


--------------------------------------------------------------------------------
/src/rank.ml:
--------------------------------------------------------------------------------
 1 | (* compute ranks associated to each score
 2 |    equal scores will be given equal ranks *)
 3 | 
 4 | open Printf
 5 | 
 6 | module CLI = Minicli.CLI
 7 | module Ht = Hashtbl
 8 | module L = BatList
 9 | module Log = Dolog.Log
10 | module LO = Line_oriented
11 | module String = BatString
12 | module Utls = Molenc.Utls
13 | 
(* CLI entry point.
   First pass: read the score found in field -f of each line of the -i file.
   Then rank the distinct scores: ranks start at 0, follow decreasing score
   order by default (increasing with -r); equal scores share the same rank.
   Second pass: copy each input line to the -o file, followed by the field
   separator and the rank of its score. *)
let main () =
  Log.(set_log_level INFO);
  Log.color_on ();
  let argc, args = CLI.init () in
  if argc = 1 then
    begin
      eprintf "usage:\n\
               %s\n  \
               -i <filename>: input scores file\n  \
               -o <filename>: output rank and scores file\n  \
               -f <int>: score field (>= 1)\n  \
               [-d <char>]: field separator (default=\\t)\n  \
               [-r]: increasing scores order (default=decreasing)\n"
        Sys.argv.(0);
      exit 1
    end;
  let input_fn = CLI.get_string ["-i"] args in
  let output_fn = CLI.get_string ["-o"] args in
  let sep = CLI.get_char_def ["-d"] args '\t' in
  (* fields are numbered from 1 on the CLI but from 0 internally *)
  let field = (CLI.get_int ["-f"] args) - 1 in
  let revert = CLI.get_set_bool ["-r"] args in
  CLI.finalize();
  (* score stored in the selected field of [line]; used by both passes so
     that a line is parsed exactly the same way each time *)
  let score_of_line line =
    let score_field = String.cut_on_char sep field line in
    try Scanf.sscanf score_field "%f" (fun x -> x)
    with exn ->
      begin
        Log.fatal "Rank: cannot parse float: %s" score_field;
        raise exn
      end in
  (* read all scores *)
  let all_scores = LO.map input_fn score_of_line in
  (* create the score to rank LUT *)
  let uniq_scores =
    let cmp =
      if revert then BatFloat.compare (* increasing sort *)
      else
        (* default: scores in decreasing order; i.e. the highest score
           gets the lowest rank *)
        (fun x y -> BatFloat.compare y x) in
    L.sort_uniq cmp all_scores in
  let score2rank = Ht.create (L.length uniq_scores) in
  L.iteri (fun rank score ->
      Ht.add score2rank score rank
    ) uniq_scores;
  (* output all lines, along with their rank *)
  LO.with_out_file output_fn (fun output ->
      LO.iter input_fn (fun line ->
          let rank = Ht.find score2rank (score_of_line line) in
          fprintf output "%s%c%d\n" line sep rank
        )
    )

let () = main ()
79 | 


--------------------------------------------------------------------------------
/src/rdkit_wrapper_specs.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | val __init__: smi:string -> unit -> t
 3 | 
 4 | val add_hydrogens: t -> unit -> t
 5 | 
 6 | val type_atom: t -> i:int -> unit -> int array
 7 | 
 8 | val type_EltFCaroNeighbs: t -> i:int -> unit -> int array
 9 | 
10 | val type_atom_simple: t -> i:int -> unit -> int array
11 | 
12 | val daylight_type_heavy_atom: t -> i:int -> unit -> int array
13 | 
14 | val get_num_atoms: t -> unit -> int
15 | 
16 | val get_diameter: t -> unit -> int
17 | 
18 | val get_distance: t -> i:int -> j:int -> unit -> int
19 | 
20 | val get_distances: t -> i:int -> unit -> int array
21 | 
22 | val get_deep_smiles: t -> seed:int -> n:int -> randomize:bool -> smi:string -> unit -> string array
23 | 
24 | val get_elements: t -> unit -> string array
25 | 
26 | val get_anums: t -> unit -> int array
27 | 


--------------------------------------------------------------------------------
/src/scale.ml:
--------------------------------------------------------------------------------
 1 | (* Copyright (C) 2020, Francois Berenger
 2 | 
 3 |    Yamanishi laboratory,
 4 |    Department of Bioscience and Bioinformatics,
 5 |    Faculty of Computer Science and Systems Engineering,
 6 |    Kyushu Institute of Technology,
 7 |    680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)
 8 | 
 9 | open Printf
10 | 
type t = Single of int (* encoding radius *)
       | Multi of int * int (* start-stop encoding radii *)

(* Parse "<int>" into [Single] or "<int>:<int>" into [Multi].
   For [Multi], both radii must be non negative and the first one must not
   exceed the second one (asserted). *)
let of_string s =
  if not (BatString.contains s ':') then
    Single (int_of_string s)
  else
    let start, stop =
      let istr, jstr = BatString.split s ~by:":" in
      (int_of_string istr, int_of_string jstr) in
    assert(0 <= start && start <= stop);
    Multi (start, stop)

(* Read the encoding radii from the first line of dictionary file [fn].
   That line must start with "#radius="; everything after this prefix is
   given to [of_string] (so, for example: "#radius=1" or "#radius=0:1"). *)
let of_dictionary_header fn =
  let prfx = "#radius=" in
  let header = Utls.get_first_line fn in
  Utls.enforce (BatString.starts_with header prfx)
    "Scale.of_dictionary_header: not a circular FP dictionary header; \
     --pairs CLI option probably missing";
  of_string (BatString.lchop ~n:(String.length prfx) header)

(* inverse of [of_string]: "<int>" or "<int>:<int>" *)
let to_string scale =
  match scale with
  | Multi (start, stop) -> sprintf "%d:%d" start stop
  | Single radius -> string_of_int radius

(* all the radii covered: the single radius, or each integer
   from start to stop, both included *)
let to_list scale =
  match scale with
  | Multi (start, stop) -> BatList.range start `To stop
  | Single radius -> [radius]
42 | 


--------------------------------------------------------------------------------
/src/sdf.ml:
--------------------------------------------------------------------------------
 1 | 
 2 | (* one molecule in SDF format (i.e. consecutive lines from a .sdf file) *)
 3 | type t = string
 4 | 
 5 | exception Read_one
 6 | 
 7 | let read_one (input: in_channel): t =
 8 |   let buff = Buffer.create 10240 in
 9 |   try
10 |     while true do
11 |       let line = input_line input in
12 |       if line = "$$$$" then (* end of molecule in SDF format *)
13 |         (Buffer.add_string buff line;
14 |          Buffer.add_char buff '\n';
15 |          raise Read_one)
16 |       else
17 |         (Buffer.add_string buff line;
18 |          Buffer.add_char buff '\n')
19 |     done;
20 |     assert(false)
21 |   with
22 |   | End_of_file | Read_one ->
23 |     let res = Buffer.contents buff in
24 |     if res = "" then
25 |       raise End_of_file
26 |     else
27 |       res
28 | 
29 | (* return the inchi string, no trailing '\n' *)
30 | let get_inchi (mol: t): string =
31 |   let line_before = "> <PUBCHEM_IUPAC_INCHI>\n" in
32 |   let n = String.length line_before in
33 |   try
34 |     let i = BatString.find mol line_before in
35 |     let j = i + n in
36 |     let k = BatString.find_from mol j "\n" in
37 |     BatString.sub mol j (k - j)
38 |   with Not_found ->
39 |     failwith ("Sdf.get_inchi: no inchi for: " ^ mol)
40 | 
41 | let get_inchikey (mol: t): string =
42 |   let line_before = "> <PUBCHEM_IUPAC_INCHIKEY>\n" in
43 |   let n = String.length line_before in
44 |   try
45 |     let i = BatString.find mol line_before in
46 |     let j = i + n in
47 |     let k = BatString.find_from mol j "\n" in
48 |     BatString.sub mol j (k - j)
49 |   with Not_found ->
50 |     failwith ("Sdf.get_inchikey: no inchikey for: " ^ mol)
51 | 
52 | let get_fst_line m =
53 |   fst (BatString.split m ~by:"\n")
54 | 


--------------------------------------------------------------------------------
/src/sdf_read.ml:
--------------------------------------------------------------------------------
 1 | (* Copyright (C) 2022, Francois Berenger
 2 | 
 3 |    Tsuda laboratory, Graduate School of Frontier Sciences,
 4 |    The University of Tokyo, Japan.
 5 | 
 6 |    Dump a .sdf file in txt format. *)
 7 | 
 8 | open Printf
 9 | 
10 | module A = BatArray
11 | module L = BatList
12 | module LO = Line_oriented
13 | module Sdf_3D = Molenc.Sdf_3D
14 | module V3 = Vector3
15 | 
(* Dump on stdout each molecule of the .sdf file given as first CLI argument:
   its name, then one "x y z element" line per atom, then one line per
   (source atom, connected atom) pair, atom indexes being printed from 1. *)
let main () =
  let input_fn = Sys.argv.(1) in
  let dump_atom xyz anum =
    let (x, y, z) = V3.to_triplet xyz in
    printf "%10.4f%10.4f%10.4f %s\n" x y z (Sdf_3D.symbol_of_anum anum) in
  let dump_bonds src_a connected_atoms =
    L.iter (fun dst_a ->
        printf "%3d%3d\n" (1 + src_a) (1 + dst_a)
      ) connected_atoms in
  LO.with_in_file input_fn (fun input ->
      try
        while true do
          let mol = Sdf_3D.read_one_molecule input in
          printf "%s\n" mol.Sdf_3D.name;
          A.iter2 dump_atom mol.Sdf_3D.coords mol.Sdf_3D.elements;
          A.iteri dump_bonds mol.Sdf_3D.bonds
        done
      with End_of_file -> ()
    )

let () = main ()
42 | 


--------------------------------------------------------------------------------
/src/shannon.ml:
--------------------------------------------------------------------------------
 1 | (* Copyright (C) 2020, Francois Berenger
 2 | 
 3 |    Yamanishi laboratory,
 4 |    Department of Bioscience and Bioinformatics,
 5 |    Faculty of Computer Science and Systems Engineering,
 6 |    Kyushu Institute of Technology,
 7 |    680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)
 8 | 
 9 | (* Compute Shannon information entropy of atom pair features *)
10 | 
11 | open Printf
12 | 
13 | module CLI = Minicli.CLI
14 | module Fp = Molenc.Fingerprint
15 | module FpMol = Molenc.FpMol
16 | module Ht = BatHashtbl
17 | module L = BatList
18 | module LO = Line_oriented
19 | module Log = Dolog.Log
20 | module Utls = Molenc.Utls
21 | 
(* 1 / ln(2): factor turning a natural logarithm into a base 2 one *)
let log2_scale = 1.0 /. log 2.0

(* base 2 logarithm *)
let log2 x =
  log x *. log2_scale
26 | 
(* -sum(p_i * log2(p_i)) over the bindings of [val_counts],
   where p_i is the count of the binding divided by [n] *)
let shannon_entropy n val_counts =
  let acc = ref 0.0 in
  let add_term _value count =
    let p = float count /. n in
    acc := !acc +. (p *. log2 p) in
  Ht.iter add_term val_counts;
  -1.0 *. !acc
34 | 
(* CLI entry point.
   For each feature id in [0, nb_features), count how many times each
   feature value is seen over all the molecules of the -i file, then compute
   one entropy per feature with [shannon_entropy], using the grand total of
   all the counts as [n]. Features with a strictly positive entropy are
   written to the -o file by decreasing entropy, one "id entropy cumulated"
   line each, where cumulated is the sum of the entropies printed on the
   previous lines. *)
let main () =
  Log.(set_log_level INFO);
  Log.color_on ();
  let argc, args = CLI.init () in
  if argc = 1 then
    begin
      eprintf "usage:\n  \
               %s -i mols.txt -n num_features -o entropy.txt\n  \
               -i <filename>: input molecules file\n  \
               -n <int>: number of features\n  \
               -o <filename>: output file\n"
        Sys.argv.(0);
      exit 1
    end;
  let input_fn = CLI.get_string ["-i"] args in
  let nb_features = CLI.get_int ["-n"] args in
  let output_fn = CLI.get_string ["-o"] args in
  CLI.finalize ();
  Log.info "reading molecules...";
  let all_molecules = LO.map input_fn (FpMol.parse_one 0) in
  Log.info "read: %d" (L.length all_molecules);
  Log.info "computing entropy...";
  (* feature id -> (feature value -> number of times this value was seen) *)
  let ht = Ht.create nb_features in
  for i = 0 to nb_features - 1 do
    Ht.add ht i (Ht.create 11)
  done;
  let count_one feat_id feat_count =
    let seen = Ht.find ht feat_id in
    let prev_count = Ht.find_default seen feat_count 0 in
    Ht.replace seen feat_count (prev_count + 1) in
  L.iter (fun mol ->
      Fp.kv_iter count_one (FpMol.get_fp mol)
    ) all_molecules;
  (* grand total of all the counts, all features together *)
  let total = ref 0 in
  for i = 0 to nb_features - 1 do
    Ht.iter (fun _k v ->
        total := !total + v
      ) (Ht.find ht i)
  done;
  Log.info "total: %d" !total;
  let n = float !total in
  (* entropy of each feature; null entropies are left out *)
  let feat_ent = ref [] in
  for i = 0 to nb_features - 1 do
    let ent = shannon_entropy n (Ht.find ht i) in
    if ent > 0.0 then
      feat_ent := (i, ent) :: !feat_ent
  done;
  (* sort features by decreasing entropy *)
  let by_decr_entropy (_i, ei) (_j, ej) = BatFloat.compare ej ei in
  let feat_ent_decr = L.sort by_decr_entropy !feat_ent in
  let cumulated = ref 0.0 in
  LO.with_out_file output_fn (fun out ->
      L.iter (fun (feat, ent) ->
          (* printed before being updated: the current entropy
             is not part of the cumulated value on its own line *)
          fprintf out "%d %f %f\n" feat ent !cumulated;
          cumulated := !cumulated +. ent
        ) feat_ent_decr
    )

let () = main ()
97 | 


--------------------------------------------------------------------------------
/src/shuf.ml:
--------------------------------------------------------------------------------
 1 | (* replacement for UNIX's shuf command, but seedable for reproducibility *)
 2 | 
 3 | open Printf
 4 | 
 5 | module A = BatArray
 6 | module CLI = Minicli.CLI
 7 | module LO = Line_oriented
 8 | module Log = Dolog.Log
 9 | module RNG = BatRandom.State
10 | 
(* raised to leave the output loop once enough lines were written *)
exception Early_stop

(* CLI entry point: load all the lines of the -i file, shuffle them
   (with the seed given by -s if any, else with a self-initialized random
   state) and write at most -n of them (all of them by default)
   to the -o file. *)
let main () =
  Log.(set_log_level INFO);
  Log.color_on ();
  let argc, args = CLI.init () in
  if argc = 1 then
    begin
      eprintf "usage:\n\
               %s\n  \
               -i <filename>: input file\n  \
               -o <filename>: output file\n  \
               [-n <int>]: output at most N lines (default=all)\n  \
               [-s <int>]: random seed (default=none)\n"
        Sys.argv.(0);
      exit 1
    end;
  let input_fn = CLI.get_string ["-i"] args in
  let output_fn = CLI.get_string ["-o"] args in
  let maybe_n = CLI.get_int_opt ["-n"] args in
  let maybe_seed = CLI.get_int_opt ["-s"] args in
  CLI.finalize (); (* ------------------------------------------------------ *)
  (* input *)
  let all_lines = A.of_list (LO.lines_of_file input_fn) in
  let count = A.length all_lines in
  (* output all or not? *)
  let output_n =
    match maybe_n with
    | Some m -> min count m
    | None -> count in
  let rng =
    match maybe_seed with
    | Some s -> RNG.make [|s|]
    | None -> RNG.make_self_init () in
  (* shuffle *)
  A.shuffle ~state:rng all_lines;
  (* output *)
  LO.with_out_file output_fn (fun out ->
      let emit i line =
        if i >= output_n then raise Early_stop;
        fprintf out "%s\n" line in
      try A.iteri emit all_lines
      with Early_stop -> ()
    )

let () = main ()
59 | 


--------------------------------------------------------------------------------
/src/smi.ml:
--------------------------------------------------------------------------------
 1 | 
 2 | (* read one molecule from a SMILES file *)
 3 | let read_one (input: in_channel): string =
 4 |   let line = input_line input in
 5 |   (* strip protects against trailing '\r'
 6 |      we append '\n' because all other formats
 7 |      end molecules with a '\n' *)
 8 |   (BatString.strip line) ^ "\n"
 9 | 
(* extract the molecule name from a SMILES line: everything after
   the first tab, without any trailing "\r\n" or "\n".
   Fails if the line does not contain any tab. *)
let get_name smiles_line =
  match BatString.split smiles_line ~by:"\t" with
  | exception Not_found ->
    failwith "Smi.get_name: smiles file not using tabs"
  | (_smiles, name) ->
    (* all other file formats expect the molecule name only,
     * not the molecule name followed by EOL *)
    let without_last n = BatString.rchop ~n name in
    if BatString.ends_with name "\r\n" then without_last 2
    else if BatString.ends_with name "\n" then without_last 1
    else name
22 | 


--------------------------------------------------------------------------------
/src/syb_atom.ml:
--------------------------------------------------------------------------------
 1 | (* Copyright (C) 2020, Francois Berenger
 2 | 
 3 |    Yamanishi laboratory,
 4 |    Department of Bioscience and Bioinformatics,
 5 |    Faculty of Computer Science and Systems Engineering,
 6 |    Kyushu Institute of Technology,
 7 |    680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)
 8 | 
 9 | (* very small atom module to help compute atom environments (a la molprint2d)
10 |    from MOL2 files *)
11 | 
12 | open Printf
13 | 
(* an atom is reduced to its index and its Sybyl atom type;
   when built via [create], idx is the MOL2 index minus one *)
type t = { idx: int;
           typ: Sybyl.t }
16 | 
(* build an atom from its index as found in a MOL2 file and its Sybyl type;
   indexes start at 1 in MOL2 files, while the stored one starts at 0 *)
let create (idx: int) (typ: Sybyl.t): t =
  let zero_based = idx - 1 in
  { idx = zero_based; typ = typ }
20 | 
(* placeholder atom of type Sybyl.Du; since [create] subtracts one
   from the index it is given, the stored idx is -2 *)
let dummy = create (-1) Sybyl.Du
22 | 
(* parse one atom line from a MOL2 file; only the atom index
   and the Sybyl atom type are kept, all other fields are ignored.
   example line (output of OpenBabel):
   "      1 S          -0.0218    1.7554    0.0117 S.2     1  LIG1       -0.0637"
   Raises Failure, with the offending line in the message, if the line
   cannot be parsed. *)
let of_mol2_line (l: string): t =
  let parse_error (msg: string) =
    failwith (sprintf "Atom.of_mol2_line: could not parse: %s: %s" l msg) in
  try Scanf.sscanf l " %d %s %f %f %f %s@ %s@ %s@ %s"
        (fun idx _name _x _y _z typ _bs0 _bs1 _bs2 ->
           create idx (Sybyl.of_string typ))
  with
  | Scanf.Scan_failure msg -> parse_error msg
  (* sscanf signals an impossible number conversion with Failure ... *)
  | Failure msg -> parse_error msg
  (* ... and a truncated line with End_of_file; report both the same way
     so that the caller always gets to see the offending line *)
  | End_of_file -> parse_error "unexpected end of line"
31 | 
(* "<idx> <sybyl_type>" *)
let to_string (a: t): string =
  let { idx; typ } = a in
  sprintf "%d %s" idx (Sybyl.to_string typ)
34 | 


--------------------------------------------------------------------------------
/src/test_RS.ml:
--------------------------------------------------------------------------------
 1 | 
(* initial tests of random coordinates sub-sampling without replacement
   (for fast but approximate Jaccard computation) *)
 4 | 
 5 | open Printf
 6 | 
 7 | module A = BatArray
 8 | module CLI = Minicli.CLI
 9 | module Fp = Molenc.Fingerprint
10 | module FpMol = Molenc.FpMol
11 | module Ht = Hashtbl
12 | module L = BatList
13 | module Log = Dolog.Log
14 | module Utls = Molenc.Utls
15 | 
(* For randomly drawn pairs of molecules, print on one line:
   - the exact Tanimoto, the one computed after dropping a random set of
     features and their absolute difference;
   - for sum_min ("n:") then sum_max ("u:"): the exact value, the value
     after dropping features, their ratio, alpha = 1/(1 - drop_p)
     and the relative difference between that ratio and alpha. *)
let main () =
  Log.color_on ();
  Log.set_log_level Log.INFO;
  let argc, args = CLI.init () in
  if argc = 1 then
    begin
      eprintf "usage:\n\
               %s -p <drop_frac:FLOAT> -i <FILE> [-n <repeats:INT>]\n"
        Sys.argv.(0);
      exit 1
    end;
  let input_fn = CLI.get_string ["-i"] args in
  let drop_p = CLI.get_float ["-p"] args in
  assert(drop_p > 0.0 && drop_p < 1.0);
  let nb_iter = CLI.get_int_def ["-n"] args 10_000 in
  CLI.finalize ();
  let alpha = 1.0 /. (1.0 -. drop_p) in
  (* load all molecules *)
  let molecules = FpMol.molecules_of_file input_fn in
  let nb_mols = L.length molecules in
  Log.info "nb_mols: %d" nb_mols;
  let fingerprints = A.of_list (L.map FpMol.get_fp molecules) in
  let nb_features = L.max (L.map FpMol.nb_features molecules) in
  Log.info "nb_features: %d" nb_features;
  (* same fingerprints, minus a randomly chosen set of feature ids *)
  let truncated =
    let shuffled_ids = L.shuffle (L.range 0 `To (nb_features - 1)) in
    let nb_dropped = Utls.ceili (drop_p *. (float nb_features)) in
    let dropped = Ht.create nb_dropped in
    L.iter
      (fun feat_id -> Ht.add dropped feat_id ())
      (L.take nb_dropped shuffled_ids);
    A.map (Fp.drop_features dropped) fingerprints in
  (* relative difference between an observed ratio and alpha *)
  let rel_diff ratio = (ratio -. alpha) /. ratio in
  for _ = 1 to nb_iter do
    let i = Random.int nb_mols in
    let j = Random.int nb_mols in
    (* exact values *)
    let sum_min, sum_max = Fp.sum_min_max fingerprints.(i) fingerprints.(j) in
    let exact_tani = Fp.tanimoto fingerprints.(i) fingerprints.(j) in
    (* values after dropping features *)
    let est_tani = Fp.tanimoto truncated.(i) truncated.(j) in
    let est_sum_min, est_sum_max =
      Fp.sum_min_max truncated.(i) truncated.(j) in
    let min_ratio = float sum_min /. float est_sum_min in
    let max_ratio = float sum_max /. float est_sum_max in
    let tani_error = abs_float (exact_tani -. est_tani) in
    printf "Tani:\t%.3f\t%.3f\t%.3f" exact_tani est_tani tani_error;
    printf "\tn: %d\t%d\t%.3f\t%.3f\t%.3f" sum_min est_sum_min
      min_ratio alpha (rel_diff min_ratio);
    printf "\tu: %d\t%d\t%.3f\t%.3f\t%.3f\n" sum_max est_sum_max
      max_ratio alpha (rel_diff max_ratio)
  done
70 | 
71 | (* FBR: on stderr, print the average absolute error in each case *)
72 | 
73 | (* FBR: write unit test *)
74 | 
75 | let () = main ()
76 | 


--------------------------------------------------------------------------------
/src/to_dense.ml:
--------------------------------------------------------------------------------
 1 | 
 2 | (* read a molenc output file (.txt) and output it in dense csv format for R *)
 3 | 
 4 | open Printf
 5 | 
 6 | module A = BatArray
 7 | module Bloom = Molenc.Bloom
 8 | module CLI = Minicli.CLI
 9 | module Log = Dolog.Log
10 | module LO = Line_oriented
11 | module Fp = Molenc.Fingerprint
12 | module Utls = Molenc.Utls
13 | 
(* print on stdout the dense version of one "name,pIC50,fingerprint" line:
   the pIC50 comes first, followed by the feature values.
   If Bloom filter parameters are provided, the fingerprint goes through
   Bloom.encode and the resulting array is printed instead.
   Any exception is logged along with the offending line, then re-raised. *)
let expand_line nb_features maybe_bloom line =
  let print_dense _name pIC50 fp_str =
    printf "%f" pIC50;
    let fp = Fp.of_string fp_str in
    begin match maybe_bloom with
      | Some params -> A.iter (printf " %d") (Bloom.encode params fp)
      | None -> Fp.to_dense_printf nb_features fp
    end;
    printf "\n" in
  try Scanf.sscanf line "%s@,%f,%s" print_dense
  with exn ->
    begin
      Log.fatal "cannot parse: %s" line;
      raise exn
    end
31 | 
(* Parse the command line, print a header line made of column numbers,
   then print the dense version of each line of the input file. *)
let main () =
  Log.color_on ();
  Log.set_log_level Log.INFO;
  Log.info "start";
  let argc, args = CLI.init () in
  let show_help = CLI.get_set_bool ["-h";"--help"] args in
  if argc = 1 || show_help then
    begin
      eprintf "usage:\n\
               %s [-np <int>] -i <molecules.txt> -n <nb_features>\n\
               [--bloom <int>,<int>]: k,m counted Bloom filter params\n"
        Sys.argv.(0);
      exit 1
    end;
  (* -np is accepted but its value is not used *)
  let _nprocs = CLI.get_int_def ["-np"] args 1 in
  let input_fn = CLI.get_string ["-i"] args in
  let input_features = CLI.get_int ["-n"] args in
  let bloom_spec = CLI.get_string_opt ["--bloom"] args in
  (* with a Bloom filter, there are m output columns
     instead of one per input feature *)
  let output_features, maybe_bloom =
    match bloom_spec with
    | Some k_m ->
      let with_bloom k m =
        Utls.enforce (m < input_features) "m >= input_features";
        (m, Some (Bloom.init input_features k m)) in
      Scanf.sscanf k_m "%d,%d" with_bloom
    | None -> (input_features, None) in
  CLI.finalize ();
  (* header made of column numbers: IC50 in column 0, then the features *)
  printf "0";
  for col = 1 to output_features do
    printf " %d" col
  done;
  printf "\n";
  (* dense data lines, optionally encoded by a counted Bloom filter *)
  let expand = expand_line input_features maybe_bloom in
  LO.iter input_fn expand
65 | let () = main ()
66 | 


--------------------------------------------------------------------------------
/src/uniq.ml:
--------------------------------------------------------------------------------
  1 | 
(* uniq filter: keep a line only if its given field was never seen before *)
  3 | 
  4 | open Printf
  5 | 
  6 | module CLI = Minicli.CLI
  7 | module Db = Dokeysto.Db.RW
  8 | module Ht = Hashtbl
  9 | module Log = Dolog.Log
 10 | module LO = Line_oriented
 11 | module String = BatString
 12 | module Utls = Molenc.Utls
 13 | 
(* minimal interface of a store of strings, used to remember
   which fields were already seen *)
module type HT = sig
  type t
  (* create a new store; the parameter is the input file name *)
  val create: string -> t
  (* was this field added before? *)
  val mem: t -> string -> bool
  (* record this field *)
  val add: t -> string -> unit
  (* to be called once the store is not needed anymore *)
  val close: t -> unit
end
 21 | 
 22 | module HtOnDisk: HT = struct
 23 | 
 24 |   type t = Dokeysto.Db.RW.t
 25 | 
 26 |   let create input_fn =
 27 |     Db.create (input_fn ^ ".uniq.db")
 28 | 
 29 |   let mem db field =
 30 |     Db.mem db field
 31 | 
 32 |   let add db field =
 33 |     Db.add db field ""
 34 | 
 35 |   let close db =
 36 |     Db.close db
 37 | end
 38 | 
 39 | module HtInRAM: HT = struct
 40 | 
 41 |   type t = (string, unit) Ht.t
 42 | 
 43 |   let create _input_fn =
 44 |     (* DO NOT ever try to read the whole input file in case of --in-RAM:
 45 |      * we want to be able to read molecules from a UNIX pipe *)
 46 |     Ht.create 1_000_000
 47 | 
 48 |   let mem db field =
 49 |     Ht.mem db field
 50 | 
 51 |   let add db field =
 52 |     Ht.add db field ()
 53 | 
 54 |   let close _db =
 55 |     ()
 56 | end
 57 | 
 58 | let main () =
 59 |   Log.(set_log_level INFO);
 60 |   Log.color_on ();
 61 |   let argc, args = CLI.init () in
 62 |   if argc = 1 then
 63 |     begin
 64 |       eprintf "usage:\n\
 65 |                %s\n  \
 66 |                -i <filename>: input file\n  \
 67 |                -d <char>: field separator (default=\\t)\n  \
 68 |                -f <int>: field to filter on\n  \
 69 |                [--force]: erase index files, if any\n  \
 70 |                [--sorted]: file already sorted on that field\n  \
 71 |                [--in-RAM]: Ht in RAM rather than on disk\n"
 72 |         Sys.argv.(0);
 73 |       exit 1
 74 |     end;
 75 |   let mod_db =
 76 |     if CLI.get_set_bool ["--in-RAM"] args then
 77 |       (module HtInRAM: HT)
 78 |     else (module HtOnDisk: HT) in
 79 |   let module DB = (val mod_db: HT) in
 80 |   let sorted = CLI.get_set_bool ["--sorted"] args in
 81 |   let force = CLI.get_set_bool ["--force"] args in
 82 |   let input_fn = CLI.get_string ["-i"] args in
 83 |   (if force then
 84 |      (Utls.rm_file (input_fn ^ ".uniq.db");
 85 |       Utls.rm_file (input_fn ^ ".uniq.db.idx"))
 86 |   );
 87 |   let db = DB.create input_fn in
 88 |   let prev_field = ref "" in
 89 |   let uniq_field_check, register_field =
 90 |     if sorted then
 91 |       ((fun field -> !prev_field <> field),
 92 |        (fun field -> prev_field := field))
 93 |     else
 94 |       ((fun field -> not (DB.mem db field)),
 95 |        (fun field -> DB.add db field))
 96 |   in
 97 |   let sep = CLI.get_char_def ["-d"] args '\t' in
 98 |   let field_num = (CLI.get_int ["-f"] args) - 1 in
 99 |   let count = ref 0 in
100 |   LO.with_in_file input_fn (fun input ->
101 |       try
102 |         while true do
103 |           let line = input_line input in
104 |           let field_str = String.cut_on_char sep field_num line in
105 |           (if uniq_field_check field_str then
106 |              (register_field field_str;
107 |               printf "%s\n" line)
108 |           );
109 |           incr count;
110 |           (if !count mod 1000 = 0 then
111 |              eprintf "done: %d\r%!" !count
112 |           )
113 |         done
114 |       with End_of_file -> DB.close db
115 |     )
116 | 
117 | let () = main ()
118 | 


--------------------------------------------------------------------------------
/src/wmh_bench.ml:
--------------------------------------------------------------------------------
 1 | 
 2 | open Printf
 3 | 
 4 | module A = BatArray
 5 | module CLI = Minicli.CLI
 6 | module Fp = Molenc.Fingerprint
 7 | module FpMol = Molenc.FpMol
 8 | module L = BatList
 9 | module Log = Dolog.Log
10 | module Utls = Molenc.Utls
11 | module WMH = Molenc.WMH
12 | 
(* Benchmark WMH: log the hashing rate (molecules hashed per time unit
   reported by Utls.time_it) and the rate at which estimated Tanimoto
   scores are computed for random pairs of hashed molecules. *)
let main () =
  Log.color_on ();
  Log.set_log_level Log.INFO;
  let argc, args = CLI.init () in
  if argc = 1 then
    begin
      eprintf "usage:\n\
               %s -i encoded_molecules.txt\n" Sys.argv.(0);
      exit 1
    end;
  let input_fn = CLI.get_string ["-i"] args in
  (* load all molecules *)
  let molecules = FpMol.molecules_of_file input_fn in
  let nb_features = L.max (L.map FpMol.nb_features molecules) in
  let sparse_fps = A.of_list (L.map FpMol.get_fp molecules) in
  let bounds = WMH.bounds nb_features sparse_fps in
  let idx2feat = WMH.lookup_table bounds in
  let rand_bound = A.length idx2feat in
  let feat2acc_bound = WMH.acc_bounds_table bounds in
  let dense_fps = A.map (WMH.to_dense nb_features) sparse_fps in
  let nb_mols = A.length sparse_fps in
  Log.info "read %d molecules" nb_mols;
  (* bench hashing and scoring speeds for k hashes *)
  let bench k =
    let seeds = WMH.get_seeds k in
    let rands = WMH.gen_rands seeds rand_bound in
    (* hash all molecules *)
    let hashing_time, hashes =
      Utls.time_it (fun () ->
          A.map (WMH.hash rands idx2feat feat2acc_bound) dense_fps
        ) in
    Log.info "k: %d hashing-rate: %.2f" k (float nb_mols /. hashing_time);
    (* score as many random pairs as there are molecules *)
    let scoring_time, _est_dists =
      Utls.time_it (fun () ->
          let scores = A.make nb_mols 0.0 in
          for rank = 0 to nb_mols - 1 do
            let i1 = Random.int nb_mols in
            let i2 = Random.int nb_mols in
            scores.(rank) <- WMH.estimate_jaccard hashes.(i1) hashes.(i2)
          done;
          scores
        ) in
    Log.info "k: %d est-Tani-rate: %.2f" k (float nb_mols /. scoring_time) in
  L.iter bench [40]
58 | 
59 | let () = main ()
60 | 


--------------------------------------------------------------------------------
/src/wmh_test.ml:
--------------------------------------------------------------------------------
 1 | 
 2 | open Printf
 3 | 
 4 | module A = BatArray
 5 | module CLI = Minicli.CLI
 6 | module Fp = Molenc.Fingerprint
 7 | module FpMol = Molenc.FpMol
 8 | module L = BatList
 9 | module Log = Dolog.Log
10 | module Utls = Molenc.Utls
11 | module WMH = Molenc.WMH
12 | 
(* Compare exact Tanimoto scores to their WMH estimates on a fixed
   (seeded) set of random molecule pairs, for several numbers of hashes k.
   For each k:
   - the hashing rate and the estimated Tanimoto scoring rate are logged
     (with a warning if estimating is not faster than the exact computation);
   - "exact estimate abs_error" lines are written to k_<k>.data;
   - the max, average and median absolute errors are logged.
   Finally, gnuplot is run on tani_est.gpl. *)
let main () =
  Log.color_on ();
  Log.set_log_level Log.INFO;
  let argc, args = CLI.init () in
  if argc = 1 then
    (eprintf "usage:\n\
              %s -i encoded_molecules.txt\n" Sys.argv.(0);
     exit 1);
  let input_fn = CLI.get_string ["-i"] args in
  (* read all molecules *)
  let molecules = FpMol.molecules_of_file input_fn in
  let nb_features = L.max (L.map FpMol.nb_features molecules) in
  let sparse_fingerprints = A.of_list (L.map FpMol.get_fp molecules) in
  let bounds = WMH.bounds nb_features sparse_fingerprints in
  let idx2feat = WMH.lookup_table bounds in
  let rand_bound = A.length idx2feat in
  let feat2acc_bound = WMH.acc_bounds_table bounds in
  let dense_fingerprints = A.map (WMH.to_dense nb_features) sparse_fingerprints in
  let n = A.length sparse_fingerprints in
  Log.info "read %d molecules" n;
  (* number of random molecule pairs being scored *)
  let nb_pairs = 10_000 in
  (* compute Tani for many pairs (and compute scoring rate) *)
  Random.init 12345; (* seed PRNG *)
  let pairs = A.init nb_pairs (fun _i -> (Random.int n, Random.int n)) in
  let dists = A.make nb_pairs 0.0 in
  let dt1, () = Utls.time_it (fun () ->
      A.iteri (fun i (i1, i2) ->
          let tani =
            Fp.tanimoto
              (A.unsafe_get sparse_fingerprints i1)
              (A.unsafe_get sparse_fingerprints i2) in
          A.unsafe_set dists i tani
        ) pairs
    ) in
  (* nb_pairs scores (not n) were computed during dt1 *)
  let tani_rate = (float nb_pairs) /. dt1 in
  Log.info "Tani-rate: %.2f" tani_rate;
  let ks = [10; 20; 30; 40; 50; 100; 200; 500] in
  (* test the correctness and bench hashing and scoring speeds
     as a function of k (the number of hashes) *)
  L.iter (fun k ->
      let data_fn = sprintf "k_%03d.data" k in
      Utls.with_out_file data_fn (fun out ->
          (* hash them (and compute hashing rate) *)
          let seeds = WMH.get_seeds k in
          let rands = WMH.gen_rands seeds rand_bound in
          let dt0, hashes = Utls.time_it (fun () ->
              A.map (WMH.hash rands idx2feat feat2acc_bound) dense_fingerprints
            ) in
          (* here, n is the right count: all n molecules were hashed *)
          Log.info "k: %d hashing-rate: %11.2f" k (float n /. dt0);
          (* compute estimated tani for the same pairs
             (and compute scoring rate) *)
          let est_dists = A.make nb_pairs 0.0 in
          let dt2, () = Utls.time_it (fun () ->
              A.iteri (fun i (i1, i2) ->
                  let m1 = A.unsafe_get hashes i1 in
                  let m2 = A.unsafe_get hashes i2 in
                  let tani = WMH.estimate_jaccard m1 m2 in
                  A.unsafe_set est_dists i tani
                ) pairs
            ) in
          (* nb_pairs estimates (not n) were computed during dt2 *)
          let est_tani_rate = (float nb_pairs) /. dt2 in
          (if est_tani_rate <= tani_rate
           then Log.warn
           else Log.info) "k: %d est-Tani-rate: %.2f accel: %.2f"
            k est_tani_rate (est_tani_rate /. tani_rate);
          (* absolute error of each estimate *)
          let diffs =
            A.map2 (fun d1 d2 -> abs_float (d1 -. d2)) dists est_dists in
          A.iteri (fun i exact_dist ->
              fprintf out "%f %f %f\n" exact_dist est_dists.(i) diffs.(i)
            ) dists;
          (* output maximum, average and median Tani error *)
          let max_error = A.max diffs in
          let avg_error = A.favg diffs in
          let med_error = Utls.list_medianf (A.to_list diffs) in
          Log.info "k: %d error(max, avg, med): %.2f %.2f %.2f"
            k max_error avg_error med_error
        )
    ) ks;
  Utls.run_command "gnuplot -persist tani_est.gpl"
92 | 
93 | let () = main ()
94 | 


--------------------------------------------------------------------------------
/src/wmh_unit_test.ml:
--------------------------------------------------------------------------------
 1 | 
 2 | open Printf
 3 | 
 4 | module A = BatArray
 5 | module CLI = Minicli.CLI
 6 | module Fp = Molenc.Fingerprint
 7 | module FpMol = Molenc.FpMol
 8 | module L = BatList
 9 | module Log = Dolog.Log
10 | module Utls = Molenc.Utls
11 | module WMH = Molenc.WMH
12 | 
(* print "<title>:" followed by all the elements of an int array,
   on a single line of stdout *)
let print_array title a =
  printf "%s:" title;
  A.iter (printf " %d") a;
  printf "\n"
19 | 
(* same as print_array, but each element is printed as "<index>:<value>" *)
let printi_array title a =
  printf "%s:" title;
  A.iteri (printf " %d:%d") a;
  printf "\n"
26 | 
(* Unit test of WMH.hash on two hand-made molecules: print the intermediate
   tables, then check the hash value obtained for the first molecule
   with several hand-chosen sequences of "random" numbers (k = 1). *)
let main () =
  Log.color_on ();
  Log.set_log_level Log.INFO;
  (* hand-made molecules *)
  let molecules =
    L.mapi FpMol.parse_one
      ["m0,0.0,[2:1;3:1]";
       "m1,0.0,[0:2;1:2;2:2;3:2]"] in
  let nb_features = L.max (L.map FpMol.nb_features molecules) in
  printf "nb_features: %d\n" nb_features;
  let sparse_fingerprints = A.of_list (L.map FpMol.get_fp molecules) in
  let bounds = WMH.bounds nb_features sparse_fingerprints in
  print_array "bounds" bounds;
  let idx2feat = WMH.lookup_table bounds in
  printi_array "idx2feat" idx2feat;
  let rand_bound = A.length idx2feat in
  printf "bound: %d\n" rand_bound;
  let feat2acc_bound = WMH.acc_bounds_table bounds in
  print_array "feat2acc_bound" feat2acc_bound;
  let dense_fingerprints = A.map (WMH.to_dense nb_features) sparse_fingerprints in
  (* k = 1: hash the first molecule with a single sequence of numbers
     and return its only hash value *)
  let single_hash rands =
    let hash =
      WMH.hash [|rands|] idx2feat feat2acc_bound dense_fingerprints.(0) in
    hash.(0) in
  (* only last one should hit *)
  assert(single_hash [|0;1;2;3;5;7;4|] = 6);
  (* only last one should hit *)
  assert(single_hash [|0;1;2;3;7;5;6|] = 6);
  (* 1st one should hit *)
  assert(single_hash [|4|] = 0);
  (* 1st one should hit *)
  assert(single_hash [|6|] = 0)
64 | 
65 | let () = main ()
66 | 


--------------------------------------------------------------------------------
/tani_est.gpl:
--------------------------------------------------------------------------------
 1 | 
# scatter plot of estimated Vs exact Tanimoto scores;
# each k_XXX.data file is expected to hold the exact score in column 1
# and the estimated one in column 2

# both axes are restricted to [0:1]
set xrange [0:1]
set yrange [0:1]

set size square

set xlabel 'Exact Tanimoto'
set ylabel 'Estimated Tanimoto'

# the legend goes outside of the plotting area
set key outside

# reference line y = x (where estimate = exact score); plotted without title
f(x) = x

plot f(x) not, \
     'k_010.data' u 1:2 t 'k=10' , \
     'k_020.data' u 1:2 t 'k=20' , \
     'k_030.data' u 1:2 t 'k=30' , \
     'k_040.data' u 1:2 t 'k=40' , \
     'k_050.data' u 1:2 t 'k=50' , \
     'k_500.data' u 1:2 t 'k=500'

# data series currently left out of the plot:
# 'k_100.data' u 1:2 t 'k=100'
# 'k_200.data' u 1:2 t 'k=200'
24 | 


--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #set -x # DEBUG
 4 | 
 5 | # encoding an SDF or a SMILES file is the same
 6 | # and it is the one we expect
 7 | diff <(./bin/molenc_type_atoms.py data/caff_coca.sdf) data/caff_coca_types.ref
 8 | diff <(./bin/molenc_type_atoms.py data/caff_coca.smi) data/caff_coca_types.ref
 9 | 
10 | # ph4 features are the same than the ones extracted by ShowFeats.py
11 | # (that were checked by hand and stored in a reference file)
12 | diff <(./bin/molenc_ph4_type_atoms.py data/caff_coca.sdf) data/caff_coca_feats.ref
13 | 
14 | diff <(_build/default/src/pubchem_decoder.exe -i data/test_in.pbc -o /dev/stdout) data/test_out.ref
15 | 
16 | # atom pairs encoder tests
17 | rm -f data/AP_test.smi.dix data/AP_test.txt # clean any previous run
18 | molenc.sh --pairs -i data/AP_test.smi -o data/AP_test.txt
19 | diff data/AP_test.smi.dix data/AP_test.smi.dix.ref
20 | diff data/AP_test.txt data/AP_test.txt.ref
21 | 


--------------------------------------------------------------------------------
/test_BBAD.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #set -x # DEBUG
 4 | 
 5 | # check some properties of the AP-BBAD
 6 | 
 7 | # 1) the BBAD of a single molecule is the encoded single molecule
 8 | rm -f caffeine_AP_BBAD.txt
 9 | _build/default/src/AP_BBAD.exe -i data/caffeine.smi -o caffeine_AP_BBAD.txt
10 | awk -v sum=0 -F' ' '{sum += $2}END{if(sum == 105){print "|features| OK"}}' caffeine_AP_BBAD.txt
11 | 
12 | # 2) the BBAD computed in parallel is the same as the sequential one
13 | rm -f seq_AD.txt par_AD.txt
14 | _build/default/src/AP_BBAD.exe -i data/chembl1868_std.smi -o seq_AD.txt -np 1
15 | nprocs=`getconf _NPROCESSORS_ONLN`
16 | _build/default/src/AP_BBAD.exe -i data/chembl1868_std.smi -o par_AD.txt -np ${nprocs}
17 | diff seq_AD.txt par_AD.txt
18 | 
19 | # 3) compute a simple BBAD by hand; check this is the one we obtain
20 | rm -f data/alcools.AD.curr
21 | _build/default/src/AP_BBAD.exe -i data/alcools.smi -o data/alcools.AD.curr
22 | diff data/alcools.AD.curr data/alcools.AD.ref
23 | 
24 | # 4) the BBAD of some molecules doesn't filter out any of those molecules
25 | rm -f filtered.txt
26 | _build/default/src/AP_BBAD.exe --bbad seq_AD.txt -i data/chembl1868_std.smi -o filtered.txt -np ${nprocs}
27 | diff <(cat data/chembl1868_std.smi | wc -l) <(cat filtered.txt | wc -l)
28 | 
29 | # 5) the BBAD union for two sets of molecules should be the same as the AD for the union of the sets
30 | rm -f head_AD.txt tail_AD.txt head_tail_AD_union.txt head_tail_AD.txt
31 | _build/default/src/AP_BBAD.exe -i <(head data/chembl1868_std.smi) -o head_AD.txt -np ${nprocs}
32 | _build/default/src/AP_BBAD.exe -i <(tail data/chembl1868_std.smi) -o tail_AD.txt -np ${nprocs}
33 | _build/default/src/AP_BBAD.exe --bbad head_AD.txt,tail_AD.txt -o head_tail_AD_union.txt
34 | _build/default/src/AP_BBAD.exe -i <(head data/chembl1868_std.smi; tail data/chembl1868_std.smi) \
35 |                                -o head_tail_AD.txt
36 | diff head_tail_AD_union.txt head_tail_AD.txt
37 | 


--------------------------------------------------------------------------------
/test_sdf_read.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -x # DEBUG
 4 | 
 5 | # check we parse correctly 10 3D molecules in a .sdf file
 6 | rm -f data/chembl30_10mols.txt.curr
 7 | _build/default/src/sdf_read.exe data/chembl30_10mols.sdf \
 8 |                                 > data/chembl30_10mols.txt.curr
 9 | diff data/chembl30_10mols.txt.ref data/chembl30_10mols.txt.curr
10 | 


--------------------------------------------------------------------------------
/test_uhd.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # regression test for the UHD fingerprint
 4 | make
 5 | 
 6 | # cleanup any prior run
 7 | \rm -f data/ethanol.uhd data/ethanol.smi.dix
 8 | 
 9 | # run
10 | _build/default/src/molenc_UHD.exe -f -i data/ethanol.smi -o data/ethanol.uhd
11 | 
12 | # check Vs refs
13 | diff data/ethanol.uhd     data/ethanol.uhd.ref
14 | diff data/ethanol.smi.dix data/ethanol.uhd.dix.ref
15 | 


--------------------------------------------------------------------------------