├── .gitignore ├── INSTALL.txt ├── LICENSE ├── Makefile ├── README.md ├── TODO ├── bin ├── csv2txt.sh ├── fpscores.pkl.gz ├── inchi2smi.py ├── molenc.sh ├── molenc_HA.py ├── molenc_SA.py ├── molenc_atoms_filter.py ├── molenc_color.py ├── molenc_common.py ├── molenc_deepsmi.py ├── molenc_diam.py ├── molenc_drug.py ├── molenc_elements.py ├── molenc_eln.py ├── molenc_frag.py ├── molenc_frag2smi.py ├── molenc_fscan.py ├── molenc_get_tag.py ├── molenc_gpc.py ├── molenc_gpr.py ├── molenc_histo.py ├── molenc_ifg.py ├── molenc_iupac.smi ├── molenc_lead.py ├── molenc_lean.py ├── molenc_ligprep.sh ├── molenc_linker.py ├── molenc_lizard.py ├── molenc_mol2smi.py ├── molenc_mview.py ├── molenc_nearest.py ├── molenc_padel.py ├── molenc_panascan.py ├── molenc_ph4.py ├── molenc_ph4_type_atoms.py ├── molenc_qed.py ├── molenc_rbonds_filter.py ├── molenc_rdfrag.py ├── molenc_regr_stats.py ├── molenc_rfr.py ├── molenc_rotbond.py ├── molenc_scaffold.py ├── molenc_scan.py ├── molenc_scan.sh ├── molenc_sdf2smi.py ├── molenc_sdf_strip.py ├── molenc_smi2cansmi.py ├── molenc_smi2png.py ├── molenc_smisur.py ├── molenc_stable.py ├── molenc_std.py ├── molenc_thash.py ├── molenc_type_atoms.py ├── molenc_uniq.py ├── rgb_scale.py ├── smi2png.py └── smi2svg.py ├── data ├── 1k_mols_std_01.txt ├── 3.frags ├── 3.mols ├── 3.smi ├── 3.to_frag ├── 3_frags.smi ├── 3_frags.txt ├── 3_genmols.smi ├── 3_genmols.txt ├── 3_genmols_uniq.smi ├── ALDH1_2conf.ph4 ├── AP_test.smi ├── AP_test.smi.dix.ref ├── AP_test.txt.ref ├── alcools.AD.ref ├── alcools.smi ├── caff_coca.sdf ├── caff_coca.smi ├── caff_coca_feats.ref ├── caff_coca_types.ref ├── caffeine.mol2 ├── caffeine.sdf ├── caffeine.smi ├── caffeine_3d.sdf ├── chembl1868_std.AP ├── chembl1868_std.smi ├── chembl30_10mols.sdf ├── chembl30_10mols.txt.ref ├── chembl30_10mols_am1bcc.mol2 ├── chembl_antivirals.frags.smi ├── chembl_antivirals.genmol.smi ├── chembl_antivirals.smi ├── chemical_formulas.txt ├── cisapride.smi ├── co_1conf.sdf ├── cocaine.smi ├── 
ethanol.smi ├── ethanol.uhd.dix.ref ├── ethanol.uhd.ref ├── fda_approved.smi ├── features.txt ├── gen_mols.txt ├── h2o_1conf.sdf ├── merge.txt ├── opio.smi ├── ptable.txt ├── test_HYD_group.sdf ├── test_HYD_group.smi ├── test_in.pbc ├── test_mols.txt └── test_out.ref ├── deepsmi_test.sh ├── doc ├── Ester_KDD_1996_DBSCANclustering.pdf └── Shrivastava_2016_ExactWeightedMinwiseHashing.pdf ├── dune-project ├── fcodec ├── histo.gpl ├── kb_test.sh ├── mol_frag_test.sh ├── molenc.opam ├── molenc_frag ├── rfp ├── smisur_test.sh ├── src ├── AP_BBAD.ml ├── BBAD.ml ├── MSE_mol.ml ├── MST.ml ├── WMH.ml ├── ap_encoder.ml ├── ap_types.ml ├── atom_env.ml ├── atom_pair.ml ├── bloom.ml ├── bond.ml ├── butina.ml ├── decoder.ml ├── dsmi.ml ├── dune ├── encoder.ml ├── filter.ml ├── finder.ml ├── fingerprint.ml ├── formula.ml ├── fpMol.ml ├── fp_test.ml ├── fragmentable_mol.ml ├── gen_bindings.sh ├── get_mol.ml ├── gnuplot.ml ├── gram.ml ├── index.ml ├── indexer.ml ├── intSet.ml ├── lean.ml ├── lig_box.ml ├── merge.ml ├── mini_mol.ml ├── mol2.ml ├── molenc_AP.ml ├── molenc_UHD.ml ├── molenc_fcodec.ml ├── myList.ml ├── node.ml ├── norm.ml ├── palette.ml ├── pareto.ml ├── ph4.ml ├── ph4_atom.ml ├── piEltHA.ml ├── prune.ml ├── ptable.ml ├── pubchem_decoder.ml ├── rank.ml ├── rdkit.ml ├── rdkit_wrapper.py ├── rdkit_wrapper_specs.txt ├── scale.ml ├── sdf.ml ├── sdf_3D.ml ├── sdf_read.ml ├── shannon.ml ├── shuf.ml ├── smi.ml ├── split.ml ├── syb_atom.ml ├── sybyl.ml ├── test_RS.ml ├── to_dense.ml ├── uniq.ml ├── utls.ml ├── wmh_bench.ml ├── wmh_test.ml └── wmh_unit_test.ml ├── tani_est.gpl ├── test.sh ├── test_BBAD.sh ├── test_sdf_read.sh └── test_uhd.sh /.gitignore: -------------------------------------------------------------------------------- 1 | molenc.install 2 | src/.merlin 3 | data/*.svg 4 | -------------------------------------------------------------------------------- /INSTALL.txt: -------------------------------------------------------------------------------- 1 | Molenc install 
guide 2 | ==================== 3 | 4 | Author: Francois Berenger 5 | Date: 6th July 2022 6 | 7 | Example installation instructions on a fresh Debian 11.3 system. 8 | On Ubuntu Linux, installation should be very similar. 9 | 10 | On Mac computers, this software has worked in the past, but 11 | installation is a pain; hence we no longer maintain 12 | nor recommend this setup. 13 | 14 | The Bash shell is assumed for all commands. 15 | 16 | Sudo rights are assumed for the user performing the installation. 17 | 18 | I) Install system-wide packages 19 | ------------------------------- 20 | 21 | $ sudo apt install git opam python3-pip python3-numpy 22 | 23 | II) Configure the OCaml package manager 24 | --------------------------------------- 25 | 26 | $ opam init -y 27 | $ eval `opam config env` # path setup for ocaml executables 28 | # might be needed in your ~/.bashrc 29 | 30 | III) Install OCaml packages 31 | --------------------------- 32 | 33 | $ opam depext -i molenc # this will also install rdkit system-wide 34 | 35 | IV) Install user-space packages 36 | ------------------------------- 37 | 38 | $ pip3 install six # required by chemo-standardizer 39 | $ pip3 install chemo-standardizer # requires system-wide rdkit 40 | 41 | V) Tests 42 | -------- 43 | 44 | Test the molecular standardiser is correctly installed. 45 | It is used by molenc in case molecules need to be standardized. 
46 | 47 | $ standardiser -h 48 | 49 | If not, it may be missing from PATH: 50 | 51 | $ export PATH=$PATH:~/.local/bin # might be needed in your ~/.bashrc 52 | $ standardiser -h # test again 53 | 54 | VI) Encode some molecules 55 | ------------------------- 56 | 57 | Get some molecules in the SMILES format: 58 | 59 | $ wget https://raw.githubusercontent.com/UnixJunkie/molenc/master/data/chembl_antivirals.smi -O antivirals.smi 60 | 61 | Encode those molecules using counted atom pairs fingerprint: 62 | 63 | $ molenc.sh --pairs -i antivirals.smi -o antivirals_std.AP 64 | 65 | Look at what was obtained: 66 | $ head -1 antivirals_std.AP 67 | CHEMBL807,0.0,[2:6;8:1;15:3;25:12;26:2;70:3;93:3;372:6;393:6;407:1;412:2;453:3;466:2;524:9;917:9;1095:3;1742:1;1776:3;2063:3;2576:4;2646:1;4428:3;5906:2;5916:1;6005:2] 68 | 69 | VII) Encode more molecules with an existing encoding dictionary 70 | --------------------------------------------------------------- 71 | 72 | Let's say we want to encode some new molecules using an existing encoding dictionary 73 | (a dictionary was created in the previous step for antivirals.smi). 74 | In the real world, you might want the encoding dictionary to cover the whole ChEMBL database 75 | (or your company's whole compound collection), so that the dictionary is exhaustive enough. 76 | 77 | In the following, you need to replace MY_MOLECULES.smi with the SMILES file of your choice. 78 | 79 | $ molenc.sh --pairs -d antivirals.smi.dix -i MY_MOLECULES.smi -o MY_MOLECULES_std.AP 80 | 81 | Concluding remarks 82 | ------------------ 83 | 84 | Molenc is a research software prototype. 85 | As such, it might be a little difficult to install and under-documented. 86 | Such is the fate of research by-products. 87 | Don't hesitate to contact the author in case you cannot install the software, 88 | find any bug or encounter some problems while using it. 
89 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, Francois BERENGER 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# None of the targets below produce a file of the same name, so all of them
# are declared phony; otherwise a stray file called e.g. "clean" or "edit"
# would make the corresponding target silently do nothing.
.PHONY: build clean edit install uninstall reinstall test

# build with as many parallel jobs as there are online processors
build:
	dune build @install -j `getconf _NPROCESSORS_ONLN`

clean:
	rm -rf _build

# open the OCaml sources and project notes in a background emacs
edit:
	emacs src/*.ml TODO commands.sh &

install: build
	dune install

uninstall:
	dune uninstall

reinstall: uninstall install

# force a fresh build of the fingerprint test executable, then run it
test:
	rm -f _build/default/src/fp_test.exe
	dune build src/fp_test.exe
	_build/default/src/fp_test.exe
26 | 27 | # How to install the software 28 | 29 | For beginners/non opam users: 30 | download and execute the latest self-installer 31 | shell script from (https://github.com/UnixJunkie/molenc/releases). 32 | 33 | Then execute: 34 | ``` 35 | ./molenc-5.0.1.sh ~/usr/molenc-5.0.1 36 | ``` 37 | 38 | This will create ~/usr/molenc-5.0.1/bin/molenc.sh, among other things 39 | inside the same directory. 40 | 41 | For opam users: 42 | ``` 43 | opam install molenc 44 | ``` 45 | 46 | Do not hesitate to contact the author in case you have problems installing 47 | or using the software or if you have any question. 48 | 49 | # Usage 50 | 51 | ``` 52 | molenc.sh -i input.smi -o output.txt 53 | [-d encoding.dix]: reuse existing feature dictionary 54 | [-r i:j]: fingerprint radius (default=0:1) 55 | [--pairs]: use atom pairs instead of Faulon's FP 56 | [-m ]: maximum allowed atom-pair distance 57 | (default: no limit) 58 | [--seq]: sequential mode (disable parallelization) 59 | [-v]: debug mode; keep temp files 60 | [-n ]: max jobs in parallel 61 | [-c ]: chunk size 62 | [--no-std]: don't standardize input file molecules 63 | ONLY USE IF THEY HAVE ALREADY BEEN STANDARDIZED 64 | ``` 65 | 66 | How to encode a database of molecules: 67 | 68 | ``` 69 | molenc.sh -i molecules.smi -o molecules.txt 70 | 71 | ``` 72 | 73 | How to encode another database of molecules, but reusing the feature 74 | dictionary from another database: 75 | 76 | ``` 77 | molenc.sh -i other_molecules.smi -o other_molecules.txt -d molecules.txt.dix 78 | ``` 79 | 80 | # Bibliography 81 | 82 | [1] Faulon, J. L., Visco, D. P., & Pophale, R. S. (2003). The signature molecular descriptor. 1. Using extended valence sequences in QSAR and QSPR studies. Journal of chemical information and computer sciences, 43(3), 707-720. 83 | 84 | [2] Carhart, R. E., Smith, D. H., & Venkataraghavan, R. (1985). Atom pairs as molecular features in structure-activity studies: definition and applications. 
#!/bin/bash

# csv format output by molenc_lizard.py to txt format expected by
# several molenc tools:
#   field1,field2,field3,...,field9 -> field1,field2,[0:field3;1:field4;...;6:field9]
#
# usage: csv2txt.sh [FILE...]
# "$@" is quoted so that file names containing spaces or glob characters
# are passed to awk untouched; without any argument awk reads stdin.

awk -F',' \
    '{print $1","$2",[0:"$3";1:"$4";2:"$5";3:"$6";4:"$7";5:"$8";6:"$9"]"}' "$@"
def RobustSmilesMolSupplier(filename):
    """Yield one (molecule, name) pair per molecule line of a SMILES file.

    Each non-blank line must start with a SMILES string, optionally followed
    by a whitespace-separated molecule name.  The SMILES is parsed with
    sanitize=False; the yielded molecule may be None, which the caller must
    check for.

    Blank lines are skipped instead of raising IndexError.  A line without
    a name uses its SMILES string as the name, so that the molecule is
    still reported.
    """
    with open(filename) as f:
        for line in f:
            words = line.split()
            if not words:
                # blank line: nothing to parse
                continue
            smile = words[0]
            name = words[1] if len(words) > 1 else smile
            yield (Chem.MolFromSmiles(smile, sanitize=False), name)
# raw string: '\s' in a plain literal is an invalid escape sequence
regex = re.compile(r'\s')

def find_whitespace(s):
    """Return the index of the first whitespace character in s, or -1."""
    m = regex.search(s)
    return -1 if m is None else m.start()

def parse_smiles_line(line):
    """Parse the SMILES at the start of line into an RDKit molecule.

    The SMILES is everything up to the first whitespace; if there is no
    whitespace separator, the molecule is assumed to be unnamed and the
    whole line is the SMILES.  The name is not needed here: the caller
    writes out the original line of each molecule it keeps.
    Returns whatever Chem.MolFromSmiles returns for that SMILES.
    """
    fst_white = find_whitespace(line)
    smi = line if fst_white == -1 else line[:fst_white]
    return Chem.MolFromSmiles(smi)

def parse_atoms_list(line):
    """Turn a comma-separated list of element symbols into a set.

    Whitespace around each symbol is ignored and empty items are dropped,
    so "C, N,O" gives {"C", "N", "O"}.
    """
    return {sym.strip() for sym in line.split(',') if sym.strip()}

def atoms_filter(allowed_atoms, mol):
    """Return True iff every atom of mol has its symbol in allowed_atoms.

    A molecule that could not be parsed (mol is None) is rejected, instead
    of crashing the whole run with an AttributeError.
    """
    if mol is None:
        return False
    return all(a.GetSymbol() in allowed_atoms for a in mol.GetAtoms())
def RobustSmilesMolSupplier(filename):
    """Yield one (name, molecule) pair per molecule line of a SMILES file.

    The first whitespace-separated token of a line is the SMILES string;
    everything after it, re-joined with single spaces, is the name.
    Blank lines are skipped instead of raising IndexError.
    """
    with open(filename) as f:
        for line in f:
            words = line.split()
            if not words:
                # blank line: nothing to parse
                continue
            smile = words[0]
            name = " ".join(words[1:]) # everything after the SMILES string
            yield (name, Chem.MolFromSmiles(smile))
long_name.split('_')[0] 38 | line = delta_file.readline() 39 | words = line.split() 40 | curr_name = words[0] 41 | if curr_name != name: 42 | print("names differ: %s != %s" % (name, curr_name)) 43 | exit(1) 44 | delta_strings = words[1:] 45 | nb_deltas = len(delta_strings) 46 | nb_atoms = mol.GetNumAtoms() 47 | assert(nb_deltas == nb_atoms) 48 | deltas = list(map(lambda x: float(x), delta_strings)) 49 | rdDepictor.Compute2DCoords(mol) # 2D conformer for figure 50 | # compute similarity map weights 51 | weights = [] 52 | for delta in deltas: 53 | # run-time check that delta is not too high or delta_max too small 54 | assert(delta <= delta_max) 55 | weight = delta / delta_max 56 | weights.append(weight) 57 | sim_map = Draw.SimilarityMaps.\ 58 | GetSimilarityMapFromWeights(mol, weights, size = (200,200), 59 | options=drawOptions, 60 | scale=50.0) 61 | # the bbox param forces centering the molecule in the figure 62 | sim_map.savefig(name + '.svg', bbox_inches = 'tight') 63 | plot.close(sim_map) 64 | count += 1 65 | print('processed: %d\r' % count, end='') 66 | print('processed: %d' % count) 67 | delta_file.close() 68 | -------------------------------------------------------------------------------- /bin/molenc_deepsmi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright (C) 2021, Francois Berenger 4 | # Tsuda laboratory, Tokyo University, Japan. 5 | 6 | # DeepSMILES encoder/decoder from/to SMILES 7 | # 8 | # "DeepSMILES: An adaptation of SMILES for use in machine-learning of 9 | # chemical structures". Noel M. O’Boyle and Andrew Dalke. ChemRxiv (2018). 
def decode(converter, deep_smi):
    """Decode a DeepSMILES string into a canonical SMILES string.

    converter: object whose decode() method turns DeepSMILES into SMILES.
    deep_smi: the DeepSMILES string to decode.

    Returns the canonical SMILES, or None (after a message on stderr) if
    either the DeepSMILES cannot be decoded or rdkit cannot parse the
    decoded SMILES.
    """
    try:
        smi = converter.decode(deep_smi)
    except deepsmiles.DecodeError as e:
        print("molenc_deepsmi.py: decode: '%s'" % e.message,
              file = sys.stderr)
        return None
    # currently, the decoder does not output a canonical SMILES
    # https://github.com/baoilleach/deepsmiles/issues/19
    # I want canonical SMILES, because this is rdkit's default
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # a decodable DeepSMILES is not necessarily a valid molecule;
        # do not hand None over to MolToSmiles
        print("molenc_deepsmi.py: decode: rdkit could not parse '%s'" % smi,
              file = sys.stderr)
        return None
    return Chem.MolToSmiles(mol)
sys.exit(1) 69 | args = parser.parse_args() 70 | input_fn = args.input_fn 71 | output = open(args.output_fn, 'w') 72 | rings = args.rings 73 | branches = args.branches 74 | do_encode = args.do_encode 75 | do_decode = args.do_decode 76 | if do_decode: 77 | do_encode = False 78 | assert(not (do_encode and do_decode)) 79 | if not (rings or branches): 80 | print("use at least --no-rings or --no-branches", 81 | file=sys.stderr) 82 | sys.exit(1) 83 | count = 0 84 | # work ---------------------------------------------- 85 | smi_supplier = RobustSmilesSupplier(input_fn) 86 | converter = deepsmiles.Converter(rings, branches) 87 | if do_encode: 88 | 89 | for smi, name in smi_supplier: 90 | deep_smi = encode(converter, smi) 91 | print("%s\t%s" % (deep_smi, name), file=output) 92 | count += 1 93 | else: # decode 94 | for deep_smi, name in smi_supplier: 95 | smi = decode(converter, deep_smi) 96 | if smi != None: 97 | print("%s\t%s" % (smi, name), file=output) 98 | count += 1 99 | after = time.time() 100 | dt = after - before 101 | print("%d molecules at %.2f mol/s" % (count, count / dt), file=sys.stderr) 102 | output.close() 103 | -------------------------------------------------------------------------------- /bin/molenc_diam.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # Copyright (C) 2022, Francois Berenger 4 | # Tsuda laboratory, Tokyo University, 5 | # 5-1-5 Kashiwa-no-ha, Kashiwa-shi, Chiba-ken, 277-8561, Japan. 6 | # 7 | # Compute the diamater of a molecule's 3D conformer 8 | # i.e. 
def euclid(xyz0, xyz1):
    """Return the Euclidean distance between two (x, y, z) points."""
    x0, y0, z0 = xyz0
    x1, y1, z1 = xyz1
    dx = x0 - x1
    dy = y0 - y1
    dz = z0 - z1
    return math.sqrt(dx*dx + dy*dy + dz*dz)

# WARNING: O(n^2)
def diameter(mol):
    """Return the largest interatomic distance in conformer 0 of mol.

    Returns 0.0 for a molecule with fewer than two atoms.
    """
    num_atoms = mol.GetNumAtoms()
    conf = mol.GetConformer(0)
    # fetch each atom position only once, not once per pair
    positions = [conf.GetAtomPosition(i) for i in range(num_atoms)]
    diam = 0.0
    for i, xyz_i in enumerate(positions):
        for xyz_j in positions[i + 1:]:
            dist = euclid(xyz_i, xyz_j)
            if dist > diam:
                diam = dist
    return diam

if __name__ == '__main__':
    # CLI options parsing
    parser = argparse.ArgumentParser(description =
                                     "compute molecular diameter")
    parser.add_argument("-i", metavar = "input.sdf", dest = "input_fn",
                        help = "3D conformer input file "
                        "(single molecule AND conformer)")
    # parse CLI ---------------------------------------------------------------
    if len(sys.argv) == 1:
        # user has no clue of what to do -> usage
        parser.print_help(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()
    input_fn = args.input_fn
    # parse CLI end -----------------------------------------------------------
    # The input must hold exactly one readable molecule.  This is checked
    # explicitly and up front: the former "count > 1" test let a second
    # molecule through, and an assert disappears under "python -O".
    mols = list(Chem.SDMolSupplier(input_fn))
    if any(mol is None for mol in mols):
        sys.exit("molenc_diam.py: rdkit could not read a molecule from %s" %
                 input_fn)
    if len(mols) != 1:
        sys.exit("molenc_diam.py: expected a single molecule in %s, got %d" %
                 (input_fn, len(mols)))
    diam = diameter(mols[0])
    print("%f" % diam)
# Tran-Nguyen, V. K., Jacquemard, C., & Rognan, D. (2020).
# LIT-PCBA: An unbiased data set for machine learning and virtual screening.
# Journal of chemical information and modeling, 60(9), 4263-4273.
def drug_like_filter(mol):
    """Return True iff mol passes every drug-likeness criterion below.

    The criteria are tested in order and testing stops at the first one
    that fails, so later descriptors are not computed for rejected molecules.
    """
    def outside(value, low, high):
        # True unless low < value < high
        return value <= low or value >= high

    def too_many(value, limit):
        # True unless value < limit
        return value >= limit

    # each rejection test is wrapped in a lambda so that its descriptor is
    # only computed if all previous tests let the molecule through
    rejections = (
        lambda: outside(Descriptors.MolWt(mol), 150, 800),          # 150 < MolW < 800 Da
        lambda: outside(Descriptors.MolLogP(mol), -3.0, 5.0),       # −3.0 < AlogP < 5.0
        lambda: too_many(Descriptors.NumRotatableBonds(mol), 15),   # RotB < 15
        lambda: too_many(Descriptors.NumHAcceptors(mol), 10),       # HBA < 10
        lambda: too_many(Descriptors.NumHDonors(mol), 10),          # HBD < 10
        lambda: outside(Chem.rdmolops.GetFormalCharge(mol), -2, 2), # −2.0 < FC < 2.0
    )
    # not rejected by any criterion? Drug-like then!
    return not any(reject() for reject in rejections)
def RobustSmilesMolSupplier(filename):
    """Yield (mol, smile, name) for each usable line of a SMILES file.

    Lines must be exactly "SMILES<TAB>name".  Blank lines are skipped
    silently.  Any other line that is not made of two TAB-separated fields,
    or whose SMILES rdkit cannot turn into a molecule, is reported on
    stderr and skipped, so that the caller never receives a None molecule.
    """
    with open(filename) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue
            fields = stripped.split("\t") # enforce TAB-separated
            mol = None
            if len(fields) == 2:
                smile, name = fields
                try:
                    # rdkit signals an unparsable SMILES by returning None
                    # rather than by raising; both cases end up reported
                    mol = Chem.MolFromSmiles(smile)
                except Exception:
                    mol = None
            if mol is None:
                print("ERROR: cannot parse: %s" % line,
                      file=sys.stderr, end='')
                continue
            yield (mol, smile, name)
if __name__ == '__main__':
    # CLI options parsing
    parser = argparse.ArgumentParser(
        description = "fluorine scan: for each heavy atom bearing hydrogens, "
        "output the analog where those hydrogens are replaced by F")
    parser.add_argument("-i", metavar = "input.smi", dest = "input_fn",
                        help = "molecules input file")
    parser.add_argument("-o", metavar = "output.smi", dest = "output_fn",
                        help = "molecules output file")
    parser.add_argument('--hetero', dest='only_heteroatoms', action='store_true',
                        help = "only scan heteroatoms")
    # parse CLI
    if len(sys.argv) == 1:
        # show help in case user has no clue of what to do
        parser.print_help(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()
    input_fn = args.input_fn
    output_fn = args.output_fn
    only_hetero = args.only_heteroatoms
    with open(output_fn, 'w') as out:
        for smi, name in RobustSmilesMolSupplier(input_fn):
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                # report and skip, rather than crash in AddHs below
                print("rdkit could not parse: %s" % name, file=sys.stderr)
                continue
            # output original molecule first
            print("%s\t%s" % (smi, name), file=out)
            mol = Chem.AddHs(mol)
            # then output its fluorinated analogs
            count = 1
            for a in mol.GetAtoms():
                anum = a.GetAtomicNum()
                if anum <= 1 or (only_hetero and anum == 6):
                    # hydrogen, or carbon while only scanning heteroatoms
                    continue
                if a.GetTotalNumHs(includeNeighbors=True) < 1:
                    # no hydrogen attached: nothing to replace
                    continue
                # each analog starts again from the unmodified molecule
                editable = Chem.EditableMol(mol)
                for neighb in a.GetNeighbors():
                    if neighb.GetAtomicNum() == 1:
                        # Fluorine instead
                        editable.ReplaceAtom(neighb.GetIdx(), fluor)
                analog_smi = Chem.MolToSmiles(editable.GetMol())
                print("%s\t%s_%d" % (analog_smi, name, count), file=out)
                count += 1
def get_prop_default(m, prop, def_val):
    """Return m.GetProp(prop), or def_val if that lookup raises KeyError."""
    try:
        return m.GetProp(prop)
    except KeyError:
        return def_val


def get_props(m, tags):
    """Return the values of all `tags` read from `m`, in the order given.

    A tag for which m.GetProp raises KeyError contributes an empty string.
    """
    return [get_prop_default(m, tag, '') for tag in tags]
#!/usr/bin/env python3
#
# usage: ./histo.py FILE:str STEPS:int
#        0          1        2
#
# Read one float per line from FILE and print a histogram on stdout,
# one "bin_start count" pair per line; the bin width is (max - min) / STEPS.

import sys

input_fn = sys.argv[1]
# kept as a float, so that the bin width below is a float division
num_steps = float(int(sys.argv[2]))
if num_steps <= 0:
    # would otherwise divide by zero (or produce a negative bin width)
    sys.exit('ERROR: STEPS must be a strictly positive integer')

# read in values; the file is closed as soon as it has been read,
# and blank lines are ignored instead of crashing float()
with open(input_fn) as f:
    floats = [float(line) for line in f if line.strip()]
if not floats:
    sys.exit('ERROR: no value in %s' % input_fn)

min_val = min(floats)
max_val = max(floats)
if not min_val < max_val:
    sys.exit('ERROR: all values are equal: a histogram needs min < max')
delta = (max_val - min_val) / num_steps
print('DEBUG: min:%g max:%g steps:%d delta:%g' %
      (min_val, max_val, num_steps, delta), file=sys.stderr)

# initialize the histogram: bins are created up to one step past max_val,
# so that the bin holding max_val itself always exists
histo = {}
x = min_val
i = 0
while x <= max_val + delta:
    histo[i] = 0
    x += delta
    i += 1

# finalize the histogram
for x in floats:
    hist_bin = int((x - min_val) / delta)
    histo[hist_bin] += 1

# print out histogram; histo was filled for i = 0, 1, 2, ... in this order,
# so iterating over it visits exactly the bins created above
for i, y_val in histo.items():
    x_val = min_val + float(i) * delta
    print('%f %d' % (x_val, y_val))
def merge(mol, marked, aset):
    """Grow `aset` in place with every marked atom connected to it.

    Starting from the atom indexes in `aset`, repeatedly follow neighbors
    (mol.GetAtomWithIdx(i).GetNeighbors()) whose index is in `marked`.
    Each index reached this way is removed from `marked` and added to
    `aset`. Both sets are modified in place; nothing is returned.

    Implemented as an iterative breadth-first expansion, so the size of
    the group is not bounded by the interpreter's recursion limit.
    """
    frontier = set(aset)
    while frontier:
        reached = set()
        for idx in frontier:
            atom = mol.GetAtomWithIdx(idx)
            for nbr in atom.GetNeighbors():
                jdx = nbr.GetIdx()
                if jdx in marked:
                    marked.remove(jdx)
                    reached.add(jdx)
        aset.update(reached)
        # only the newly reached atoms can still have unvisited neighbors
        frontier = reached
def main():
    """Print "<name> <number of functional groups>" for each input molecule.

    The input SMILES file is the first command-line argument; without it,
    a usage line is printed on stderr and the process exits with status 1.
    A molecule whose SMILES could not be parsed (the supplier yields None
    for it) is reported on stderr and skipped.
    """
    if len(sys.argv) == 1:
        print('usage: ifg.py input.smi', file=sys.stderr)
        # sys.exit: the bare exit() builtin is only installed by the site module
        sys.exit(1)
    input_fn = sys.argv[1]
    for name, mol in RobustSmilesMolSupplier(input_fn):
        if mol is None:
            print('ERROR: cannot parse SMILES of: %s' % name, file=sys.stderr)
            continue
        fgs = identify_functional_groups(mol)
        print("%s %d" % (name, len(fgs)))
# Oprea's lead-like filter
# Hann, M. M., & Oprea, T. I. (2004).
# Pursuing the leadlikeness concept in pharmaceutical research.
# Current opinion in chemical biology, 8(3), 255-263.
def lead_like(mol):
    """Return True if `mol` passes all the lead-likeness thresholds below.

    Checks run in this order and stop at the first failure:
    MolW <= 460; -4.0 <= LogP <= 4.2; rotB <= 10; nRings <= 4
    (number of SSSR rings, _not_ aromatic rings); HBD <= 5; HBA <= 9.
    The LogSw >= -5 criterion is ignored.
    """
    if Descriptors.MolWt(mol) > 460:
        return False
    log_p = Descriptors.MolLogP(mol)
    if log_p < -4.0 or log_p > 4.2:
        return False
    # (counting function, maximum allowed value); evaluated lazily, in order
    upper_bounds = (
        (Descriptors.NumRotatableBonds, 10),
        (lambda m: len(Chem.GetSSSR(m)), 4),
        (Descriptors.NumHDonors, 5),
        (Descriptors.NumHAcceptors, 9),
    )
    # lead-like only if no count exceeds its bound
    return not any(count(mol) > limit for count, limit in upper_bounds)
def RobustSmilesMolSupplier(filename):
    """Yield (mol, smile, name) for each parsable line of a SMILES file.

    Each line must be exactly "SMILES<TAB>name"; any other field count
    makes the unpacking below raise ValueError, as before.

    A SMILES that cannot be turned into a molecule is reported on stderr
    and skipped. Chem.MolFromSmiles signals a parse failure by returning
    None, so that case is tested explicitly in addition to exceptions.
    """
    with open(filename) as f:
        for line in f:
            smile, name = line.strip().split("\t") # enforce TAB-separated
            try:
                mol = Chem.MolFromSmiles(smile)
            except Exception:
                mol = None
            if mol is None:
                print("ERROR: cannot parse: %s" % line,
                      file=sys.stderr, end='')
                continue
            yield (mol, smile, name)
# create a fake molecule for the corresp. fragment
def read_one_molecule(input):
    """Read the next molecule from the open text stream `input`.

    Layout consumed, one line at a time: an atoms header, that many atom
    lines, a bonds header, that many bond lines. Parsing of each line is
    delegated to the molenc_common helpers.

    Returns the pair (SMILES, name).
    Raises common.End_of_file when the atoms header line is empty.
    """
    header = input.readline().strip()
    if header == '':
        raise common.End_of_file # no EOF in Python...
    nb_atoms, name = common.read_atoms_header(header)
    rw_mol = Chem.RWMol()
    # atom index in the input file --> atom index in rw_mol
    new_index = {}
    for _ in range(nb_atoms):
        (index, _nb_pi, atomic_num, _nb_HA, charge, stereo) = \
            common.read_atom(input.readline().strip())
        atom = Chem.Atom(atomic_num)
        atom.SetFormalCharge(charge)
        if stereo > 0: # set chirality
            atom.SetChiralTag(common.atom_stereo_code_to_chiral_tag(stereo))
        new_index[index] = rw_mol.AddAtom(atom)
    nb_bonds = common.read_bonds_header(input.readline().strip())
    # stereo bonds are only annotated once all atoms and bonds exist
    pending_stereo = []
    for _ in range(nb_bonds):
        (start_i, bond_type, stop_i, (bond_stereo, c, d)) = \
            common.read_bond(input.readline().strip())
        begin = new_index[start_i]
        end = new_index[stop_i]
        added = rw_mol.AddBond(begin, end, bond_type)
        if bond_stereo != rdkit.Chem.rdchem.BondStereo.STEREONONE:
            # AddBond's return value minus one is used as the new bond's index
            bond_idx = added - 1
            # stereo atoms also need their indexes converted
            pending_stereo.append(
                (bond_idx, bond_stereo, new_index[c], new_index[d]))
    for (bond_idx, bond_stereo, a, b) in pending_stereo:
        bond = rw_mol.GetBondWithIdx(bond_idx)
        bond.SetStereo(bond_stereo)
        bond.SetStereoAtoms(a, b)
        print('%s stereo %s on bond %d (%d, %d)' %
              (name, common.char_of_bond_stereo(bond_stereo), bond_idx, a, b),
              file=sys.stderr)
    try:
        Chem.SanitizeMol(rw_mol)
        Chem.AssignStereochemistry(rw_mol) # ! MANDATORY; AFTER SanitizeMol !
    except rdkit.Chem.rdchem.KekulizeException:
        # best effort: the SMILES is still written out below
        print("KekulizeException in %s" % name, file=sys.stderr)
    return (Chem.MolToSmiles(rw_mol), name)
def positional_analog_scan(mol, smarts_patt = '[cH]',
                           smi_substs = ('N','CF','CC','CO',
                                         'CCN','CCl','CC(F)(F)(F)','COC')):
    """Return the de-duplicated canonical SMILES of the analogs of `mol`.

    For each substituent SMILES in `smi_substs`, the analogs are whatever
    AllChem.ReplaceSubstructs(mol, pattern, substituent) returns, with the
    pattern built from `smarts_patt`. Each analog is converted to SMILES by
    Chem.MolToSmiles; the first occurrence of each SMILES is kept, in
    order of appearance.

    `smi_substs` may be any iterable of SMILES strings; the default is an
    immutable tuple so that it cannot be altered between calls.

    Raises ValueError if `smarts_patt` or one of the substituents cannot
    be parsed (the RDKit parsers return None in that case).
    """
    patt = Chem.MolFromSmarts(smarts_patt)
    if patt is None:
        raise ValueError("invalid SMARTS pattern: %s" % smarts_patt)
    res = []
    seen = set() # SMILES already output
    for smi in smi_substs:
        subst = Chem.MolFromSmiles(smi)
        if subst is None:
            raise ValueError("invalid substituent SMILES: %s" % smi)
        for analog in AllChem.ReplaceSubstructs(mol, patt, subst):
            analog_smi = Chem.MolToSmiles(analog) # canonicalization
            # remove duplicates
            if analog_smi not in seen:
                seen.add(analog_smi)
                res.append(analog_smi)
    return res
args.rand_one 65 | output = open(args.output_fn, 'w') 66 | count = 0 67 | # work ---------------------------------------------- 68 | mol_supplier = RobustSmilesMolSupplier(input_fn) 69 | for name, mol in mol_supplier: 70 | analogs = positional_analog_scan(mol) 71 | if rand_one: 72 | l = list(analogs) 73 | ana_smi = random.choice(l) 74 | print("%s\t%s_ANA%03d" % (ana_smi, name, 0), 75 | file=output) 76 | else: # print them all 77 | for i, ana_smi in enumerate(analogs): 78 | print("%s\t%s_ANA%03d" % (ana_smi, name, i), 79 | file=output) 80 | count += 1 81 | after = time.time() 82 | dt = after - before 83 | print("%d molecules at %.2f mol/s" % (count, count / dt), file=sys.stderr) 84 | output.close() 85 | -------------------------------------------------------------------------------- /bin/molenc_qed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # Copyright (C) 2023, Francois Berenger 4 | # Tsuda laboratory, The University of Tokyo, 5 | # 5-1-5 Kashiwa-no-ha, Kashiwa-shi, Chiba-ken, 277-8561, Japan. 6 | # 7 | # Compute QED for each input SMILES 8 | # scores about 300 molecule/s on a single core 9 | # 10 | # requires: installing qed from sources (pip3 package broken as of 23/01/2023) 11 | # cf. 
def main():
    """Score each input molecule with QED; write "name<TAB>score" lines.

    Command line: -i input SMILES file, -o output file. Without any
    argument, the help is printed on stderr and the process exits with
    status 1. Molecules for which the supplier yields None are counted as
    errors and not scored. A summary is printed on stderr: the number of
    molecules read (scored + errors) and the number of errors.
    """
    # CLI options parsing
    parser = argparse.ArgumentParser(
        description = "Compute Quantitative Estimate of Drug-likeness (QED)")
    parser.add_argument("-i", metavar = "input_smi", dest = "input_smi",
                        help = "input SMILES file")
    parser.add_argument("-o", metavar = "output_tsv", dest = "output_tsv",
                        help = "output TSV file")
    # parse CLI
    if len(sys.argv) == 1:
        # show help in case user has no clue of what to do
        parser.print_help(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()
    input_smi = args.input_smi
    output_tsv = args.output_tsv
    out_count = 0
    error_count = 0
    with open(output_tsv, 'w') as out_file:
        for _i, mol, name in RobustSmilesMolSupplier(input_smi):
            if mol is None:
                error_count += 1
            else:
                score = qed.default(mol, False)
                print("%s\t%f" % (name, score), file=out_file)
                out_count += 1
    # "read" is everything seen in the input, parsable or not
    total_count = out_count + error_count
    print("read: %d errors: %d" % (total_count, error_count),
          file=sys.stderr)
def rbonds_filter(max_rbonds, mol):
    """Return True if `mol` is allowed by the rotatable bonds limit.

    A negative `max_rbonds` means "no limit" (the command line default is
    -1, documented as NO_LIMIT): every molecule is accepted and the
    descriptor is not even computed. Otherwise, the molecule is accepted
    if Descriptors.NumRotatableBonds(mol) <= max_rbonds.
    """
    if max_rbonds < 0:
        # without this, the default -1 would reject every single molecule
        return True
    return Descriptors.NumRotatableBonds(mol) <= max_rbonds
def read_pair(line):
    """Return (x, y): the first two blank-separated fields of `line` as floats.

    Any field after the second one is ignored.
    """
    fields = line.split()
    return (float(fields[0]), float(fields[1]))
def find_terminal_atoms(mol):
    """Return the list of atoms of `mol` which have exactly one bond.

    Atoms are returned in the order given by mol.GetAtoms().
    """
    return [atom for atom in mol.GetAtoms() if len(atom.GetBonds()) == 1]
def RobustSmilesMolSupplier(filename):
    """Yield a (name, SMILES) pair for each non-blank line of a SMILES file.

    The first whitespace-separated field is the SMILES string; the name is
    everything after it, with fields re-joined by single spaces (so the
    name is '' when the line only holds a SMILES).

    Blank (or whitespace-only) lines are skipped instead of raising
    IndexError.
    """
    with open(filename) as f:
        for line in f:
            words = line.split()
            if not words:
                continue
            smile = words[0]
            name = " ".join(words[1:]) # everything after the SMILES string
            yield (name, smile)
#!/bin/bash

# Wildcard-scan encoding pipeline:
# standardize the input molecules, generate their wildcard-atom variants,
# type atoms, then encode and decode using the given features dictionary.

#set -x # DEBUG

# check params
if [ "$#" -ne 3 ]; then
    #            0              1         2          3
    echo "usage: molenc_scan.sh input.smi output.txt features.dix"
    exit 1
fi

input="$1"
output="$2"
dico="$3"

# tell user how to install standardiser if not here;
# stdout is redirected first, then stderr onto it, so that nothing from
# 'which' reaches the terminal; checked before any temporary file exists
if ! which standardiser > /dev/null 2>&1; then
    echo 'ERROR: type: pip3 install chemo-standardizer' >&2
    exit 1
fi

std_log="${input}.std_log"
tmp="$(mktemp)"
tmp_smi="${tmp}_std.smi"
tmp_scan="${tmp}_scan.smi"
tmp_types="${tmp}_std.types"
tmp_enc="${tmp}_std.enc"

echo standardizing molecules...
(standardiser -i "$input" -o "$tmp_smi" 2>&1) > "$std_log"
echo wildcard scan...
molenc_scan.py "$tmp_smi" > "$tmp_scan"
echo typing atoms...
molenc_type_atoms.py "$tmp_scan" > "$tmp_types"
echo encoding molecules...
molenc_e -i "$tmp_types" -r 0:1 -o "$tmp_enc"
molenc_d -i "$tmp_enc" -o "$output" -d "$dico"

# cleanup
rm -f "$std_log" "$tmp" "$tmp_smi" "$tmp_scan" "$tmp_types" "$tmp_enc"
def endswith_any(line, tset):
    """Return True if `line` ends with at least one of the tags in `tset`."""
    return any(line.endswith(tag) for tag in tset)
def RobustSmilesMolSupplier(filename):
    """Yield (mol, line) for each line of `filename` whose SMILES parses.

    The SMILES is the first TAB-separated field of the line; `line` is the
    raw input line, newline included, so callers can echo it verbatim.
    Lines whose SMILES cannot be parsed are reported on stderr and skipped.
    """
    with open(filename) as f:
        for line in f:
            splits = line.strip().split("\t")  # enforce TAB-separated
            smile = splits[0]
            try:
                mol = Chem.MolFromSmiles(smile)
            except Exception:
                # kept from the original: treat any parser error as a
                # failed parse rather than aborting the whole run
                mol = None
            # RDKit reports an unparseable SMILES by returning None, not by
            # raising; a None must never reach the substructure filter
            if mol is None:
                print("ERROR: cannot parse: %s" % line.rstrip("\n"),
                      file=sys.stderr)
                continue
            yield (mol, line)
# SMARTS patterns kindly provided by Michael Lisurek
pat1 = Chem.MolFromSmarts('[C,c]S(=O)(=O)[F,Cl,Br,I]') # sulfonylhalide
pat2 = Chem.MolFromSmarts('[C,c]S(=O)(=O)O[CX4]') # sulfone_ester
pat3 = Chem.MolFromSmarts('C(=O)[F,Cl,Br,I]') # acylhalide
pat4 = Chem.MolFromSmarts('O=COC=O') # acidanhydride
pat5 = Chem.MolFromSmarts('c1([F,Cl,Br,I])ncccn1') # 2-halo_pyrimidine
pat6 = Chem.MolFromSmarts('[H]C=O') # aldehyde
pat7 = Chem.MolFromSmarts('C(=O)C(=O)') # 1,2-dicarbonyl
pat8 = Chem.MolFromSmarts('C1OC1') # epoxide
pat9 = Chem.MolFromSmarts('C1NC1') # aziridine
pat10 = Chem.MolFromSmarts('C(=O)S') # thioester
pat11 = Chem.MolFromSmarts('[#7]!@[#7]') # hydrazine
pat12 = Chem.MolFromSmarts('C=[CH2]') # ethenes
pat13 = Chem.MolFromSmarts('[H,*,!N][N;!R]=[C;!R]([*,H])[*,H]') # imine
pat14 = Chem.MolFromSmarts('[CX4]I') # alkyl_iodide
pat15 = Chem.MolFromSmarts('[Se]') # selenide
pat16 = Chem.MolFromSmarts('O-O') # peroxide
pat17 = Chem.MolFromSmarts('[NX3]!@[OX2]') # hetero-hetero_single_bond
pat18 = Chem.MolFromSmarts('[NX3]!@[NX3]') # hetero-hetero_single_bond
pat19 = Chem.MolFromSmarts('[NX3]!@[SX2]') # hetero-hetero_single_bond
pat20 = Chem.MolFromSmarts('[SX2]!@[SX2]') # hetero-hetero_single_bond
pat21 = Chem.MolFromSmarts('[SX2]!@[OX2]') # hetero-hetero_single_bond

def stable_filter(mol):
    """Return True if `mol` matches none of the reactive-group patterns.

    The patterns are tried in order pat1..pat21 and the search stops at
    the first one found in the molecule.
    """
    for reactive_group in (pat1, pat2, pat3, pat4, pat5, pat6, pat7,
                           pat8, pat9, pat10, pat11, pat12, pat13, pat14,
                           pat15, pat16, pat17, pat18, pat19, pat20, pat21):
        if mol.HasSubstructMatch(reactive_group):
            return False  # one reactive group is enough to reject
    return True
def RobustSmilesMolSupplier(filename):
    """Yield (name, mol) for each usable line of a SMILES file.

    The first whitespace-separated word of a line is the SMILES; the name
    is everything after it, re-joined with single spaces (possibly empty).
    Blank lines are skipped silently; lines whose SMILES cannot be parsed
    are reported on stderr and skipped, so callers never receive None.
    """
    with open(filename) as f:
        for line in f:
            words = line.split()
            if not words:
                continue  # blank line: there is no SMILES to read
            smile = words[0]
            name = " ".join(words[1:]) # everything after the SMILES string
            mol = Chem.MolFromSmiles(smile)
            # RDKit returns None (it does not raise) for a bad SMILES
            if mol is None:
                print("ERROR: cannot parse: %s" % line.rstrip("\n"),
                      file=sys.stderr)
                continue
            yield (name, mol)
#!/usr/bin/env python3
#
# only print on stdout a line if its key was never seen before

import sys


def unique_lines(lines, sep, field):
    """Yield each line whose key was not seen on an earlier line.

    The key of a line is item number `field` (0-based) obtained by
    stripping the line and splitting it on `sep`.  Lines are yielded
    unchanged, in input order, trailing newline included.
    Raises IndexError if a line has no such field.
    """
    seen = set()
    for line in lines:
        key = line.strip().split(sep)[field]
        if key not in seen:
            seen.add(key)
            yield line


if __name__ == '__main__':
    input_fn = sys.argv[1]
    sep = sys.argv[2]
    # user-provided field number, as in awk, start at 1
    field = int(sys.argv[3]) - 1
    with open(input_fn) as input_file:
        for line in unique_lines(input_file, sep, field):
            # the line still carries its own newline: do not add a second
            # one, or every record is followed by a blank line
            print(line, end='')
def RobustMolSupplier(filename):
    """Yield (index, name, mol) for each non-blank line of a SMILES file.

    `index` is the 0-based position of the line in the file (blank lines
    are counted but not yielded).  The SMILES is the first whitespace-
    separated word and the name the second one, or '' when the line has
    no name.  `mol` is whatever Chem.MolFromSmiles returns, which may be
    None for an unparseable SMILES; the caller is expected to check it.
    """
    with open(filename) as f:
        for index, line in enumerate(f):
            words = line.split()
            if not words:
                continue  # blank line: nothing to draw
            smi = words[0]
            # tolerate SMILES-only lines instead of raising IndexError
            name = words[1] if len(words) > 1 else ''
            yield (index, name, Chem.MolFromSmiles(smi))
| #atoms:15 NCGC00261552-01_f00 2 | 0 0,6,2,0 3 | 16 0,7,2,0 4 | 17 0,6,3,0 5 | 18 1,6,3,0 6 | 19 1,6,2,0 7 | 20 1,6,2,0 8 | 21 1,6,2,0 9 | 22 1,6,2,0 10 | 23 1,6,2,0 11 | 24 1,6,3,0 12 | 25 1,6,2,0 13 | 26 1,6,2,0 14 | 27 1,6,2,0 15 | 28 1,6,2,0 16 | 29 1,6,2,0 17 | #bonds:16 18 | 0 - 16 19 | 16 - 17 20 | 17 - 18 21 | 18 : 19 22 | 19 : 20 23 | 20 : 21 24 | 21 : 22 25 | 22 : 23 26 | 17 - 24 27 | 24 : 25 28 | 25 : 26 29 | 26 : 27 30 | 27 : 28 31 | 28 : 29 32 | 23 : 18 33 | 29 : 24 34 | #anchors:1 35 | 0,6,2,0 0 0,6,2,0 36 | #atoms:1 NCGC00261552-01_f01 37 | 1 0,6,2,0 38 | #bonds:0 39 | #anchors:2 40 | 0,6,2,0 1 0,7,2,0 41 | 0,6,2,0 1 0,6,2,0 42 | #atoms:6 NCGC00261552-01_f02 43 | 10 1,6,3,0 44 | 11 1,6,2,0 45 | 12 1,6,2,0 46 | 13 1,6,2,0 47 | 14 1,6,2,0 48 | 15 1,6,2,0 49 | #bonds:6 50 | 10 : 11 51 | 11 : 12 52 | 12 : 13 53 | 13 : 14 54 | 14 : 15 55 | 15 : 10 56 | #anchors:1 57 | 1,6,3,0 10 0,6,3,0 58 | #atoms:8 NCGC00261552-01_f03 59 | 2 0,7,2,0 60 | 3 0,6,3,0 61 | 4 1,6,3,0 62 | 5 1,6,2,0 63 | 6 1,6,2,0 64 | 7 1,6,2,0 65 | 8 1,6,2,0 66 | 9 1,6,2,0 67 | #bonds:8 68 | 2 - 3 69 | 3 - 4 70 | 4 : 5 71 | 5 : 6 72 | 6 : 7 73 | 7 : 8 74 | 8 : 9 75 | 9 : 4 76 | #anchors:2 77 | 0,6,3,0 3 1,6,3,0 78 | 0,7,2,0 2 0,6,2,0 79 | #atoms:6 NCGC00261763-01_f00 80 | 13 1,6,3,0 81 | 14 1,6,2,0 82 | 15 1,6,2,0 83 | 16 1,6,2,0 84 | 17 1,6,2,0 85 | 18 1,6,2,0 86 | #bonds:6 87 | 13 : 14 88 | 14 : 15 89 | 15 : 16 90 | 16 : 17 91 | 17 : 18 92 | 18 : 13 93 | #anchors:1 94 | 1,6,3,0 13 1,7,3,0 95 | #atoms:6 NCGC00261763-01_f01 96 | 7 1,6,3,0 97 | 8 1,6,2,0 98 | 9 1,6,2,0 99 | 10 1,6,2,0 100 | 11 1,6,2,0 101 | 12 1,6,2,0 102 | #bonds:6 103 | 7 : 8 104 | 8 : 9 105 | 9 : 10 106 | 10 : 11 107 | 11 : 12 108 | 12 : 7 109 | #anchors:1 110 | 1,6,3,0 7 1,6,3,0 111 | #atoms:7 NCGC00261763-01_f02 112 | 0 0,6,1,0 113 | 1 1,7,2,0 114 | 2 1,6,3,0 115 | 3 1,16,2,0 116 | 4 1,7,3,0 117 | 5 1,6,3,0 118 | 6 1,7,2,0 119 | #bonds:7 120 | 0 - 1 121 | 1 = 2 122 | 2 : 3 123 | 3 : 4 124 | 4 : 5 125 | 5 : 6 126 | 6 : 
2 127 | #anchors:2 128 | 1,6,3,0 5 1,6,3,0 129 | 1,7,3,0 4 1,6,3,0 130 | #atoms:1 NCGC00260832-01_f00 131 | 0 0,6,1,0 132 | #bonds:0 133 | #anchors:1 134 | 0,6,1,0 0 0,7,3,0 135 | #atoms:7 NCGC00260832-01_f01 136 | 10 0,7,2,0 137 | 11 1,6,3,0 138 | 12 1,6,2,0 139 | 13 1,6,2,0 140 | 14 1,6,2,0 141 | 15 1,7,2,0 142 | 16 1,6,2,0 143 | #bonds:7 144 | 10 - 11 145 | 11 : 12 146 | 12 : 13 147 | 13 : 14 148 | 14 : 15 149 | 15 : 16 150 | 16 : 11 151 | #anchors:1 152 | 0,7,2,0 10 1,6,3,0 153 | #atoms:12 NCGC00260832-01_f02 154 | 1 0,7,3,0 155 | 2 0,6,2,0 156 | 3 0,6,2,0 157 | 4 1,6,3,0 158 | 5 1,6,2,0 159 | 6 1,6,3,0 160 | 7 0,7,2,0 161 | 8 1,6,3,0 162 | 9 1,8,1,0 163 | 17 1,6,2,0 164 | 18 1,6,2,0 165 | 19 1,6,3,0 166 | #bonds:13 167 | 1 - 2 168 | 2 - 3 169 | 3 - 4 170 | 4 : 5 171 | 5 : 6 172 | 6 - 7 173 | 7 - 8 174 | 8 = 9 175 | 6 : 17 176 | 17 : 18 177 | 18 : 19 178 | 19 - 1 179 | 19 : 4 180 | #anchors:2 181 | 1,6,3,0 8 0,7,2,0 182 | 0,7,3,0 1 0,6,1,0 183 | -------------------------------------------------------------------------------- /data/3.smi: -------------------------------------------------------------------------------- 1 | C(CNC(C1=CC=CC=C1)C2=CC=CC=C2)NC(C3=CC=CC=C3)C4=CC=CC=C4 NCGC00261552-01 2 | C\N=C1/SN(C(=N1)C2=CC=CC=C2)C3=CC=CC=C3 NCGC00261763-01 3 | CN1CCC2=CC(NC(=O)NC3=CC=CN=C3)=CC=C12 NCGC00260832-01 4 | -------------------------------------------------------------------------------- /data/3.to_frag: -------------------------------------------------------------------------------- 1 | #atoms:30 NCGC00261552-01 2 | 0 0,6,2,0,0 3 | 1 0,6,2,0,0 4 | 2 0,7,2,0,0 5 | 3 0,6,3,0,0 6 | 4 1,6,3,0,0 7 | 5 1,6,2,0,0 8 | 6 1,6,2,0,0 9 | 7 1,6,2,0,0 10 | 8 1,6,2,0,0 11 | 9 1,6,2,0,0 12 | 10 1,6,3,0,0 13 | 11 1,6,2,0,0 14 | 12 1,6,2,0,0 15 | 13 1,6,2,0,0 16 | 14 1,6,2,0,0 17 | 15 1,6,2,0,0 18 | 16 0,7,2,0,0 19 | 17 0,6,3,0,0 20 | 18 1,6,3,0,0 21 | 19 1,6,2,0,0 22 | 20 1,6,2,0,0 23 | 21 1,6,2,0,0 24 | 22 1,6,2,0,0 25 | 23 1,6,2,0,0 26 | 24 1,6,3,0,0 27 | 25 1,6,2,0,0 28 
| 26 1,6,2,0,0 29 | 27 1,6,2,0,0 30 | 28 1,6,2,0,0 31 | 29 1,6,2,0,0 32 | #bonds:33 33 | 0 - 1 N 34 | 1 - 2 N 35 | 2 - 3 N 36 | 3 - 4 N 37 | 4 : 5 N 38 | 5 : 6 N 39 | 6 : 7 N 40 | 7 : 8 N 41 | 8 : 9 N 42 | 3 - 10 N 43 | 10 : 11 N 44 | 11 : 12 N 45 | 12 : 13 N 46 | 13 : 14 N 47 | 14 : 15 N 48 | 0 - 16 N 49 | 16 - 17 N 50 | 17 - 18 N 51 | 18 : 19 N 52 | 19 : 20 N 53 | 20 : 21 N 54 | 21 : 22 N 55 | 22 : 23 N 56 | 17 - 24 N 57 | 24 : 25 N 58 | 25 : 26 N 59 | 26 : 27 N 60 | 27 : 28 N 61 | 28 : 29 N 62 | 9 : 4 N 63 | 15 : 10 N 64 | 23 : 18 N 65 | 29 : 24 N 66 | #cut_bonds:9:2 67 | 0 68 | 1 69 | 2 70 | 3 71 | 9 72 | 15 73 | 16 74 | 17 75 | 23 76 | #atoms:19 NCGC00261763-01 77 | 0 0,6,1,0,0 78 | 1 1,7,2,0,0 79 | 2 1,6,3,0,0 80 | 3 1,16,2,0,0 81 | 4 1,7,3,0,0 82 | 5 1,6,3,0,0 83 | 6 1,7,2,0,0 84 | 7 1,6,3,0,0 85 | 8 1,6,2,0,0 86 | 9 1,6,2,0,0 87 | 10 1,6,2,0,0 88 | 11 1,6,2,0,0 89 | 12 1,6,2,0,0 90 | 13 1,6,3,0,0 91 | 14 1,6,2,0,0 92 | 15 1,6,2,0,0 93 | 16 1,6,2,0,0 94 | 17 1,6,2,0,0 95 | 18 1,6,2,0,0 96 | #bonds:21 97 | 0 - 1 N 98 | 1 = 2 Z:0:3 99 | 2 : 3 N 100 | 3 : 4 N 101 | 4 : 5 N 102 | 5 : 6 N 103 | 5 - 7 N 104 | 7 : 8 N 105 | 8 : 9 N 106 | 9 : 10 N 107 | 10 : 11 N 108 | 11 : 12 N 109 | 4 - 13 N 110 | 13 : 14 N 111 | 14 : 15 N 112 | 15 : 16 N 113 | 16 : 17 N 114 | 17 : 18 N 115 | 6 : 2 N 116 | 12 : 7 N 117 | 18 : 13 N 118 | #cut_bonds:2:1 119 | 6 120 | 12 121 | #atoms:20 NCGC00260832-01 122 | 0 0,6,1,0,0 123 | 1 0,7,3,0,0 124 | 2 0,6,2,0,0 125 | 3 0,6,2,0,0 126 | 4 1,6,3,0,0 127 | 5 1,6,2,0,0 128 | 6 1,6,3,0,0 129 | 7 0,7,2,0,0 130 | 8 1,6,3,0,0 131 | 9 1,8,1,0,0 132 | 10 0,7,2,0,0 133 | 11 1,6,3,0,0 134 | 12 1,6,2,0,0 135 | 13 1,6,2,0,0 136 | 14 1,6,2,0,0 137 | 15 1,7,2,0,0 138 | 16 1,6,2,0,0 139 | 17 1,6,2,0,0 140 | 18 1,6,2,0,0 141 | 19 1,6,3,0,0 142 | #bonds:22 143 | 0 - 1 N 144 | 1 - 2 N 145 | 2 - 3 N 146 | 3 - 4 N 147 | 4 : 5 N 148 | 5 : 6 N 149 | 6 - 7 N 150 | 7 - 8 N 151 | 8 = 9 N 152 | 8 - 10 N 153 | 10 - 11 N 154 | 11 : 12 N 155 | 12 : 13 N 156 | 13 : 14 N 
157 | 14 : 15 N 158 | 15 : 16 N 159 | 6 : 17 N 160 | 17 : 18 N 161 | 18 : 19 N 162 | 19 - 1 N 163 | 19 : 4 N 164 | 16 : 11 N 165 | #cut_bonds:5:1 166 | 0 167 | 6 168 | 7 169 | 9 170 | 10 171 | -------------------------------------------------------------------------------- /data/3_frags.smi: -------------------------------------------------------------------------------- 1 | *CCNC(c1ccccc1)c1ccccc1 NCGC00261552-01_f00 2 | *c1ccccc1 NCGC00261552-01_f01 3 | *NC(*)c1ccccc1 NCGC00261552-01_f02 4 | *c1ccccc1 NCGC00261763-01_f00 5 | *c1n/c(=N/C)sn1-c1ccccc1 NCGC00261763-01_f01 6 | *c1cccnc1 NCGC00260832-01_f00 7 | *NC(=O)Nc1ccc2c(c1)CCN2C NCGC00260832-01_f01 8 | -------------------------------------------------------------------------------- /data/3_frags.txt: -------------------------------------------------------------------------------- 1 | #atoms:16 NCGC00261552-01_f00 2 | 0 0,6,2,0,0 3 | 1 0,6,2,0,0 4 | 16 0,7,2,0,0 5 | 17 0,6,3,0,0 6 | 18 1,6,3,0,0 7 | 19 1,6,2,0,0 8 | 20 1,6,2,0,0 9 | 21 1,6,2,0,0 10 | 22 1,6,2,0,0 11 | 23 1,6,2,0,0 12 | 24 1,6,3,0,0 13 | 25 1,6,2,0,0 14 | 26 1,6,2,0,0 15 | 27 1,6,2,0,0 16 | 28 1,6,2,0,0 17 | 29 1,6,2,0,0 18 | #bonds:17 19 | 0 - 1 N 20 | 0 - 16 N 21 | 16 - 17 N 22 | 17 - 18 N 23 | 18 : 19 N 24 | 19 : 20 N 25 | 20 : 21 N 26 | 21 : 22 N 27 | 22 : 23 N 28 | 17 - 24 N 29 | 24 : 25 N 30 | 25 : 26 N 31 | 26 : 27 N 32 | 27 : 28 N 33 | 28 : 29 N 34 | 23 : 18 N 35 | 29 : 24 N 36 | #anchors:1 37 | 0,6,2,0,0 1 0,7,2,0,0 38 | #atoms:6 NCGC00261552-01_f01 39 | 10 1,6,3,0,0 40 | 11 1,6,2,0,0 41 | 12 1,6,2,0,0 42 | 13 1,6,2,0,0 43 | 14 1,6,2,0,0 44 | 15 1,6,2,0,0 45 | #bonds:6 46 | 10 : 11 N 47 | 11 : 12 N 48 | 12 : 13 N 49 | 13 : 14 N 50 | 14 : 15 N 51 | 15 : 10 N 52 | #anchors:1 53 | 1,6,3,0,0 10 0,6,3,0,0 54 | #atoms:8 NCGC00261552-01_f02 55 | 2 0,7,2,0,0 56 | 3 0,6,3,0,0 57 | 4 1,6,3,0,0 58 | 5 1,6,2,0,0 59 | 6 1,6,2,0,0 60 | 7 1,6,2,0,0 61 | 8 1,6,2,0,0 62 | 9 1,6,2,0,0 63 | #bonds:8 64 | 2 - 3 N 65 | 3 - 4 N 66 | 4 : 5 N 67 | 5 : 6 N 68 | 
6 : 7 N 69 | 7 : 8 N 70 | 8 : 9 N 71 | 9 : 4 N 72 | #anchors:2 73 | 0,6,3,0,0 3 1,6,3,0,0 74 | 0,7,2,0,0 2 0,6,2,0,0 75 | #atoms:6 NCGC00261763-01_f00 76 | 7 1,6,3,0,0 77 | 8 1,6,2,0,0 78 | 9 1,6,2,0,0 79 | 10 1,6,2,0,0 80 | 11 1,6,2,0,0 81 | 12 1,6,2,0,0 82 | #bonds:6 83 | 7 : 8 N 84 | 8 : 9 N 85 | 9 : 10 N 86 | 10 : 11 N 87 | 11 : 12 N 88 | 12 : 7 N 89 | #anchors:1 90 | 1,6,3,0,0 7 1,6,3,0,0 91 | #atoms:13 NCGC00261763-01_f01 92 | 0 0,6,1,0,0 93 | 1 1,7,2,0,0 94 | 2 1,6,3,0,0 95 | 3 1,16,2,0,0 96 | 4 1,7,3,0,0 97 | 5 1,6,3,0,0 98 | 6 1,7,2,0,0 99 | 13 1,6,3,0,0 100 | 14 1,6,2,0,0 101 | 15 1,6,2,0,0 102 | 16 1,6,2,0,0 103 | 17 1,6,2,0,0 104 | 18 1,6,2,0,0 105 | #bonds:14 106 | 0 - 1 N 107 | 1 = 2 Z:0:3 108 | 2 : 3 N 109 | 3 : 4 N 110 | 4 : 5 N 111 | 5 : 6 N 112 | 4 - 13 N 113 | 13 : 14 N 114 | 14 : 15 N 115 | 15 : 16 N 116 | 16 : 17 N 117 | 17 : 18 N 118 | 6 : 2 N 119 | 18 : 13 N 120 | #anchors:1 121 | 1,6,3,0,0 5 1,6,3,0,0 122 | #atoms:6 NCGC00260832-01_f00 123 | 11 1,6,3,0,0 124 | 12 1,6,2,0,0 125 | 13 1,6,2,0,0 126 | 14 1,6,2,0,0 127 | 15 1,7,2,0,0 128 | 16 1,6,2,0,0 129 | #bonds:6 130 | 11 : 12 N 131 | 12 : 13 N 132 | 13 : 14 N 133 | 14 : 15 N 134 | 15 : 16 N 135 | 16 : 11 N 136 | #anchors:1 137 | 1,6,3,0,0 11 0,7,2,0,0 138 | #atoms:14 NCGC00260832-01_f01 139 | 0 0,6,1,0,0 140 | 1 0,7,3,0,0 141 | 2 0,6,2,0,0 142 | 3 0,6,2,0,0 143 | 4 1,6,3,0,0 144 | 5 1,6,2,0,0 145 | 6 1,6,3,0,0 146 | 7 0,7,2,0,0 147 | 8 1,6,3,0,0 148 | 9 1,8,1,0,0 149 | 10 0,7,2,0,0 150 | 17 1,6,2,0,0 151 | 18 1,6,2,0,0 152 | 19 1,6,3,0,0 153 | #bonds:15 154 | 0 - 1 N 155 | 1 - 2 N 156 | 2 - 3 N 157 | 3 - 4 N 158 | 4 : 5 N 159 | 5 : 6 N 160 | 6 - 7 N 161 | 7 - 8 N 162 | 8 = 9 N 163 | 8 - 10 N 164 | 6 : 17 N 165 | 17 : 18 N 166 | 18 : 19 N 167 | 19 - 1 N 168 | 19 : 4 N 169 | #anchors:1 170 | 0,7,2,0,0 10 1,6,3,0,0 171 | -------------------------------------------------------------------------------- /data/3_genmols.smi: 
-------------------------------------------------------------------------------- 1 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1 genmol_000001_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02 2 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1 genmol_000002_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02 3 | CN1CCc2cc(NC(=O)Nc3cccnc3)ccc21 genmol_000003_NCGC00260832-01_f00,NCGC00260832-01_f01 4 | C/N=c1/nc(-c2n/c(=N/C)sn2-c2ccccc2)n(-c2ccccc2)s1 genmol_000004_NCGC00261763-01_f01,NCGC00261763-01_f01 5 | C/N=c1/nc(-c2ccccc2)n(-c2ccccc2)s1 genmol_000005_NCGC00261763-01_f00,NCGC00261763-01_f01 6 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1 genmol_000006_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02 7 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1 genmol_000007_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02 8 | C/N=c1/nc(-c2n/c(=N/C)sn2-c2ccccc2)n(-c2ccccc2)s1 genmol_000008_NCGC00261763-01_f01,NCGC00261763-01_f01 9 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1 genmol_000009_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02 10 | CN1CCc2cc(NC(=O)Nc3cccnc3)ccc21 genmol_000010_NCGC00260832-01_f00,NCGC00260832-01_f01 11 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1 genmol_000011_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02 12 | CN1CCc2cc(NC(=O)Nc3cccnc3)ccc21 genmol_000012_NCGC00260832-01_f00,NCGC00260832-01_f01 13 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1 genmol_000013_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02 14 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1 genmol_000014_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02 15 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1 genmol_000015_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02 16 | C/N=c1/nc(-c2ccccc2)n(-c2ccccc2)s1 genmol_000016_NCGC00261763-01_f00,NCGC00261763-01_f01 17 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1 genmol_000017_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02 18 | 
c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1 genmol_000018_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02 19 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1 genmol_000019_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02 20 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1 genmol_000020_NCGC00261552-01_f00,NCGC00261552-01_f01,NCGC00261552-01_f02 21 | -------------------------------------------------------------------------------- /data/3_genmols_uniq.smi: -------------------------------------------------------------------------------- 1 | c1ccc(C(NCCNC(c2ccccc2)c2ccccc2)c2ccccc2)cc1 2 | CN1CCc2cc(NC(=O)Nc3cccnc3)ccc21 3 | C/N=c1/nc(-c2ccccc2)n(-c2ccccc2)s1 4 | C/N=c1/nc(-c2n/c(=N/C)sn2-c2ccccc2)n(-c2ccccc2)s1 5 | -------------------------------------------------------------------------------- /data/ALDH1_2conf.ph4: -------------------------------------------------------------------------------- 1 | 13:22401519_1_1_1_1 2 | ARO 1.47088 -0.706617 1.86095 3 | ARO -3.88528 -1.98218 5.7585 4 | ARO 1.76862 -1.26624 3.89744 5 | HBD 2.0174 -1.9041 6.2761 6 | HBA 2.9153 -1.4999 4.0836 7 | HBA -0.4533 -1.1965 5.0622 8 | HBA -7.202 -2.5145 6.7169 9 | HBA -6.6826 -3.8445 5.0495 10 | POS -6.3641 -2.9407 5.8728 11 | NEG -7.202 -2.5145 6.7169 12 | HYD 1.47088 -0.706617 1.86095 13 | HYD -3.88528 -1.98218 5.7585 14 | HYD 1.76862 -1.26624 3.89744 15 | 13:22401519_1_1_1_2 16 | ARO 1.47088 -0.706617 1.86095 17 | ARO -3.47526 0.19586 6.40746 18 | ARO 1.76862 -1.26624 3.89744 19 | HBD 2.0174 -1.9041 6.2761 20 | HBA 2.9153 -1.4999 4.0836 21 | HBA -0.4533 -1.1965 5.0622 22 | HBA -6.4947 1.2154 7.8379 23 | HBA -5.574 2.8916 6.76 24 | POS -5.5543 1.6862 7.1378 25 | NEG -6.4947 1.2154 7.8379 26 | HYD 1.47088 -0.706617 1.86095 27 | HYD -3.47526 0.19586 6.40746 28 | HYD 1.76862 -1.26624 3.89744 29 | 14:4256362_1_1_1_1 30 | ARO 6.11932 -0.416467 -0.548583 31 | ARO 2.15405 0.669883 -1.00792 32 | ARO -3.07 3.7901 -3.20373 33 | ARO 4.28834 0.50898 -1.18316 34 | HBD -0.2095 1.8844 
-1.8496 35 | HBA -3.4513 4.3099 -4.4494 36 | HBA 0.5397 3.3853 -3.4661 37 | HBA -0.2324 -0.0904 0.1262 38 | HYD 6.11932 -0.416467 -0.548583 39 | HYD 2.15405 0.669883 -1.00792 40 | HYD -3.07 3.7901 -3.20373 41 | HYD 4.28834 0.50898 -1.18316 42 | HYD -0.6502 0.6217 1.2873 43 | HYD -5.2726 3.6343 -0.7889 44 | 14:4256362_1_1_1_2 45 | ARO 6.11932 -0.416467 -0.548583 46 | ARO 2.15405 0.669883 -1.00792 47 | ARO -3.08318 3.77257 -3.14745 48 | ARO 4.28834 0.50898 -1.18316 49 | HBD -0.2095 1.8844 -1.8496 50 | HBA -3.9734 3.675 -2.068 51 | HBA 0.5397 3.3853 -3.4661 52 | HBA -0.2324 -0.0904 0.1262 53 | HYD 6.11932 -0.416467 -0.548583 54 | HYD 2.15405 0.669883 -1.00792 55 | HYD -3.08318 3.77257 -3.14745 56 | HYD 4.28834 0.50898 -1.18316 57 | HYD -0.6502 0.6217 1.2873 58 | HYD -4.1364 5.0094 -5.9878 59 | -------------------------------------------------------------------------------- /data/AP_test.smi: -------------------------------------------------------------------------------- 1 | On1cccc1 mol_1 2 | -------------------------------------------------------------------------------- /data/AP_test.smi.dix.ref: -------------------------------------------------------------------------------- 1 | #atom_pairs 2 | 0 1C2-0-1C2 4 3 | 1 1C2-2-1C2 3 4 | 2 1C2-1-1C2 3 5 | 3 1C2-3-O1 2 6 | 4 1C2-2-O1 2 7 | 5 1C2-2-1N3 2 8 | 6 1C2-1-1N3 2 9 | 7 O1-0-O1 1 10 | 8 1N3-1-O1 1 11 | 9 1N3-0-1N3 1 12 | -------------------------------------------------------------------------------- /data/AP_test.txt.ref: -------------------------------------------------------------------------------- 1 | mol_1,0.0,[0:4;1:3;2:3;3:2;4:2;5:2;6:2;7:1;8:1;9:1] 2 | -------------------------------------------------------------------------------- /data/alcools.AD.ref: -------------------------------------------------------------------------------- 1 | (6,1,3,1,0)-0-(6,1,3,1,0) 1 2 | (6,1,3,1,0)-1-(6,2,2,2,0) 1 3 | (6,1,3,1,0)-1-(8,1,1,1,0) 1 4 | (6,1,3,1,0)-2-(8,1,1,1,0) 1 5 | (6,2,2,2,0)-0-(6,2,2,2,0) 1 6 | 
(6,2,2,2,0)-1-(8,1,1,1,0) 1 7 | (8,1,1,1,0)-0-(8,1,1,1,0) 1 8 | -------------------------------------------------------------------------------- /data/alcools.smi: -------------------------------------------------------------------------------- 1 | CO methanol 2 | CCO ethanol 3 | -------------------------------------------------------------------------------- /data/caff_coca.sdf: -------------------------------------------------------------------------------- 1 | caffeine 2 | OpenBabel10151815402D 3 | 4 | 14 15 0 0 0 0 0 0 0 0999 V2000 5 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 6 | 0.0000 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 7 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 8 | 0.0000 0.0000 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 9 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 10 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 11 | 0.0000 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 12 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 13 | 0.0000 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 14 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 15 | 0.0000 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 16 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 17 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 18 | 0.0000 0.0000 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 19 | 1 2 1 0 0 0 0 20 | 2 13 1 0 0 0 0 21 | 2 3 1 0 0 0 0 22 | 3 4 2 0 0 0 0 23 | 3 5 1 0 0 0 0 24 | 5 9 1 0 0 0 0 25 | 5 6 2 0 0 0 0 26 | 6 7 1 0 0 0 0 27 | 6 11 1 0 0 0 0 28 | 7 8 2 0 0 0 0 29 | 8 9 1 0 0 0 0 30 | 9 10 1 0 0 0 0 31 | 11 12 1 0 0 0 0 32 | 11 13 1 0 0 0 0 33 | 13 14 2 0 0 0 0 34 | M END 35 | $$$$ 36 | cocaine 37 | OpenBabel10151815402D 38 | 39 | 22 24 0 0 1 0 0 0 0 0999 V2000 40 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 41 | 0.0000 0.0000 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 42 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 43 | 0.0000 0.0000 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 44 | 0.0000 0.0000 0.0000 C 0 0 1 0 0 0 0 0 0 0 0 0 45 | 0.0000 0.0000 0.0000 C 0 0 1 0 0 0 0 0 0 0 0 0 46 | 0.0000 0.0000 0.0000 
O 0 0 0 0 0 0 0 0 0 0 0 0 47 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 48 | 0.0000 0.0000 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 49 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 50 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 51 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 52 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 53 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 54 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 55 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 56 | 0.0000 0.0000 0.0000 C 0 0 2 0 0 0 0 0 0 0 0 0 57 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 58 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 59 | 0.0000 0.0000 0.0000 C 0 0 1 0 0 0 0 0 0 0 0 0 60 | 0.0000 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 61 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 62 | 1 2 1 0 0 0 0 63 | 2 3 1 0 0 0 0 64 | 3 4 2 0 0 0 0 65 | 3 5 1 0 0 0 0 66 | 5 6 1 0 0 0 0 67 | 5 20 1 0 0 0 0 68 | 6 7 1 0 0 0 0 69 | 6 16 1 0 0 0 0 70 | 7 8 1 0 0 0 0 71 | 8 9 2 0 0 0 0 72 | 8 10 1 0 0 0 0 73 | 10 15 2 0 0 0 0 74 | 10 11 1 0 0 0 0 75 | 11 12 2 0 0 0 0 76 | 12 13 1 0 0 0 0 77 | 13 14 2 0 0 0 0 78 | 14 15 1 0 0 0 0 79 | 16 17 1 0 0 0 0 80 | 17 18 1 0 0 0 0 81 | 17 21 1 0 0 0 0 82 | 18 19 1 0 0 0 0 83 | 19 20 1 0 0 0 0 84 | 20 21 1 0 0 0 0 85 | 21 22 1 0 0 0 0 86 | M END 87 | $$$$ 88 | -------------------------------------------------------------------------------- /data/caff_coca.smi: -------------------------------------------------------------------------------- 1 | Cn1c(=O)c2c(ncn2C)n(C)c1=O caffeine 2 | COC(=O)[C@H]1[C@@H](OC(=O)c2ccccc2)C[C@@H]2CC[C@H]1N2C cocaine 3 | -------------------------------------------------------------------------------- /data/caff_coca_feats.ref: -------------------------------------------------------------------------------- 1 | #atoms:14 caffeine 2 | 0 _ 3 | 1 a 4 | 2 a 5 | 3 A 6 | 4 P a 7 | 5 P a 8 | 6 A P a 9 | 7 P a 10 | 8 P a 11 | 9 _ 12 | 10 a 13 | 11 _ 14 | 12 a 15 | 13 A 16 | #bonds:15 17 | 0 1 18 | 1 2 19 | 1 12 
20 | 2 3 21 | 2 4 22 | 4 5 23 | 4 8 24 | 5 6 25 | 5 10 26 | 6 7 27 | 7 8 28 | 8 9 29 | 10 11 30 | 10 12 31 | 12 13 32 | #diameter:6 33 | 0 1 2 3 3 4 5 5 4 5 3 4 2 3 34 | 1 0 1 2 2 3 4 4 3 4 2 3 1 2 35 | 2 1 0 1 1 2 3 3 2 3 3 4 2 3 36 | 3 2 1 0 2 3 4 4 3 4 4 5 3 4 37 | 3 2 1 2 0 1 2 2 1 2 2 3 3 4 38 | 4 3 2 3 1 0 1 2 2 3 1 2 2 3 39 | 5 4 3 4 2 1 0 1 2 3 2 3 3 4 40 | 5 4 3 4 2 2 1 0 1 2 3 4 4 5 41 | 4 3 2 3 1 2 2 1 0 1 3 4 4 5 42 | 5 4 3 4 2 3 3 2 1 0 4 5 5 6 43 | 3 2 3 4 2 1 2 3 3 4 0 1 1 2 44 | 4 3 4 5 3 2 3 4 4 5 1 0 2 3 45 | 2 1 2 3 3 2 3 4 4 5 1 2 0 1 46 | 3 2 3 4 4 3 4 5 5 6 2 3 1 0 47 | #atoms:22 cocaine 48 | 0 _ 49 | 1 A 50 | 2 _ 51 | 3 A 52 | 4 H 53 | 5 _ 54 | 6 A 55 | 7 _ 56 | 8 A 57 | 9 H a h 58 | 10 a h 59 | 11 a h 60 | 12 a h 61 | 13 a h 62 | 14 a h 63 | 15 _ 64 | 16 _ 65 | 17 _ 66 | 18 _ 67 | 19 _ 68 | 20 D P 69 | 21 _ 70 | #bonds:24 71 | 0 1 72 | 1 2 73 | 2 3 74 | 2 4 75 | 4 5 76 | 4 19 77 | 5 6 78 | 5 15 79 | 6 7 80 | 7 8 81 | 7 9 82 | 9 10 83 | 9 14 84 | 10 11 85 | 11 12 86 | 12 13 87 | 13 14 88 | 15 16 89 | 16 17 90 | 16 20 91 | 17 18 92 | 18 19 93 | 19 20 94 | 20 21 95 | #diameter:10 96 | 0 1 2 3 3 4 5 6 7 7 8 9 10 9 8 5 6 6 5 4 5 6 97 | 1 0 1 2 2 3 4 5 6 6 7 8 9 8 7 4 5 5 4 3 4 5 98 | 2 1 0 1 1 2 3 4 5 5 6 7 8 7 6 3 4 4 3 2 3 4 99 | 3 2 1 0 2 3 4 5 6 6 7 8 9 8 7 4 5 5 4 3 4 5 100 | 3 2 1 2 0 1 2 3 4 4 5 6 7 6 5 2 3 3 2 1 2 3 101 | 4 3 2 3 1 0 1 2 3 3 4 5 6 5 4 1 2 3 3 2 3 4 102 | 5 4 3 4 2 1 0 1 2 2 3 4 5 4 3 2 3 4 4 3 4 5 103 | 6 5 4 5 3 2 1 0 1 1 2 3 4 3 2 3 4 5 5 4 5 6 104 | 7 6 5 6 4 3 2 1 0 2 3 4 5 4 3 4 5 6 6 5 6 7 105 | 7 6 5 6 4 3 2 1 2 0 1 2 3 2 1 4 5 6 6 5 6 7 106 | 8 7 6 7 5 4 3 2 3 1 0 1 2 3 2 5 6 7 7 6 7 8 107 | 9 8 7 8 6 5 4 3 4 2 1 0 1 2 3 6 7 8 8 7 8 9 108 | 10 9 8 9 7 6 5 4 5 3 2 1 0 1 2 7 8 9 9 8 9 10 109 | 9 8 7 8 6 5 4 3 4 2 3 2 1 0 1 6 7 8 8 7 8 9 110 | 8 7 6 7 5 4 3 2 3 1 2 3 2 1 0 5 6 7 7 6 7 8 111 | 5 4 3 4 2 1 2 3 4 4 5 6 7 6 5 0 1 2 3 3 2 3 112 | 6 5 4 5 3 2 3 4 5 5 6 7 8 7 6 1 0 1 2 2 1 2 113 | 6 5 4 5 3 3 4 5 6 6 7 8 9 
8 7 2 1 0 1 2 2 3 114 | 5 4 3 4 2 3 4 5 6 6 7 8 9 8 7 3 2 1 0 1 2 3 115 | 4 3 2 3 1 2 3 4 5 5 6 7 8 7 6 3 2 2 1 0 1 2 116 | 5 4 3 4 2 3 4 5 6 6 7 8 9 8 7 2 1 2 2 1 0 1 117 | 6 5 4 5 3 4 5 6 7 7 8 9 10 9 8 3 2 3 3 2 1 0 118 | -------------------------------------------------------------------------------- /data/caff_coca_types.ref: -------------------------------------------------------------------------------- 1 | #atoms:14 caffeine 2 | 0 C1 3 | 1 1N3 4 | 2 1C3 5 | 3 1O1 6 | 4 1C3 7 | 5 1C3 8 | 6 1N2 9 | 7 1C2 10 | 8 1N3 11 | 9 C1 12 | 10 1N3 13 | 11 C1 14 | 12 1C3 15 | 13 1O1 16 | #bonds:15 17 | 0 1 18 | 1 2 19 | 1 12 20 | 2 3 21 | 2 4 22 | 4 5 23 | 4 8 24 | 5 6 25 | 5 10 26 | 6 7 27 | 7 8 28 | 8 9 29 | 10 11 30 | 10 12 31 | 12 13 32 | #diameter:6 33 | 0 1 2 3 3 4 5 5 4 5 3 4 2 3 34 | 1 0 1 2 2 3 4 4 3 4 2 3 1 2 35 | 2 1 0 1 1 2 3 3 2 3 3 4 2 3 36 | 3 2 1 0 2 3 4 4 3 4 4 5 3 4 37 | 3 2 1 2 0 1 2 2 1 2 2 3 3 4 38 | 4 3 2 3 1 0 1 2 2 3 1 2 2 3 39 | 5 4 3 4 2 1 0 1 2 3 2 3 3 4 40 | 5 4 3 4 2 2 1 0 1 2 3 4 4 5 41 | 4 3 2 3 1 2 2 1 0 1 3 4 4 5 42 | 5 4 3 4 2 3 3 2 1 0 4 5 5 6 43 | 3 2 3 4 2 1 2 3 3 4 0 1 1 2 44 | 4 3 4 5 3 2 3 4 4 5 1 0 2 3 45 | 2 1 2 3 3 2 3 4 4 5 1 2 0 1 46 | 3 2 3 4 4 3 4 5 5 6 2 3 1 0 47 | #atoms:22 cocaine 48 | 0 C1 49 | 1 O2 50 | 2 1C3 51 | 3 1O1 52 | 4 C3 53 | 5 C3 54 | 6 O2 55 | 7 1C3 56 | 8 1O1 57 | 9 1C3 58 | 10 1C2 59 | 11 1C2 60 | 12 1C2 61 | 13 1C2 62 | 14 1C2 63 | 15 C2 64 | 16 C3 65 | 17 C2 66 | 18 C2 67 | 19 C3 68 | 20 N3 69 | 21 C1 70 | #bonds:24 71 | 0 1 72 | 1 2 73 | 2 3 74 | 2 4 75 | 4 5 76 | 4 19 77 | 5 6 78 | 5 15 79 | 6 7 80 | 7 8 81 | 7 9 82 | 9 10 83 | 9 14 84 | 10 11 85 | 11 12 86 | 12 13 87 | 13 14 88 | 15 16 89 | 16 17 90 | 16 20 91 | 17 18 92 | 18 19 93 | 19 20 94 | 20 21 95 | #diameter:10 96 | 0 1 2 3 3 4 5 6 7 7 8 9 10 9 8 5 6 6 5 4 5 6 97 | 1 0 1 2 2 3 4 5 6 6 7 8 9 8 7 4 5 5 4 3 4 5 98 | 2 1 0 1 1 2 3 4 5 5 6 7 8 7 6 3 4 4 3 2 3 4 99 | 3 2 1 0 2 3 4 5 6 6 7 8 9 8 7 4 5 5 4 3 4 5 100 | 3 2 1 2 0 1 2 3 4 4 5 6 7 6 5 2 3 3 
2 1 2 3 101 | 4 3 2 3 1 0 1 2 3 3 4 5 6 5 4 1 2 3 3 2 3 4 102 | 5 4 3 4 2 1 0 1 2 2 3 4 5 4 3 2 3 4 4 3 4 5 103 | 6 5 4 5 3 2 1 0 1 1 2 3 4 3 2 3 4 5 5 4 5 6 104 | 7 6 5 6 4 3 2 1 0 2 3 4 5 4 3 4 5 6 6 5 6 7 105 | 7 6 5 6 4 3 2 1 2 0 1 2 3 2 1 4 5 6 6 5 6 7 106 | 8 7 6 7 5 4 3 2 3 1 0 1 2 3 2 5 6 7 7 6 7 8 107 | 9 8 7 8 6 5 4 3 4 2 1 0 1 2 3 6 7 8 8 7 8 9 108 | 10 9 8 9 7 6 5 4 5 3 2 1 0 1 2 7 8 9 9 8 9 10 109 | 9 8 7 8 6 5 4 3 4 2 3 2 1 0 1 6 7 8 8 7 8 9 110 | 8 7 6 7 5 4 3 2 3 1 2 3 2 1 0 5 6 7 7 6 7 8 111 | 5 4 3 4 2 1 2 3 4 4 5 6 7 6 5 0 1 2 3 3 2 3 112 | 6 5 4 5 3 2 3 4 5 5 6 7 8 7 6 1 0 1 2 2 1 2 113 | 6 5 4 5 3 3 4 5 6 6 7 8 9 8 7 2 1 0 1 2 2 3 114 | 5 4 3 4 2 3 4 5 6 6 7 8 9 8 7 3 2 1 0 1 2 3 115 | 4 3 2 3 1 2 3 4 5 5 6 7 8 7 6 3 2 2 1 0 1 2 116 | 5 4 3 4 2 3 4 5 6 6 7 8 9 8 7 2 1 2 2 1 0 1 117 | 6 5 4 5 3 4 5 6 7 7 8 9 10 9 8 3 2 3 3 2 1 0 118 | -------------------------------------------------------------------------------- /data/caffeine.sdf: -------------------------------------------------------------------------------- 1 | caffeine 2 | OpenBabel10151815402D 3 | 4 | 14 15 0 0 0 0 0 0 0 0999 V2000 5 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 6 | 0.0000 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 7 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 8 | 0.0000 0.0000 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 9 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 10 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 11 | 0.0000 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 12 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 13 | 0.0000 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 14 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 15 | 0.0000 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 16 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 17 | 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 18 | 0.0000 0.0000 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 19 | 1 2 1 0 0 0 0 20 | 2 13 1 0 0 0 0 21 | 2 3 1 0 0 0 0 22 | 3 4 2 0 0 0 0 23 | 3 5 1 0 0 0 0 24 | 5 9 1 0 0 0 0 25 | 5 6 2 0 0 0 0 26 | 6 7 1 0 0 
0 0 27 | 6 11 1 0 0 0 0 28 | 7 8 2 0 0 0 0 29 | 8 9 1 0 0 0 0 30 | 9 10 1 0 0 0 0 31 | 11 12 1 0 0 0 0 32 | 11 13 1 0 0 0 0 33 | 13 14 2 0 0 0 0 34 | M END 35 | $$$$ 36 | -------------------------------------------------------------------------------- /data/caffeine.smi: -------------------------------------------------------------------------------- 1 | Cn1c(=O)c2c(ncn2C)n(C)c1=O caffeine 2 | -------------------------------------------------------------------------------- /data/caffeine_3d.sdf: -------------------------------------------------------------------------------- 1 | caffeine 2 | OpenBabel10171811233D 3 | 4 | 24 25 0 0 0 0 0 0 0 0999 V2000 5 | -1.4537 2.7848 0.2699 C 0 0 0 0 0 0 0 0 0 0 0 0 6 | -1.0108 1.4083 0.1062 N 0 0 0 0 0 0 0 0 0 0 0 0 7 | 0.3015 1.1323 0.0489 C 0 0 0 0 0 0 0 0 0 0 0 0 8 | 1.1081 2.0920 0.1407 O 0 0 0 0 0 0 0 0 0 0 0 0 9 | 0.8161 -0.1286 -0.1033 C 0 0 0 0 0 0 0 0 0 0 0 0 10 | -0.0929 -1.1771 -0.2031 C 0 0 0 0 0 0 0 0 0 0 0 0 11 | 0.6111 -2.3242 -0.3462 N 0 0 0 0 0 0 0 0 0 0 0 0 12 | 1.9386 -2.0269 -0.3392 C 0 0 0 0 0 0 0 0 0 0 0 0 13 | 2.0299 -0.6962 -0.1913 N 0 0 0 0 0 0 0 0 0 0 0 0 14 | 3.2729 0.0261 -0.1349 C 0 0 0 0 0 0 0 0 0 0 0 0 15 | -1.4004 -0.8770 -0.1432 N 0 0 0 0 0 0 0 0 0 0 0 0 16 | -2.3540 -1.9596 -0.2459 C 0 0 0 0 0 0 0 0 0 0 0 0 17 | -1.8697 0.3771 0.0073 C 0 0 0 0 0 0 0 0 0 0 0 0 18 | -3.0974 0.6510 0.0627 O 0 0 0 0 0 0 0 0 0 0 0 0 19 | -0.6884 3.3191 0.8569 H 0 0 0 0 0 0 0 0 0 0 0 0 20 | -1.5024 3.2204 -0.7549 H 0 0 0 0 0 0 0 0 0 0 0 0 21 | -2.4690 2.8350 0.7286 H 0 0 0 0 0 0 0 0 0 0 0 0 22 | 2.7299 -2.7636 -0.4379 H 0 0 0 0 0 0 0 0 0 0 0 0 23 | 3.4783 0.4186 0.8888 H 0 0 0 0 0 0 0 0 0 0 0 0 24 | 4.1200 -0.5981 -0.4606 H 0 0 0 0 0 0 0 0 0 0 0 0 25 | 3.2700 0.9110 -0.8337 H 0 0 0 0 0 0 0 0 0 0 0 0 26 | -1.8812 -2.8834 0.1466 H 0 0 0 0 0 0 0 0 0 0 0 0 27 | -2.6277 -2.0396 -1.3222 H 0 0 0 0 0 0 0 0 0 0 0 0 28 | -3.2286 -1.7014 0.3855 H 0 0 0 0 0 0 0 0 0 0 0 0 29 | 1 2 1 0 0 0 0 30 | 1 15 1 0 0 0 0 31 | 1 16 1 0 0 0 0 
32 | 1 17 1 0 0 0 0 33 | 2 3 1 0 0 0 0 34 | 3 4 2 0 0 0 0 35 | 3 5 1 0 0 0 0 36 | 5 6 2 0 0 0 0 37 | 6 7 1 0 0 0 0 38 | 6 11 1 0 0 0 0 39 | 7 8 2 0 0 0 0 40 | 8 9 1 0 0 0 0 41 | 8 18 1 0 0 0 0 42 | 9 10 1 0 0 0 0 43 | 9 5 1 0 0 0 0 44 | 10 19 1 0 0 0 0 45 | 10 20 1 0 0 0 0 46 | 10 21 1 0 0 0 0 47 | 11 12 1 0 0 0 0 48 | 11 13 1 0 0 0 0 49 | 12 22 1 0 0 0 0 50 | 12 23 1 0 0 0 0 51 | 12 24 1 0 0 0 0 52 | 13 14 2 0 0 0 0 53 | 13 2 1 0 0 0 0 54 | M END 55 | $$$$ 56 | -------------------------------------------------------------------------------- /data/chemical_formulas.txt: -------------------------------------------------------------------------------- 1 | Preliminary data related to chemical formulas 2 | 3 | #all digits plus all elements (128 symbols) 4 | 0 5 | 1 6 | 2 7 | 3 8 | 4 9 | 5 10 | 6 11 | 7 12 | 8 13 | 9 14 | Ac 15 | Ag 16 | Al 17 | Am 18 | Ar 19 | As 20 | At 21 | Au 22 | B 23 | Ba 24 | Be 25 | Bh 26 | Bi 27 | Bk 28 | Br 29 | C 30 | Ca 31 | Cd 32 | Ce 33 | Cf 34 | Cl 35 | Cm 36 | Cn 37 | Co 38 | Cr 39 | Cs 40 | Cu 41 | Db 42 | Ds 43 | Dy 44 | Er 45 | Es 46 | Eu 47 | F 48 | Fe 49 | Fl 50 | Fm 51 | Fr 52 | Ga 53 | Gd 54 | Ge 55 | H 56 | He 57 | Hf 58 | Hg 59 | Ho 60 | Hs 61 | I 62 | In 63 | Ir 64 | K 65 | Kr 66 | La 67 | Li 68 | Lr 69 | Lu 70 | Lv 71 | Mc 72 | Md 73 | Mg 74 | Mn 75 | Mo 76 | Mt 77 | N 78 | Na 79 | Nb 80 | Nd 81 | Ne 82 | Nh 83 | Ni 84 | No 85 | Np 86 | O 87 | Og 88 | Os 89 | P 90 | Pa 91 | Pb 92 | Pd 93 | Pm 94 | Po 95 | Pr 96 | Pt 97 | Pu 98 | Ra 99 | Rb 100 | Re 101 | Rf 102 | Rg 103 | Rh 104 | Rn 105 | Ru 106 | S 107 | Sb 108 | Sc 109 | Se 110 | Sg 111 | Si 112 | Sm 113 | Sn 114 | Sr 115 | Ta 116 | Tb 117 | Tc 118 | Te 119 | Th 120 | Ti 121 | Tl 122 | Tm 123 | Ts 124 | U 125 | V 126 | W 127 | Xe 128 | Y 129 | Yb 130 | Zn 131 | Zr 132 | 133 | #all digits plus all characters (55 symbols) 134 | 0 135 | 1 136 | 2 137 | 3 138 | 4 139 | 5 140 | 6 141 | 7 142 | 8 143 | 9 144 | A 145 | B 146 | C 147 | D 148 | E 149 | F 150 | G 151 | H 152 | I 153 
| K 154 | L 155 | M 156 | N 157 | O 158 | P 159 | R 160 | S 161 | T 162 | U 163 | V 164 | W 165 | X 166 | Y 167 | Z 168 | a 169 | b 170 | c 171 | d 172 | e 173 | f 174 | g 175 | h 176 | i 177 | k 178 | l 179 | m 180 | n 181 | o 182 | p 183 | r 184 | s 185 | t 186 | u 187 | v 188 | y 189 | -------------------------------------------------------------------------------- /data/cisapride.smi: -------------------------------------------------------------------------------- 1 | COc1cc(N)c(Cl)cc1C(=O)N[C@@H]1CCN(CCCOc2ccc(F)cc2)C[C@@H]1OC Cisapride 2 | -------------------------------------------------------------------------------- /data/co_1conf.sdf: -------------------------------------------------------------------------------- 1 | carbon_monoxide 2 | OpenBabel06242216463D 3 | 4 | 2 1 0 0 0 0 0 0 0 0999 V2000 5 | 1.1581 -0.0369 -0.0512 C 0 5 0 0 0 0 0 0 0 0 0 0 6 | 2.2151 -0.0369 -0.0512 O 0 3 0 0 0 0 0 0 0 0 0 0 7 | 1 2 3 0 0 0 0 8 | M CHG 2 1 -1 2 1 9 | M END 10 | $$$$ 11 | -------------------------------------------------------------------------------- /data/cocaine.smi: -------------------------------------------------------------------------------- 1 | COC(=O)[C@H]1[C@@H](OC(=O)c2ccccc2)C[C@@H]2CC[C@H]1N2C cocaine 2 | -------------------------------------------------------------------------------- /data/ethanol.smi: -------------------------------------------------------------------------------- 1 | CCO ethanol 2 | -------------------------------------------------------------------------------- /data/ethanol.uhd.dix.ref: -------------------------------------------------------------------------------- 1 | UHD-1.0.0 2 | C 1 3 | C,CH2O 2 4 | C,CH2O,H4 3 5 | C,CH3 4 6 | C,CH3,H2O 5 7 | C,CH3,H2O,H 6 8 | H 7 9 | H,C 8 10 | H,C,CH2 9 11 | H,C,CH2,H2O 10 12 | H,C,CH2,H2O,H 11 13 | H,C,CHO 12 14 | H,C,CHO,H4 13 15 | H,O 14 16 | H,O,C 15 17 | H,O,C,CH2 16 18 | H,O,C,CH2,H3 17 19 | O 18 20 | O,CH 19 21 | O,CH,CH2 20 22 | O,CH,CH2,H3 21 23 | 
-------------------------------------------------------------------------------- /data/ethanol.uhd.ref: -------------------------------------------------------------------------------- 1 | ethanol,0.0,[1:2;2:1;3:1;4:1;5:1;6:1;7:6;8:5;9:3;10:3;11:3;12:2;13:2;14:1;15:1;16:1;17:1;18:1;19:1;20:1;21:1] 2 | -------------------------------------------------------------------------------- /data/gen_mols.txt: -------------------------------------------------------------------------------- 1 | #atoms:13 genmol_000001_NCGC00260832-01_f01,NCGC00261763-01_f01 2 | 0 1,6,3,0 3 | 1 1,6,2,0 4 | 2 1,6,2,0 5 | 3 1,6,2,0 6 | 4 1,6,2,0 7 | 5 1,6,2,0 8 | 6 0,7,2,0 9 | 7 1,6,3,0 10 | 8 1,6,2,0 11 | 9 1,6,2,0 12 | 10 1,6,2,0 13 | 11 1,7,2,0 14 | 12 1,6,2,0 15 | #bonds:13 16 | 0 ~ 1 17 | 1 ~ 2 18 | 2 ~ 3 19 | 3 ~ 4 20 | 4 ~ 5 21 | 5 ~ 0 22 | 6 - 7 23 | 7 ~ 8 24 | 8 ~ 9 25 | 9 ~ 10 26 | 10 ~ 11 27 | 11 ~ 12 28 | 12 ~ 7 29 | #atoms:12 genmol_000002_NCGC00261552-01_f02,NCGC00261552-01_f02 30 | 0 1,6,3,0 31 | 1 1,6,2,0 32 | 2 1,6,2,0 33 | 3 1,6,2,0 34 | 4 1,6,2,0 35 | 5 1,6,2,0 36 | 6 1,6,3,0 37 | 7 1,6,2,0 38 | 8 1,6,2,0 39 | 9 1,6,2,0 40 | 10 1,6,2,0 41 | 11 1,6,2,0 42 | #bonds:12 43 | 0 ~ 1 44 | 1 ~ 2 45 | 2 ~ 3 46 | 3 ~ 4 47 | 4 ~ 5 48 | 5 ~ 0 49 | 6 ~ 7 50 | 7 ~ 8 51 | 8 ~ 9 52 | 9 ~ 10 53 | 10 ~ 11 54 | 11 ~ 6 55 | #atoms:2 genmol_000003_NCGC00260832-01_f00,NCGC00260832-01_f00 56 | 0 0,6,1,0 57 | 1 0,6,1,0 58 | #bonds:0 59 | -------------------------------------------------------------------------------- /data/h2o_1conf.sdf: -------------------------------------------------------------------------------- 1 | water 2 | OpenBabel07042215033D 3 | 4 | 3 2 0 0 0 0 0 0 0 0999 V2000 5 | 0.9794 0.0672 0.0986 H 0 0 0 0 0 0 0 0 0 0 0 0 6 | 1.9473 0.0539 0.0541 O 0 0 0 0 0 0 0 0 0 0 0 0 7 | 2.2261 0.3184 0.9436 H 0 0 0 0 0 0 0 0 0 0 0 0 8 | 1 2 1 0 0 0 0 9 | 2 3 1 0 0 0 0 10 | M END 11 | $$$$ 12 | -------------------------------------------------------------------------------- /data/merge.txt: 
-------------------------------------------------------------------------------- 1 | n1 1 -3 2 | n2 2 -2 3 | n3 3 -1 4 | n4 4 1 5 | n5 5 2 6 | n6 6 3 7 | -------------------------------------------------------------------------------- /data/ptable.txt: -------------------------------------------------------------------------------- 1 | #anum Symbol prime 2 | 1 H 2 3 | 2 He 3 4 | 3 Li 5 5 | 4 Be 7 6 | 5 B 11 7 | 6 C 13 8 | 7 N 17 9 | 8 O 19 10 | 9 F 23 11 | 10 Ne 29 12 | 11 Na 31 13 | 12 Mg 37 14 | 13 Al 41 15 | 14 Si 43 16 | 15 P 47 17 | 16 S 53 18 | 17 Cl 59 19 | 18 Ar 61 20 | 19 K 67 21 | 20 Ca 71 22 | 21 Sc 73 23 | 22 Ti 79 24 | 23 V 83 25 | 24 Cr 89 26 | 25 Mn 97 27 | 26 Fe 101 28 | 27 Co 103 29 | 28 Ni 107 30 | 29 Cu 109 31 | 30 Zn 113 32 | 31 Ga 127 33 | 32 Ge 131 34 | 33 As 137 35 | 34 Se 139 36 | 35 Br 149 37 | 36 Kr 151 38 | 37 Rb 157 39 | 38 Sr 163 40 | 39 Y 167 41 | 40 Zr 173 42 | 41 Nb 179 43 | 42 Mo 181 44 | 43 Tc 191 45 | 44 Ru 193 46 | 45 Rh 197 47 | 46 Pd 199 48 | 47 Ag 211 49 | 48 Cd 223 50 | 49 In 227 51 | 50 Sn 229 52 | 51 Sb 233 53 | 52 Te 239 54 | 53 I 241 55 | 54 Xe 251 56 | 55 Cs 257 57 | 56 Ba 263 58 | 57 La 269 59 | 58 Ce 271 60 | 59 Pr 277 61 | 60 Nd 281 62 | 61 Pm 283 63 | 62 Sm 293 64 | 63 Eu 307 65 | 64 Gd 311 66 | 65 Tb 313 67 | 66 Dy 317 68 | 67 Ho 331 69 | 68 Er 337 70 | 69 Tm 347 71 | 70 Yb 349 72 | 71 Lu 353 73 | 72 Hf 359 74 | 73 Ta 367 75 | 74 W 373 76 | 75 Re 379 77 | 76 Os 383 78 | 77 Ir 389 79 | 78 Pt 397 80 | 79 Au 401 81 | 80 Hg 409 82 | 81 Tl 419 83 | 82 Pb 421 84 | 83 Bi 431 85 | 84 Po 433 86 | 85 At 439 87 | 86 Rn 443 88 | 87 Fr 449 89 | 88 Ra 457 90 | 89 Ac 461 91 | 90 Th 463 92 | 91 Pa 467 93 | 92 U 479 94 | 93 Np 487 95 | 94 Pu 491 96 | 95 Am 499 97 | 96 Cm 503 98 | 97 Bk 509 99 | 98 Cf 521 100 | 99 Es 523 101 | 100 Fm 541 102 | 101 Md 547 103 | 102 No 557 104 | 103 Lr 563 105 | 104 Rf 569 106 | 105 Db 571 107 | 106 Sg 577 108 | 107 Bh 587 109 | 108 Hs 593 110 | 109 Mt 599 111 | 110 Ds 601 112 | 111 Rg 607 113 | 112 Cn 
613 114 | 113 Nh 617 115 | 114 Fl 619 116 | 115 Mc 631 117 | 116 Lv 641 118 | 117 Ts 643 119 | 118 Og 647 120 | -------------------------------------------------------------------------------- /data/test_HYD_group.sdf: -------------------------------------------------------------------------------- 1 | test 2 | OpenBabel08222215263D 3 | 4 | 17 16 0 0 0 0 0 0 0 0999 V2000 5 | 0.9307 -0.0311 -0.0651 C 0 0 0 0 0 0 0 0 0 0 0 0 6 | 0.4214 -0.4462 -1.4445 C 0 0 0 0 0 0 0 0 0 0 0 0 7 | 0.4214 -1.0181 0.9841 C 0 0 0 0 0 0 0 0 0 0 0 0 8 | 0.4214 1.3711 0.2651 C 0 0 0 0 0 0 0 0 0 0 0 0 9 | 2.4586 -0.0311 -0.0651 C 0 0 0 0 0 0 0 0 0 0 0 0 10 | -0.6743 -0.4546 -1.4725 H 0 0 0 0 0 0 0 0 0 0 0 0 11 | 0.7728 -1.4507 -1.7071 H 0 0 0 0 0 0 0 0 0 0 0 0 12 | 0.7728 0.2465 -2.2179 H 0 0 0 0 0 0 0 0 0 0 0 0 13 | -0.6743 -1.0382 1.0055 H 0 0 0 0 0 0 0 0 0 0 0 0 14 | 0.7728 -0.7433 1.9853 H 0 0 0 0 0 0 0 0 0 0 0 0 15 | 0.7728 -2.0342 0.7709 H 0 0 0 0 0 0 0 0 0 0 0 0 16 | -0.6743 1.3996 0.2718 H 0 0 0 0 0 0 0 0 0 0 0 0 17 | 0.7728 2.1008 -0.4735 H 0 0 0 0 0 0 0 0 0 0 0 0 18 | 0.7728 1.6945 1.2517 H 0 0 0 0 0 0 0 0 0 0 0 0 19 | 2.8515 -1.0271 -0.2996 H 0 0 0 0 0 0 0 0 0 0 0 0 20 | 2.8515 0.2638 0.9148 H 0 0 0 0 0 0 0 0 0 0 0 0 21 | 2.8515 0.6701 -0.8104 H 0 0 0 0 0 0 0 0 0 0 0 0 22 | 1 2 1 0 0 0 0 23 | 1 3 1 0 0 0 0 24 | 1 4 1 0 0 0 0 25 | 1 5 1 0 0 0 0 26 | 2 6 1 0 0 0 0 27 | 2 7 1 0 0 0 0 28 | 2 8 1 0 0 0 0 29 | 3 9 1 0 0 0 0 30 | 3 10 1 0 0 0 0 31 | 3 11 1 0 0 0 0 32 | 4 12 1 0 0 0 0 33 | 4 13 1 0 0 0 0 34 | 4 14 1 0 0 0 0 35 | 5 15 1 0 0 0 0 36 | 5 16 1 0 0 0 0 37 | 5 17 1 0 0 0 0 38 | M END 39 | $$$$ 40 | -------------------------------------------------------------------------------- /data/test_HYD_group.smi: -------------------------------------------------------------------------------- 1 | C(C)(C)(C)(C) test 2 | -------------------------------------------------------------------------------- /data/test_mols.txt: 
-------------------------------------------------------------------------------- 1 | 1,0.0,[0:1;2:1;3:5] 2 | 2,0.0,[0:3;1:3;2:3;3:3] 3 | 3,0.0,[2:4] 4 | 4,0.0,[0:5] 5 | 5,0.0,[1:5] 6 | 6,0.0,[2:5] 7 | 7,0.0,[3:5] 8 | -------------------------------------------------------------------------------- /deepsmi_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #set -x 4 | 5 | head ~/src/FMGO/data/TCM_20k.smi > input.smi 6 | dos2unix input.smi 7 | ./bin/molenc_deepsmi.py --no-rings -i input.smi -o output.dsmi 8 | ./bin/molenc_deepsmi.py --no-rings -d -i output.dsmi -o output.smi 9 | diff input.smi output.smi 10 | rm -f output.{dsmi,smi} 11 | 12 | ./bin/molenc_deepsmi.py --no-branches -i input.smi -o output.dsmi 13 | ./bin/molenc_deepsmi.py --no-branches -d -i output.dsmi -o output.smi 14 | diff input.smi output.smi 15 | rm -f output.{dsmi,smi} 16 | 17 | ./bin/molenc_deepsmi.py --no-branches --no-rings -i input.smi -o output.dsmi 18 | ./bin/molenc_deepsmi.py --no-branches --no-rings -d -i output.dsmi -o output.smi 19 | diff input.smi output.smi 20 | rm -f output.{dsmi,smi} 21 | -------------------------------------------------------------------------------- /doc/Ester_KDD_1996_DBSCANclustering.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UnixJunkie/molenc/edc27db8206e6cbca4409b962426c94f3d14e18d/doc/Ester_KDD_1996_DBSCANclustering.pdf -------------------------------------------------------------------------------- /doc/Shrivastava_2016_ExactWeightedMinwiseHashing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UnixJunkie/molenc/edc27db8206e6cbca4409b962426c94f3d14e18d/doc/Shrivastava_2016_ExactWeightedMinwiseHashing.pdf -------------------------------------------------------------------------------- /dune-project: 
-------------------------------------------------------------------------------- 1 | (lang dune 1.11) 2 | (name molenc) 3 | -------------------------------------------------------------------------------- /fcodec: -------------------------------------------------------------------------------- 1 | _build/default/src/molenc_fcodec.exe -------------------------------------------------------------------------------- /histo.gpl: -------------------------------------------------------------------------------- 1 | 2 | set xlabel 'score' 3 | set ylabel 'frequency' 4 | 5 | # gauss1(x) = a1 / (sigma1*sqrt(2.*pi)) * exp(-(x-mu1)**2. / (2.*sigma1**2)) 6 | # gauss2(x) = a2 / (sigma2*sqrt(2.*pi)) * exp(-(x-mu2)**2. / (2.*sigma2**2)) 7 | 8 | gauss1(x) = a1/(sqrt(2*pi)*sigma1)*exp(-(x-mean1)**2/(2*sigma1**2)) 9 | gauss2(x) = a2/(sqrt(2*pi)*sigma2)*exp(-(x-mean2)**2/(2*sigma2**2)) 10 | 11 | # FBR: we need to init the mean with a good value so that the optim 12 | # will converge 13 | mean1 = -8 14 | mean2 = -8 15 | 16 | fit gauss1(x) '/tmp/lean_histo_abe414.txt' u 1:2 via a1,sigma1,mean1 17 | fit gauss2(x) '/tmp/lean_histo_abe414.txt' u 1:3 via a2,sigma2,mean2 18 | 19 | plot '/tmp/lean_histo_abe414.txt' u 1:2 w l t 'smaller sample', \ 20 | '' u ($1+0.01):3 w l t 'bigger sample', \ 21 | gauss1(x) t 'smaller fit', \ 22 | gauss2(x) t 'bigger fit' 23 | -------------------------------------------------------------------------------- /kb_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | 5 | ~/src/molenc/kbe -i all_uniq_std.txt -k 64 > test_64_1xCPU.txt 6 | sort test_64_1xCPU.txt -o test_64_1xCPU.txt 7 | 8 | ~/src/molenc/kbe -np 16 -i all_uniq_std.txt -k 64 > test_64_16xCPU.txt 9 | sort test_64_16xCPU.txt -o test_64_16xCPU.txt 10 | 11 | diff test_64_1xCPU.txt test_64_16xCPU.txt 12 | -------------------------------------------------------------------------------- /mol_frag_test.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set -x #DEBUG 4 | 5 | # --view --> call mview on .smi files 6 | MVIEW="" 7 | if [ "$1" == "--view" ] || [ "$2" == "--view" ]; then 8 | MVIEW="1" 9 | fi 10 | 11 | # --big --> work on the "big" dataset 12 | BIG="" 13 | if [ "$1" == "--big" ] || [ "$2" == "--big" ]; then 14 | BIG="1" 15 | fi 16 | 17 | if [ "$BIG" == "" ]; then 18 | # clean 19 | rm -f data/3.to_frag data/3_frags.txt data/3_frags.smi \ 20 | data/3_genmols.txt data/3_genmols.smi 21 | # regen 22 | ./bin/molenc_frag.py -i data/3.smi -o data/3.to_frag 23 | [ "$MVIEW" == "1" ] && mview data/3.smi & 24 | ./molenc_frag -im data/3.to_frag -of data/3_frags.txt -s 1234 25 | ./bin/molenc_frag2smi.py -i data/3_frags.txt -o data/3_frags.smi 26 | [ "$MVIEW" == "1" ] && mview data/3_frags.smi & 27 | ./molenc_frag -if data/3_frags.txt -om data/3_genmols.txt -s 1234 -n 20 28 | ./bin/molenc_mol2smi.py -i data/3_genmols.txt -o data/3_genmols.smi 29 | cut -f1 data/3_genmols.smi | sort -u > data/3_genmols_uniq.smi 30 | [ "$MVIEW" == "1" ] && mview data/3_genmols_uniq.smi & 31 | else 32 | IN=data/chembl_antivirals 33 | ./bin/molenc_frag.py -i $IN.smi -o $IN.to_frag --draw 34 | ./molenc_frag -im $IN.to_frag -of $IN.frags -s 1234 35 | ./bin/molenc_frag2smi.py -i $IN.frags -o $IN.frags.smi 36 | ./molenc_frag -if $IN.frags -om $IN.mols -s 1234 -n 50 37 | ./bin/molenc_mol2smi.py -i $IN.mols -o $IN.mols.smi 38 | fi 39 | -------------------------------------------------------------------------------- /molenc_frag: -------------------------------------------------------------------------------- 1 | ./_build/install/default/bin/molenc_frag -------------------------------------------------------------------------------- /rfp: -------------------------------------------------------------------------------- 1 | _build/default/src/molenc_RFP.exe -------------------------------------------------------------------------------- 
/smisur_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x #DEBUG 4 | 5 | # clean 6 | rm -f data/chembl_antivirals.frags.smi data/chembl_antivirals.genmols.txt 7 | # fragment 8 | ./bin/molenc_smisur.py --seed 1234 \ 9 | -i data/chembl_antivirals.smi \ 10 | -o data/chembl_antivirals.frags.smi 11 | # assemble 12 | ./bin/molenc_smisur.py --seed 1234 --assemble \ 13 | -i data/chembl_antivirals.frags.smi \ 14 | -o data/chembl_antivirals.genmols.txt 15 | -------------------------------------------------------------------------------- /src/MSE_mol.ml: -------------------------------------------------------------------------------- 1 | (* Copyright (C) 2020, Francois Berenger 2 | 3 | Yamanishi laboratory, 4 | Department of Bioscience and Bioinformatics, 5 | Faculty of Computer Science and Systems Engineering, 6 | Kyushu Institute of Technology, 7 | 680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *) 8 | 9 | (* Multi-Scale-Encoded molecule *) 10 | 11 | open Printf 12 | 13 | module L = MyList 14 | module Log = Dolog.Log 15 | module String = BatString 16 | module StringMap = BatMap.String 17 | 18 | type t = { name: string; map: int StringMap.t } 19 | 20 | let create name map = 21 | { name; map } 22 | 23 | let get_name x = 24 | x.name 25 | 26 | let get_map x = 27 | x.map 28 | 29 | let feat_count_of_string s = 30 | try Scanf.sscanf s "%s %d" (fun s d -> (s, d)) 31 | with exn -> (eprintf "MSE_mol.feat_count_of_string: cannot parse: %s" s; 32 | raise exn) 33 | 34 | (* to construct one molecules with all its constituent lines 35 | already read from the input file *) 36 | let read_one = function 37 | | [] -> failwith "MSE_mol.read_one: empty list" 38 | | name_line :: feat_count_strs -> 39 | (* molecule separator is a line starting with a '#' char *) 40 | assert(String.get name_line 0 = '#'); 41 | let name = String.lchop name_line in (* remove it *) 42 | let map = 43 | List.fold_left (fun acc line -> 44 | let feat, 
count = feat_count_of_string line in 45 | (* feature cannot already be here; otherwise, 46 | there was a problem during encoding of the molecule *) 47 | if StringMap.mem feat acc then 48 | Log.warn "mol: %s dup feat: %s" name feat; 49 | StringMap.add feat count acc 50 | ) StringMap.empty feat_count_strs in 51 | create name map 52 | 53 | let previous_name = ref "" 54 | 55 | exception Break 56 | 57 | (* get lines for just one molecule (i.e. for one call to read_one after) *) 58 | let get_lines input = 59 | let acc = ref [] in 60 | if !previous_name = "" then 61 | begin 62 | let line = input_line input in 63 | assert(BatString.starts_with line "#"); (* enforce name line *) 64 | previous_name := line 65 | end; 66 | acc := [!previous_name]; 67 | try 68 | while true do 69 | let line' = input_line input in 70 | if BatString.starts_with line' "#" then 71 | (* this is the start of another molecule *) 72 | begin 73 | previous_name := line'; 74 | raise Break 75 | end 76 | else 77 | acc := line' :: !acc 78 | done; 79 | assert(false) (* for typing: should never be reached at exec *) 80 | with Break -> L.rev !acc 81 | | End_of_file -> 82 | begin 83 | previous_name := ""; 84 | L.rev !acc 85 | end 86 | 87 | let of_lines lines = 88 | let rec loop acc ls = 89 | match ls with 90 | | [] -> L.rev acc 91 | | _ -> 92 | let name_l, rest = 93 | L.fold_while 94 | (fun _acc l -> String.starts_with l "#") 95 | (fun acc x -> x :: acc) [] ls in 96 | (match name_l with 97 | | [name] -> 98 | (let feat_counts, remaining_mols = 99 | L.fold_while 100 | (fun _acc l -> not (String.starts_with l "#")) 101 | (fun acc x -> x :: acc) [] rest in 102 | let mol = read_one (name :: feat_counts) in 103 | loop (mol :: acc) remaining_mols) 104 | | _ -> assert(false)) in 105 | loop [] lines 106 | -------------------------------------------------------------------------------- /src/ap_types.ml: -------------------------------------------------------------------------------- 1 | (* Copyright (C) 2020, Francois 
Berenger 2 | 3 | Yamanishi laboratory, 4 | Department of Bioscience and Bioinformatics, 5 | Faculty of Computer Science and Systems Engineering, 6 | Kyushu Institute of Technology, 7 | 680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *) 8 | 9 | (* read files output by ./bin/type_atoms.py *) 10 | 11 | module A = Array 12 | module IntSet = BatSet.Int 13 | module L = BatList 14 | 15 | let read_one counter input = 16 | (* "#atoms:14 caffeine" *) 17 | let atoms_header = input_line input in 18 | let nb_atoms, mol_name = 19 | Scanf.sscanf atoms_header "#atoms:%d %s" 20 | (fun nb_atoms name -> (nb_atoms, name)) in 21 | (* read atoms *) 22 | let atom_lines = Utls.read_n_lines nb_atoms input in 23 | let atoms = 24 | L.map (fun l -> 25 | Scanf.sscanf l "%d %s" 26 | (fun _index typ -> PiEltHA.of_string typ) 27 | ) atom_lines in 28 | let atom_types = A.of_list atoms in 29 | (* read bonds header; like "#bonds:15" *) 30 | let bonds_header = input_line input in 31 | let nb_bonds = Scanf.sscanf bonds_header "#bonds:%d" (fun n -> n) in 32 | (* read bonds *) 33 | let bond_lines = Utls.read_n_lines nb_bonds input in 34 | let succs_table = A.make nb_atoms IntSet.empty in 35 | L.iter (fun bond_line -> 36 | Scanf.sscanf bond_line "%d %d" (fun start stop -> 37 | assert(start <> stop); 38 | (* we need to add the bond two times, because the molecular 39 | graph is undirected *) 40 | (* start -> stop *) 41 | succs_table.(start) <- IntSet.add stop succs_table.(start); 42 | (* stop -> start *) 43 | succs_table.(stop) <- IntSet.add start succs_table.(stop) 44 | ) 45 | ) bond_lines; 46 | (* read distance matrix *) 47 | (* matrix header line *) 48 | let matrix_header = input_line input in 49 | let diameter = Scanf.sscanf matrix_header "#diameter:%d" (fun n -> n) in 50 | (* matrix' content *) 51 | let matrix_lines = Utls.read_n_lines nb_atoms input in 52 | let matrix = Array.make_matrix nb_atoms nb_atoms 0 in 53 | L.iteri (fun i line -> 54 | let dist_strings = BatString.split_on_string line ~by:" " 
in 55 | L.iteri (fun j str -> 56 | let d = int_of_string str in 57 | matrix.(i).(j) <- d 58 | ) dist_strings 59 | ) matrix_lines; 60 | let nodes = 61 | A.mapi (fun i typ -> 62 | Node.create typ succs_table.(i) 63 | ) atom_types in 64 | incr counter; 65 | Mini_mol.create mol_name nodes diameter matrix 66 | -------------------------------------------------------------------------------- /src/atom_env.ml: -------------------------------------------------------------------------------- 1 | (* Copyright (C) 2020, Francois Berenger 2 | 3 | Yamanishi laboratory, 4 | Department of Bioscience and Bioinformatics, 5 | Faculty of Computer Science and Systems Engineering, 6 | Kyushu Institute of Technology, 7 | 680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *) 8 | 9 | (* atom environment *) 10 | 11 | open Printf 12 | 13 | module L = BatList 14 | module Log = Dolog.Log 15 | 16 | (* layer = (depth, counted-atoms) *) 17 | type layer = int * ((PiEltHA.t * int) list) 18 | (* center-atom layers *) 19 | type t = layer list 20 | 21 | let counted_types_to_string (l: (PiEltHA.t * int) list): string = 22 | let buff = Buffer.create 80 in 23 | L.iteri (fun i (x, count) -> 24 | bprintf buff (if i = 0 then "%s:%d" else ",%s:%d") 25 | (PiEltHA.to_string x) count 26 | ) l; 27 | Buffer.contents buff 28 | 29 | let counted_types_of_string (s: string): (PiEltHA.t * int) list = 30 | let strings = BatString.split_on_string s ~by:"," in 31 | L.map (fun str -> Scanf.sscanf str "%s:%d" Utls.make_pair) strings 32 | 33 | let layer_to_string ((depth, counted_types): layer): string = 34 | sprintf "%d_%s" depth (counted_types_to_string counted_types) 35 | 36 | let layer_of_string (str: string): layer = 37 | Scanf.sscanf str "%d_%s" (fun d s -> 38 | (d, counted_types_of_string s) 39 | ) 40 | 41 | let to_string (layers: t): string = 42 | let buff = Buffer.create 80 in 43 | L.iteri (fun i layer -> 44 | bprintf buff (if i = 0 then "%s" else ";%s") 45 | (layer_to_string layer) 46 | ) layers; 47 | Buffer.contents 
buff 48 | 49 | let of_string (s: string): t = 50 | let layer_strings = BatString.split_on_string s ~by:";" in 51 | L.map layer_of_string layer_strings 52 | 53 | (* parse the 1st line of a .idx file *) 54 | let parse_index_comment fn = 55 | let header, index_lines = Utls.maybe_extract_comment_header fn in 56 | match header with 57 | | None -> (-1, []) 58 | | Some comment -> 59 | let radius = Scanf.sscanf comment "#radius=%d" (fun r -> r) in 60 | (radius, index_lines) 61 | 62 | (* parse the 1st line of a .mop2d file *) 63 | let parse_molecules_comment fn = 64 | let header, mol_lines = Utls.maybe_extract_comment_header fn in 65 | match header with 66 | | None -> (-1, "/dev/null", mol_lines) 67 | | Some comment -> 68 | let radius, index_fn = 69 | Scanf.sscanf comment "#radius=%d;index=%s" 70 | (fun r fn -> (r, fn)) in 71 | (radius, index_fn, mol_lines) 72 | 73 | (* parse the 1st line of an already opened .mop2d file 74 | (and advance the file pointer) *) 75 | let parse_comment input = 76 | try (* we are parsing a valid .mop2d file *) 77 | Scanf.sscanf (input_line input) 78 | "#radius=%d;index=%s" (fun r fn -> (r, fn)) 79 | with (* we are not *) 80 | Scanf.Scan_failure _ -> (-1, "/dev/null") 81 | 82 | (* extract the MOP2D atom env. 
to bitstring index HT *) 83 | let restore_mop2d_index fn = 84 | let radius, index_lines = parse_index_comment fn in 85 | let mop2d_envs = L.map of_string index_lines in 86 | let res = Hashtbl.create 11 in 87 | L.iteri (fun i env -> 88 | (* eprintf "%s\n" (Mop2d_env.to_string env); *) 89 | assert(not (Hashtbl.mem res env)); 90 | Hashtbl.add res env i 91 | ) mop2d_envs; 92 | Log.info "index size: %d" (Hashtbl.length res); 93 | (radius, res) 94 | -------------------------------------------------------------------------------- /src/atom_pair.ml: -------------------------------------------------------------------------------- 1 | (* Copyright (C) 2020, Francois Berenger 2 | 3 | Yamanishi laboratory, 4 | Department of Bioscience and Bioinformatics, 5 | Faculty of Computer Science and Systems Engineering, 6 | Kyushu Institute of Technology, 7 | 680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. 8 | 9 | An atom pair *) 10 | 11 | (* FBR: TO DROP *) 12 | 13 | type t = { src: PiEltHA.t; (* source atom *) 14 | dst: PiEltHA.t; (* destination atom *) 15 | dist: int } (* distance between them in bonds *) 16 | 17 | (* canonicalization is obtained by sorting the types *) 18 | let create x y dist = 19 | if PiEltHA.compare x y <= 0 then 20 | { src = x; dst = y; dist } 21 | else 22 | { src = y; dst = x; dist } 23 | 24 | let to_string { src; dst; dist } = 25 | Printf.sprintf "%s-%d-%s" src dist dst 26 | 27 | let dist x = 28 | x.dist 29 | -------------------------------------------------------------------------------- /src/bloom.ml: -------------------------------------------------------------------------------- 1 | 2 | (* A counted Bloom filter *) 3 | 4 | module A = BatArray 5 | module Fp = Fingerprint 6 | module L = BatList 7 | module Log = Dolog.Log 8 | 9 | type t = int array array (* input feature index (0..N-1) to output feature 10 | indexes mapping (0..M-1) *) 11 | 12 | let distinct_rands rng n bound = 13 | let rec loop acc count = 14 | if count = n then 15 | acc 16 | else 17 | let 
cand = Random.State.int rng bound in 18 | if List.mem cand acc then 19 | loop acc count (* retry *) 20 | else 21 | loop (cand :: acc) (count + 1) in 22 | loop [] 0 23 | 24 | (* n: input vector dimension 25 | k: number of "hash" functions; 26 | number of output features "turned ON" by a single input feature 27 | m: output vector dimension *) 28 | let init n k m = 29 | let res = Array.make_matrix n k 0 in 30 | let rng = Random.State.make [|3141596|] in 31 | for i = 0 to n - 1 do 32 | let rands = distinct_rands rng k m in 33 | L.iteri (fun j rand -> 34 | res.(i).(j) <- rand 35 | ) rands 36 | done; 37 | (* log the number of collisions 38 | (different input features mapping to the same set of output features *) 39 | let collisions = ref 0 in 40 | let sorted = A.copy res in 41 | A.sort compare sorted; 42 | for i = 1 to n - 1 do 43 | if sorted.(i - 1) = sorted.(i) then 44 | incr collisions; 45 | done; 46 | (if !collisions > 0 then 47 | Log.warn "Bloom.init(%d,%d,%d): %d collisions" n k m !collisions 48 | ); 49 | (n, k, m, res) 50 | 51 | let encode (_n, k, m, mappings) fp = 52 | let kvs = Fp.key_value_pairs fp in (* sparse input vector *) 53 | let res = A.make m 0 in (* dense output vector *) 54 | L.iter (fun (key, value) -> 55 | let output_indexes = mappings.(key) in 56 | (* increment all corresponding output features *) 57 | for i = 0 to k - 1 do 58 | let j = output_indexes.(i) in 59 | res.(j) <- res.(j) + value 60 | done 61 | ) kvs; 62 | res 63 | -------------------------------------------------------------------------------- /src/bond.ml: -------------------------------------------------------------------------------- 1 | (* Copyright (C) 2020, Francois Berenger 2 | 3 | Yamanishi laboratory, 4 | Department of Bioscience and Bioinformatics, 5 | Faculty of Computer Science and Systems Engineering, 6 | Kyushu Institute of Technology, 7 | 680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. 
*) 8 | 9 | (* very small bond module to help compute atom environments (a la molprint2d) 10 | from MOL2 files *) 11 | 12 | open Printf 13 | 14 | type t = { idx: int ; 15 | src: int ; 16 | dst: int } 17 | 18 | (* indexes start at 1 in the MOL2 file format *) 19 | let create (idx: int) (src: int) (dst: int): t = 20 | { idx = idx - 1; src = src - 1; dst = dst - 1 } 21 | 22 | let dummy = create (-1) (-1) (-1) 23 | 24 | (* example line (output of OpenBabel): 25 | " 1 1 11 ar" *) 26 | let of_mol2_line l = 27 | try Scanf.sscanf l " %d %d %d %s" 28 | (fun idx src dst _bullshit -> create idx src dst) 29 | with _ -> failwith ("Bond.of_mol2_line: could not parse: " ^ l) 30 | 31 | let to_string (a: t): string = 32 | sprintf "%d %d %d" a.idx a.src a.dst 33 | -------------------------------------------------------------------------------- /src/dune: -------------------------------------------------------------------------------- 1 | 2 | (library 3 | (name molenc) 4 | (public_name molenc) 5 | (modules ap_types atom_env fingerprint fpMol atom_pair mini_mol MSE_mol 6 | myList node piEltHA scale utls WMH norm bloom index rdkit sdf_3D gram ptable) 7 | (libraries batteries dolog bst parany pyml vector3 line_oriented str)) 8 | 9 | ;; installed executables / public targets 10 | (executables 11 | (names encoder decoder filter butina pubchem_decoder uniq to_dense get_mol 12 | rank ap_encoder prune merge MST lig_box shannon fragmentable_mol 13 | indexer pareto finder AP_BBAD split sdf_read dsmi BBAD molenc_AP molenc_UHD shuf) 14 | (public_names molenc_e molenc_d molenc_filter molenc_cluster 15 | molenc_pubchem_decoder molenc_uniq molenc_dense molenc_get 16 | molenc_rank molenc_ap molenc_prune molenc_merge molenc_mst 17 | molenc_ligbox molenc_shannon molenc_frag molenc_indexer 18 | molenc_pareto molenc_finder molenc_apbbad molenc_split 19 | molenc_sdf_read molenc_dsmi molenc_bbad molenc_AP molenc_UHD molenc_shuf) 20 | (modules encoder decoder filter butina pubchem_decoder uniq to_dense 
get_mol 21 | sybyl syb_atom mol2 sdf smi ph4 rank ap_encoder prune merge MST 22 | palette gnuplot lig_box shannon fragmentable_mol indexer pareto 23 | finder AP_BBAD split sdf_read dsmi BBAD molenc_AP molenc_UHD shuf) 24 | (libraries molenc bst batteries dolog minicli parany cpm dokeysto 25 | ocamlgraph vector3 line_oriented pyml)) 26 | 27 | ;; never installed executables 28 | (executables 29 | (names fp_test wmh_test wmh_bench wmh_unit_test test_RS) 30 | (modules fp_test wmh_test wmh_bench wmh_unit_test test_RS) 31 | (libraries molenc batteries dolog minicli)) 32 | -------------------------------------------------------------------------------- /src/encoder.ml: -------------------------------------------------------------------------------- 1 | (* Copyright (C) 2020, Francois Berenger 2 | 3 | Yamanishi laboratory, 4 | Department of Bioscience and Bioinformatics, 5 | Faculty of Computer Science and Systems Engineering, 6 | Kyushu Institute of Technology, 7 | 680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *) 8 | 9 | (* molecular encoder: a molecule is a list of atom environments. 10 | canonicalization is done by sorting (atoms in an environment 11 | as well as the list of environments that constitutes 12 | the entirely encoded molecule) *) 13 | 14 | open Printf 15 | 16 | module Ap_types = Molenc.Ap_types 17 | module Atom_env = Molenc.Atom_env 18 | module CLI = Minicli.CLI 19 | module L = BatList 20 | module Log = Dolog.Log 21 | module LO = Line_oriented 22 | module Mini_mol = Molenc.Mini_mol 23 | module Ht = BatHashtbl 24 | module Scale = Molenc.Scale 25 | module StringSet = BatSet.String 26 | module Utls = Molenc.Utls 27 | 28 | let read_one counter input () = 29 | try 30 | let m = Ap_types.read_one counter input in 31 | if !counter mod 1000 = 0 then 32 | (* user feedback *) 33 | eprintf "read %d\r%!" 
!counter; 34 | m 35 | with End_of_file -> 36 | begin 37 | Log.info "read %d" !counter; 38 | raise Parany.End_of_input 39 | end 40 | 41 | let process_one radii m = 42 | let buff = Buffer.create 1024 in 43 | let name = Mini_mol.get_name m in 44 | bprintf buff "#%s\n" name; 45 | let seen_envs = Ht.create 1000 in 46 | L.iter (fun radius -> 47 | let envs = Mini_mol.encode radius m in 48 | L.iter (fun (env, count) -> 49 | (* only output envs that were not already encountered 50 | at lower radius *) 51 | if not (Ht.mem seen_envs env) then 52 | begin 53 | bprintf buff "%s %d\n" (Atom_env.to_string env) count; 54 | Ht.add seen_envs env () 55 | end 56 | ) envs 57 | ) radii; 58 | Buffer.contents buff 59 | 60 | let write_one output str = 61 | fprintf output "%s" str 62 | 63 | let main () = 64 | Log.(set_log_level INFO); 65 | Log.color_on (); 66 | let argc, args = CLI.init () in 67 | if argc = 1 then 68 | (eprintf "usage:\n \ 69 | %s -i molecules.{types|ph4} -r {radius|srad:frad} -o out.idx\n \ 70 | -i : where to read molecules from\n \ 71 | -r {|:}: encoding radius or radii range\n \ 72 | -d : read feature dico from file\n \ 73 | -o : where to write encoded molecules\n \ 74 | [-n ]: max jobs in parallel\n" 75 | Sys.argv.(0); 76 | exit 1); 77 | let input_fn = CLI.get_string ["-i"] args in 78 | let output_fn = CLI.get_string ["-o"] args in 79 | let nprocs = CLI.get_int_def ["-n"] args 1 in 80 | let scale = 81 | if L.mem "-r" args && L.mem "-d" args then 82 | (* enforce that radius ranges are equal *) 83 | let r_scale = Scale.of_string (CLI.get_string ["-r"] args) in 84 | let d_scale = Scale.of_dictionary_header (CLI.get_string ["-d"] args) in 85 | Utls.enforce (r_scale = d_scale) 86 | (sprintf "Encoder: -r and -d don't agree: r_scale=%s d_scale=%s" 87 | (Scale.to_string r_scale) (Scale.to_string d_scale)); 88 | r_scale 89 | else 90 | match CLI.get_string_opt ["-r"] args with 91 | | Some r_str -> Scale.of_string r_str 92 | | None -> 93 | let dico_fn = CLI.get_string ["-d"] args 
in 94 | Scale.of_dictionary_header dico_fn in 95 | let radii = Scale.to_list scale in 96 | LO.with_infile_outfile input_fn output_fn (fun input output -> 97 | (* format header *) 98 | fprintf output "#radius=%s\n%!" (Scale.to_string scale); 99 | Parany.run ~preserve:true ~csize:1 nprocs 100 | ~demux:(read_one (ref 0) input) 101 | ~work:(process_one radii) 102 | ~mux:(write_one output) 103 | ) 104 | 105 | let () = main () 106 | -------------------------------------------------------------------------------- /src/finder.ml: -------------------------------------------------------------------------------- 1 | (* Copyright (C) 2021, Francois Berenger 2 | Tsuda laboratory, 3 | Graduate School of Frontier Sciences, 4 | The University of Tokyo, 5 | 5-1-5 Kashiwa-no-ha, 6 | Kashiwa, Chiba 277-8561, Japan. 7 | 8 | Find name, Tanimoto-score and SMILES of nearest neighbor molecules. 9 | Output a valid SMILES file. *) 10 | 11 | open Printf 12 | 13 | module A = BatArray 14 | module Bstree = Molenc.Index.Bstree 15 | module CLI = Minicli.CLI 16 | module FpMol = Molenc.FpMol 17 | module Ht = Hashtbl 18 | module L = BatList 19 | module LO = Line_oriented 20 | module Log = Dolog.Log 21 | module MolIndex = Molenc.Index 22 | module S = BatString 23 | module Utls = Molenc.Utls 24 | 25 | let split_smiles_line l = 26 | (* Expect '\t' separated SMILES *) 27 | S.split l ~by:"\t" 28 | 29 | let ht_insert_name_if_not_there name2smi name smi = 30 | if Ht.mem name2smi name then 31 | let () = Log.fatal "Finder.ht_insert_name_if_not_there: \ 32 | already seen molecule name: %s" name in 33 | exit 1 34 | else 35 | Ht.add name2smi name smi 36 | 37 | let process_smiles_file name2smi name2activity fn = 38 | LO.iteri fn (fun i line -> 39 | let smi, name = split_smiles_line line in 40 | ht_insert_name_if_not_there name2smi name smi; 41 | (* also index the molecule under its "raw" name 42 | (without postfix pIC50 value) *) 43 | if S.contains name '_' then 44 | begin 45 | let raw_name, pIC50 = S.split name 
~by:"_" in 46 | ht_insert_name_if_not_there name2smi raw_name smi; 47 | ht_insert_name_if_not_there name2activity raw_name pIC50 48 | end; 49 | if (i mod 1000) = 0 then 50 | printf "Loaded molecules: %d\r%!" (Ht.length name2smi) 51 | ) 52 | 53 | let main () = 54 | Log.(set_log_level INFO); 55 | Log.color_on (); 56 | let argc, args = CLI.init () in 57 | if argc = 1 then 58 | begin 59 | eprintf "usage:\n\ 60 | %s\n \ 61 | -i : encoded molecules input file\n \ 62 | --bst-fns : list of BST index files\n \ 63 | --smi-fns : list of SMILES files\n \ 64 | -np : nprocs (default=1)\n" 65 | Sys.argv.(0); 66 | exit 1 67 | end; 68 | let input_fn = CLI.get_string ["-i"] args in 69 | let output_fn = CLI.get_string ["-o"] args in 70 | let bst_fns = S.split_on_char ',' (CLI.get_string ["--bst-fns"] args) in 71 | let smi_fns = S.split_on_char ',' (CLI.get_string ["--smi-fns"] args) in 72 | let nprocs = CLI.get_int_def ["-np"] args 1 in 73 | CLI.finalize (); (* ------------------------------------------------------ *) 74 | (* populate the name to SMILES LUT *) 75 | let name2smi = Ht.create 1_000_000 in 76 | let name2activity = Ht.create 1_000_000 in 77 | L.iter (process_smiles_file name2smi name2activity) smi_fns; 78 | let encoded_molecules_in = 79 | A.of_list (Molenc.FpMol.molecules_of_file input_fn) in 80 | LO.with_out_file output_fn (fun out -> 81 | let fp_name_dists = 82 | MolIndex.nearest_neighbor_names_a 83 | nprocs bst_fns encoded_molecules_in in 84 | A.iter (fun (_fp, name, dist) -> 85 | let smi, pIC50 = 86 | try (Ht.find name2smi name, Ht.find name2activity name) 87 | with Not_found -> 88 | let () = Log.fatal "Finder.main: not in Ht: %s" name in 89 | exit 1 in 90 | let tani = 1.0 -. 
dist in 91 | fprintf out "%s\t%s_T=%.2f_pIC50=%s\n" smi name tani pIC50 92 | ) fp_name_dists 93 | ) 94 | 95 | let () = main () 96 | -------------------------------------------------------------------------------- /src/formula.ml: -------------------------------------------------------------------------------- 1 | 2 | module A = BatArray 3 | module L = BatList 4 | module Log = Dolog.Log 5 | module SMap = BatMap.String 6 | 7 | open Printf 8 | 9 | type formula_item = Element of string 10 | | Count of int 11 | 12 | (* parse list of tokens *) 13 | let rec count_elements = function 14 | | [] -> [] 15 | | [Element symb] -> [(symb, 1)] 16 | | [Count _] -> assert(false) (* should have been processed before *) 17 | | (Element s0) :: (Element s1) :: rest -> 18 | (s0, 1) :: count_elements (Element s1 :: rest) 19 | | (Element symb) :: (Count c) :: rest -> 20 | (symb, c) :: (count_elements rest) 21 | | _ -> assert(false) 22 | 23 | let parse_int s = 24 | try int_of_string s 25 | with exn -> 26 | (Log.fatal "Formula.parse_int: cannot parse: %s" s; 27 | raise exn) 28 | 29 | (* formula -> int *) 30 | let encode _debug f = 31 | let element_counts = A.make 119 0 in 32 | (* lexer: tokenize chemical elements starting from two chars ones *) 33 | let element_counts_0 = 34 | Str.bounded_full_split Ptable.elements_regexp f 1024 in 35 | let element_counts_1 = 36 | L.map (function Str.Delim symbol -> Element symbol 37 | | Str.Text count -> Count (parse_int count) 38 | ) element_counts_0 in 39 | let element_counts_2 = count_elements element_counts_1 in 40 | L.iter (fun (symb, count) -> 41 | let anum = Ptable.anum_of_symbol symb in 42 | (* robust even to extended formulas like CH3CH2OH 43 | instead of the proper C2H6O *) 44 | element_counts.(anum) <- element_counts.(anum) + count 45 | ) element_counts_2; 46 | (* potentially too large number to fit OCaml's 64 bits signed integers *) 47 | let big_int = 48 | A.fold_lefti (fun acc anum count -> 49 | if count > 0 then 50 | let p = Z.of_int 
(Ptable.prime_for_anum anum) in 51 | (* product of powers of primes; also called "Godel numbering" *) 52 | Z.mul acc (Z.pow p count) 53 | else 54 | acc 55 | ) Z.one element_counts in 56 | Z.to_int big_int 57 | 58 | let z_2 = Z.of_int 2 59 | let z_3 = Z.of_int 3 60 | let z_5 = Z.of_int 5 61 | let z_7 = Z.of_int 7 62 | 63 | (* Godel numbering for radius up to three bonds *) 64 | let encode_envs e0 e1 e2 e3 = 65 | Z.(to_int (pow z_2 e0 * 66 | pow z_3 e1 * 67 | pow z_5 e2 * 68 | pow z_7 e3)) 69 | (* multiplicity of [prime] in [composite]: how many times [prime] divides it. Raises Invalid_argument for composite = 0 (divisible forever) and for prime < 2 (not a prime), instead of looping without end *) 70 | let find_exponent composite prime = 71 | let rec loop acc x = 72 | if x mod prime = 0 then 73 | loop (acc + 1) (x / prime) 74 | else 75 | acc in 76 | if composite = 0 || prime < 2 then invalid_arg "Formula.find_exponent: need composite <> 0 and prime >= 2" else loop 0 composite 77 | 78 | (* int -> formula *) 79 | let decode (code: int): string = 80 | let counts = A.map (find_exponent code) Ptable.all_primes in 81 | let prime_counts = A.combine Ptable.all_primes counts in 82 | let symb2count = 83 | A.fold (fun acc (prime, count) -> 84 | if count > 0 then 85 | let symb = Ptable.symbol_for_prime prime in 86 | SMap.add symb count acc 87 | else 88 | acc 89 | ) SMap.empty prime_counts in 90 | (* get back formula as a string *) 91 | let buff = Buffer.create 128 in 92 | SMap.iter (bprintf buff "%s%d") symb2count; 93 | Buffer.contents buff 94 | -------------------------------------------------------------------------------- /src/fpMol.ml: -------------------------------------------------------------------------------- 1 | (* Copyright (C) 2020, Francois Berenger 2 | 3 | Yamanishi laboratory, 4 | Department of Bioscience and Bioinformatics, 5 | Faculty of Computer Science and Systems Engineering, 6 | Kyushu Institute of Technology, 7 | 680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. 
*) 8 | 9 | (* A fingerprint-encoded molecule *) 10 | 11 | module A = Array 12 | module Fp = Fingerprint 13 | module Ht = Hashtbl 14 | module L = MyList 15 | module LO = Line_oriented 16 | 17 | open Printf 18 | 19 | type t = { name: string; 20 | index: int; (* position in input file *) 21 | value: float; 22 | fp: Fp.t } 23 | 24 | let create name index value bitstring = 25 | { name; index; value; fp = Fp.of_string bitstring } 26 | 27 | (* read one molecule from an FP file *) 28 | let read_one_mol line = 29 | try Scanf.sscanf line "%s@,%f,%s" 30 | (fun name value bitstring -> 31 | (name, value, bitstring) 32 | ) 33 | with Scanf.Scan_failure msg -> 34 | failwith ("FpMol.read_one_mol: fmt: %s@,%f,%s err: " ^ msg ^ 35 | " line: " ^ line) 36 | 37 | let parse_one index line = 38 | let name, value, bitstring = read_one_mol line in 39 | create name index value bitstring 40 | 41 | (* go back to the line format you came from *) 42 | let to_string (m: t): string = 43 | sprintf "%s,%g,[%s]" 44 | m.name 45 | m.value 46 | (Fp.to_string m.fp) 47 | 48 | let to_out out m = 49 | fprintf out "%s\n" (to_string m) 50 | 51 | let molecules_of_file fn = 52 | LO.mapi fn parse_one 53 | 54 | let dist m1 m2 = 55 | Fp.distance m1.fp m2.fp 56 | 57 | let tani m1 m2 = 58 | Fp.tanimoto m1.fp m2.fp 59 | 60 | let get_name x = 61 | x.name 62 | 63 | let get_value x = 64 | x.value 65 | 66 | let get_index x = 67 | x.index 68 | 69 | let get_fp x = 70 | x.fp 71 | 72 | let nb_features x = 73 | Fp.nb_features x.fp 74 | 75 | let mol_is_active line = 76 | BatString.starts_with line "active" 77 | 78 | let is_active x = 79 | mol_is_active x.name 80 | 81 | let drop_features to_drop x = 82 | { x with fp = Fp.drop_features to_drop x.fp } 83 | -------------------------------------------------------------------------------- /src/fp_test.ml: -------------------------------------------------------------------------------- 1 | (* Copyright (C) 2020, Francois Berenger 2 | 3 | Yamanishi laboratory, 4 | Department of 
Bioscience and Bioinformatics, 5 | Faculty of Computer Science and Systems Engineering, 6 | Kyushu Institute of Technology, 7 | 680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *) 8 | 9 | (* regression tests for Fingerprint.tanimoto *) 10 | 11 | module Fp = Molenc.Fingerprint 12 | module Log = Dolog.Log 13 | 14 | let () = 15 | Log.color_on (); 16 | Log.set_log_level Log.INFO; 17 | let tani = Fp.tanimoto in 18 | let fp1 = Fp.of_string "[0:1;1:2;5:1;11:4]" in 19 | let fp2 = Fp.of_string "[1:1;3:2;11:3]" in 20 | let fp3 = Fp.of_string "[]" in 21 | let fp4 = Fp.of_string "[1:2;5:1;11:2]" in 22 | assert(tani fp1 fp1 = 1.0); 23 | assert(tani fp2 fp2 = 1.0); 24 | assert(tani fp3 fp3 = 0.0); 25 | assert(tani fp4 fp4 = 1.0); 26 | assert(tani fp1 fp2 = 4.0 /. 10.0); 27 | assert(tani fp1 fp3 = 0.0); 28 | assert(tani fp1 fp4 = 5.0 /. 8.0); 29 | Log.info "OK" 30 | -------------------------------------------------------------------------------- /src/gen_bindings.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # pyml_bindgen rdkit_wrapper_specs.txt rdkit_wrapper Rdkit \ 4 | # --caml-module=Rdkit --of-pyo-ret-type=no_check > rdkit.ml 5 | 6 | pyml_bindgen rdkit_wrapper_specs.txt rdkit_wrapper Rdkit \ 7 | --embed-python-source rdkit_wrapper.py \ 8 | --caml-module=Rdkit --of-pyo-ret-type=no_check > rdkit.ml 9 | 10 | # format the generated code 11 | ocamlformat --inplace --enable-outside-detected-project rdkit.ml 12 | -------------------------------------------------------------------------------- /src/gnuplot.ml: -------------------------------------------------------------------------------- 1 | 2 | module Fn = Filename 3 | module Log = Dolog.Log 4 | module Utls = Molenc.Utls 5 | 6 | open Printf 7 | 8 | (* WARNING: leaks tmp files. 
9 | we require the means because the Gaussian fitting may not converge *) 10 | let plot_histograms histo_data_fn mean1 mean2 = 11 | let gnuplot_script_fn = Fn.temp_file "gnuplot_" ".gpl" in 12 | let gnuplot_log_fn = Fn.temp_file "gnuplot_" ".log" in 13 | Utls.string_list_to_file gnuplot_script_fn 14 | ["set xlabel 'score'"; 15 | "set ylabel 'frequency'"; 16 | "gauss1(x) = a1/(sqrt(2*pi)*sigma1)*exp(-(x-mean1)**2/(2*sigma1**2))"; 17 | "gauss2(x) = a2/(sqrt(2*pi)*sigma2)*exp(-(x-mean2)**2/(2*sigma2**2))"; 18 | sprintf "mean1 = %f" mean1; 19 | sprintf "mean2 = %f" mean2; 20 | sprintf "fit gauss1(x) '%s' u 1:2 via a1,sigma1,mean1" histo_data_fn; 21 | sprintf "fit gauss2(x) '%s' u 1:3 via a2,sigma2,mean2" histo_data_fn; 22 | sprintf "plot '%s' u 1:2 w l t 'smaller sample', \\" histo_data_fn; 23 | "'' u 1:3 w l t 'bigger sample', \\"; 24 | "gauss1(x) t 'smaller fit', \\"; 25 | "gauss2(x) t 'bigger fit'"]; 26 | Log.info "gnuplot script: %s log: %s" gnuplot_script_fn gnuplot_log_fn; 27 | Utls.run_command (sprintf "(gnuplot -persist %s 2>&1) > %s" 28 | gnuplot_script_fn gnuplot_log_fn) 29 | -------------------------------------------------------------------------------- /src/gram.ml: -------------------------------------------------------------------------------- 1 | open Printf 2 | 3 | module A = BatArray 4 | module L = BatList 5 | 6 | (* Parallel Gram matrix initialization *) 7 | let emit_one (i: int ref) (n: int) ((): unit): int = 8 | if !i >= n then raise Parany.End_of_input 9 | else 10 | let res = !i in 11 | incr i; 12 | res 13 | 14 | let process_one (dist: 'a -> 'a -> float) (samples: 'a array) (n: int) (i: int): 15 | (int * float array) = 16 | let res = A.create_float (n - i) in 17 | let si = samples.(i) in 18 | for j = i to n - 1 do 19 | res.(j - i) <- dist si samples.(j) 20 | done; 21 | (i, res) 22 | 23 | let gather_one (res: float array array) ((i, xs): (int * float array)): unit = 24 | A.iteri (fun j' x -> 25 | let j = j' + i in 26 | res.(i).(j) <- x; 27 | 
res.(j).(i) <- x (* symmetric matrix *) 28 | ) xs 29 | 30 | let initialize_matrix dist ncores csize samples res = 31 | let n = A.length samples in 32 | assert(n > 0); 33 | assert(ncores >= 1); 34 | if ncores = 1 then (* Sequential *) 35 | begin 36 | for i = 0 to n - 1 do 37 | (* WARNING: we initialize the diagonal while it is all 0s *) 38 | for j = i to n - 1 do 39 | let x = dist samples.(i) samples.(j) in 40 | res.(i).(j) <- x; 41 | (* WARNING: we could remove the next one *) 42 | res.(j).(i) <- x (* symmetric matrix *) 43 | done; 44 | printf "done: %d/%d\r%!" (i + 1) n; 45 | done; 46 | printf "\n%!"; 47 | end 48 | else (* parallel *) 49 | Parany.run ~csize ncores 50 | ~demux:(emit_one (ref 0) n) 51 | ~work:(process_one dist samples n) 52 | ~mux:(gather_one res) 53 | 54 | (* partial display *) 55 | let print_corners mat = 56 | let m = A.length mat in 57 | let n = A.length mat.(0) in 58 | let idots = ref false in 59 | for i = 0 to m - 1 do 60 | if i < 3 || i > m - 4 then 61 | begin 62 | let jdots = ref false in 63 | for j = 0 to n - 1 do 64 | if j < 3 || j > n - 4 then 65 | printf (if j <> 0 then "\t%6.2f" else "%6.2f") 66 | mat.(i).(j) 67 | else if not !jdots then 68 | (printf "\t..."; jdots := true) 69 | done; 70 | printf "\n" 71 | end 72 | else if not !idots then 73 | (printf "\t\t\t...\n"; idots := true) 74 | done; 75 | flush stdout 76 | -------------------------------------------------------------------------------- /src/index.ml: -------------------------------------------------------------------------------- 1 | 2 | module A = BatArray 3 | module L = BatList 4 | module Log = Dolog.Log 5 | 6 | module Bstree = struct 7 | 8 | include Bst.Bisec_tree.Make (FpMol) 9 | 10 | let of_molecules l = 11 | create 1 Two_bands (A.of_list l) 12 | end 13 | 14 | (* For each molecule, find its nearest neighbor name and distance, 15 | over all Bsts; parallelized over molecules. 
*) 16 | let nearest_neighbor_names ncores bst_fns mols = 17 | match bst_fns with 18 | | [] -> [] 19 | | fn :: fns -> 20 | let annot_mols = 21 | (* load one bst *) 22 | Log.info "loading %s..." fn; 23 | let (bst: Bstree.t) = Utls.restore fn in 24 | Parany.Parmap.parmap ncores 25 | (fun mol -> 26 | let nn, dist = Bstree.nearest_neighbor mol bst in 27 | (mol, FpMol.get_name nn, dist) 28 | ) mols in 29 | (* fold on the other BSTs *) 30 | L.fold_left (fun annotated bst_fn -> 31 | (* load another bst *) 32 | Log.info "loading %s..." bst_fn; 33 | let (bst: Bstree.t) = Utls.restore bst_fn in 34 | Parany.Parmap.parmap ncores (fun (mol, nn_name, dist) -> 35 | if dist = 0.0 then 36 | (* already nearest *) 37 | (mol, nn_name, dist) 38 | else 39 | let curr_nn, curr_dist = Bstree.nearest_neighbor mol bst in 40 | if curr_dist < dist then 41 | (mol, FpMol.get_name curr_nn, curr_dist) 42 | else 43 | (mol, nn_name, dist) 44 | ) annotated 45 | ) annot_mols fns 46 | (* name of, and distance to, the nearest neighbor of [mol] in [bst] *) 47 | let bst_nearest_name_dist bst mol = 48 | let nn, dist = Bstree.nearest_neighbor mol bst in 49 | (FpMol.get_name nn, dist) 50 | 51 | (* For each molecule, find its nearest neighbor name and distance, 52 | over all Bsts; parallelized over bisector trees (indexed chunks) *) 53 | let nearest_neighbor_names_a 54 | (ncores: int) (bst_fns: string list) (mols_a: FpMol.t array) 55 | : (FpMol.t * string * float) array = 56 | match bst_fns with 57 | | [] -> [||] 58 | | fn :: fns' -> 59 | (* init accumulator, for the muxer process. 60 | This is the only calculation parallelized over molecules. 61 | Remaining calculations will be parallelized over Bsts. *) 62 | let annot_mols = 63 | Log.info "loading %s..." 
fn; 64 | let (bst: Bstree.t) = Utls.restore fn in 65 | Parany.Parmap.array_parmap ncores 66 | (fun mol -> 67 | let name, dist = bst_nearest_name_dist bst mol in 68 | (mol, name, dist)) 69 | (mols_a.(0), "", 1.0) 70 | mols_a in 71 | let fns = A.of_list fns' in 72 | let () = 73 | Parany.run ncores 74 | ~demux:( 75 | let i = ref 0 in (* fns only holds the remaining files (fn was consumed above), so start at its first entry; starting at 1 skipped one BST *) 76 | let n = A.length fns in 77 | fun () -> 78 | if !i < n then 79 | let res = !i in 80 | incr i; 81 | res 82 | else 83 | raise Parany.End_of_input 84 | ) 85 | ~work:(fun i -> 86 | Log.info "loading %s..." fns.(i); 87 | let (bst: Bstree.t) = Utls.restore fns.(i) in 88 | A.map (bst_nearest_name_dist bst) mols_a 89 | ) 90 | ~mux:( 91 | let m = A.length mols_a in 92 | fun nearest_name_dists -> 93 | assert(A.length nearest_name_dists = m); 94 | for i = 0 to m - 1 do 95 | let mol, _prev_nearest_name, prev_dist = 96 | A.unsafe_get annot_mols i in 97 | if prev_dist = 0.0 then 98 | (* already nearest *) () 99 | else 100 | let curr_nearest_name, curr_dist = 101 | A.unsafe_get nearest_name_dists i in 102 | if curr_dist < prev_dist then (* update acc *) 103 | A.unsafe_set annot_mols i (mol, curr_nearest_name, curr_dist) 104 | done 105 | ) in 106 | annot_mols 107 | -------------------------------------------------------------------------------- /src/indexer.ml: -------------------------------------------------------------------------------- 1 | (* Copyright (C) 2021, Francois Berenger 2 | Tsuda laboratory, Tokyo university, Japan. 3 | 4 | Indexing of fingerprint encoded molecules into bisector trees 5 | stored on disk. 
*) 6 | 7 | open Printf 8 | 9 | module A = BatArray 10 | module Bstree = Molenc.Index.Bstree 11 | module CLI = Minicli.CLI 12 | module FpMol = Molenc.FpMol 13 | module Ht = Hashtbl 14 | module L = BatList 15 | module LO = Line_oriented 16 | module Log = Dolog.Log 17 | module Utls = Molenc.Utls 18 | 19 | let verbose = ref false 20 | 21 | let read_one_chunk input_fn in_mol_count chunk_index csize input () = 22 | let res = ref [] in 23 | try 24 | for _i = 1 to csize do 25 | let line = input_line input in 26 | let mol = (!in_mol_count, line) in 27 | incr in_mol_count; 28 | res := mol :: !res 29 | done; 30 | let idx = !chunk_index in 31 | incr chunk_index; 32 | (idx, !res) 33 | with End_of_file -> 34 | if !res = [] then 35 | (Log.info "read %d from %s" !in_mol_count input_fn; 36 | raise Parany.End_of_input) 37 | else 38 | (* last chunk, maybe not full *) 39 | (!chunk_index, !res) 40 | 41 | let index_one_chunk input_fn (i, chunk') = 42 | let chunk = L.rev_map (fun (i, line) -> FpMol.parse_one i line) chunk' in 43 | assert(i <= 9999); 44 | let output_fn = sprintf "%s.%04d.bst" input_fn i in 45 | Log.info "creating %s" output_fn; 46 | let bst = Bstree.of_molecules chunk in 47 | LO.save output_fn bst; 48 | Utls.run_command (sprintf "gzip -f %s" output_fn) 49 | 50 | let main () = 51 | Log.(set_log_level INFO); 52 | Log.color_on (); 53 | let argc, args = CLI.init () in 54 | let default_block_size = ref 50_000 in 55 | if argc = 1 then 56 | begin 57 | eprintf "usage:\n\ 58 | %s\n \ 59 | -i : molecules input file\n \ 60 | -ifs : file containing a list of files\n \ 61 | -np : nprocs (default=1)\n \ 62 | -c : chunk size (molecules/bloc; default=%d)\n \ 63 | [-v]: verbose mode\n" 64 | Sys.argv.(0) !default_block_size; 65 | exit 1 66 | end; 67 | let input_fns = 68 | match (CLI.get_string_opt ["-i"] args, 69 | CLI.get_string_opt ["-ifs"] args) with 70 | | (None, None) 71 | | (Some _, Some _) -> failwith "provide either -i or -ifs" 72 | | (Some fn, None) -> [fn] 73 | | (None, Some fn) 
-> LO.lines_of_file fn in 74 | let nprocs = CLI.get_int_def ["-np"] args 1 in 75 | let csize = CLI.get_int_def ["-c"] args !default_block_size in 76 | if CLI.get_set_bool ["-v"] args then 77 | verbose := true; 78 | CLI.finalize (); (* ------------------------------------------------------ *) 79 | let chunk_count = ref 0 in 80 | let in_mol_count = ref 0 in 81 | L.iter (fun input_fn -> 82 | Log.info "%d molecules in %s" (LO.length input_fn) input_fn; 83 | LO.with_in_file input_fn (fun input -> 84 | Parany.run nprocs 85 | ~demux:(read_one_chunk input_fn in_mol_count chunk_count csize input) 86 | ~work:(index_one_chunk input_fn) 87 | ~mux:(fun () -> ()) 88 | ) 89 | ) input_fns 90 | 91 | let () = main () 92 | -------------------------------------------------------------------------------- /src/intSet.ml: -------------------------------------------------------------------------------- 1 | (* Copyright (C) 2020, Francois Berenger 2 | 3 | Yamanishi laboratory, 4 | Department of Bioscience and Bioinformatics, 5 | Faculty of Computer Science and Systems Engineering, 6 | Kyushu Institute of Technology, 7 | 680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *) 8 | 9 | open Printf 10 | 11 | include BatSet.Int 12 | 13 | let to_string s = 14 | let buff = Buffer.create 11 in 15 | Buffer.add_char buff '['; 16 | iter (fun x -> 17 | Buffer.add_string buff 18 | (if Buffer.length buff = 1 19 | then sprintf "%d" x 20 | else sprintf ";%d" x) 21 | ) s; 22 | Buffer.add_char buff ']'; 23 | Buffer.contents buff 24 | -------------------------------------------------------------------------------- /src/mini_mol.ml: -------------------------------------------------------------------------------- 1 | (* Copyright (C) 2020, Francois Berenger 2 | 3 | Yamanishi laboratory, 4 | Department of Bioscience and Bioinformatics, 5 | Faculty of Computer Science and Systems Engineering, 6 | Kyushu Institute of Technology, 7 | 680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. 
(* Collect the atom types of all atoms whose distance-matrix entry, on the
   row of atom [center], equals [curr_height].
   Returns the layer as (curr_height, counted types); the counting is
   delegated to Utls.list_uniq_count. *)
let types_at_distance (center: int) (curr_height: int) (mol: t) =
  let dists = mol.matrix.(center) in
  let layer = ref [] in
  A.iteri (fun j d ->
      (* atom j belongs to the layer at 'curr_height' *)
      if d = curr_height then
        layer := (get_typ mol j) :: !layer
    ) dists;
  (curr_height, Utls.list_uniq_count !layer)
(* Encode the molecule as counted atom pairs.
   Every pair (i, j) with i <= j is considered (an atom is also paired with
   itself); each pair is built from the two atom types and the matrix
   entry for (i, j). The result lists each distinct pair with its number
   of occurrences, in hash table order. *)
let atom_pairs (mol: t): (Atom_pair.t * int) list =
  let num_atoms = nb_atoms mol in
  assert(num_atoms >= 1); (* at least one heavy atom *)
  (* initial table size only; the table grows as needed *)
  let counts = Ht.create (max 1 (num_atoms * (num_atoms - 1) / 2)) in
  let count_one pair =
    Ht.replace counts pair (1 + Ht.find_default counts pair 0) in
  for i = 0 to num_atoms - 1 do
    let typ_i = get_typ mol i in
    let dists_i = A.unsafe_get mol.matrix i in
    for j = i to num_atoms - 1 do
      let typ_j = get_typ mol j in
      count_one (Atom_pair.create typ_i typ_j (A.unsafe_get dists_i j))
    done
  done;
  (* canonicalization will be done later; when the features (string) are
   * converted to feature ids (int) *)
  Ht.bindings counts
(* Parse the line just after the molecule name line.
   The line must hold five integers; only the first two are used.
   Returns (nb_atoms, nb_bonds).
   Raises Failure if the line cannot be parsed. *)
let read_header l =
  try Scanf.sscanf l " %d %d %d %d %d "
        (fun nb_atoms nb_bonds _ _ _ -> (nb_atoms, nb_bonds))
  with
  (* only the exceptions Scanf documents for a scan that goes wrong;
     a catch-all would also hide Out_of_memory, Stack_overflow or Break *)
  | Scanf.Scan_failure _ | Failure _ | End_of_file ->
    failwith ("read_header: could not parse: " ^ l)
(* Build a chemical formula string from an array of element symbols.
   Each distinct symbol is counted, then written as symbol followed by its
   count (e.g. "C2H6O1"), in the key order of the string map. *)
let formula_of_elements (elts: string array): string =
  let counts = ref SMap.empty in
  A.iter (fun elt ->
      let seen = SMap.find_default 0 elt !counts in
      counts := SMap.add elt (seen + 1) !counts
    ) elts;
  let buff = Buffer.create 128 in
  SMap.iter (fun elt count ->
      Buffer.add_string buff elt;
      Buffer.add_string buff (string_of_int count)
    ) !counts;
  Buffer.contents buff
(* Render a list as "[e1;e2;...]", using [to_str] on each element,
   from left to right. The empty list gives "[]". *)
let to_string to_str l =
  let buff = Buffer.create 80 in
  let rec add_all = function
    | [] -> ()
    | [last] -> Buffer.add_string buff (to_str last)
    | x :: rest ->
      Buffer.add_string buff (to_str x);
      (* separator only between two elements *)
      Buffer.add_char buff ';';
      add_all rest in
  Buffer.add_char buff '[';
  add_all l;
  Buffer.add_char buff ']';
  Buffer.contents buff
(* List.combine for 4 lists: the k-th elements of the four lists are
   grouped into the k-th 4-tuple of the result.
   Raises Invalid_argument if the lists do not all have the same length. *)
let combine4 l1 l2 l3 l4 =
  let rec zip acc ws xs ys zs =
    match ws, xs, ys, zs with
    | [], [], [], [] -> rev acc
    | w :: ws', x :: xs', y :: ys', z :: zs' ->
      zip ((w, x, y, z) :: acc) ws' xs' ys' zs'
    | _ -> invalid_arg "MyList.combine4: list lengths differ"
  in
  zip [] l1 l2 l3 l4
(* Norm of an int map, as a float.
   L1_norm: sum of all the values.
   Max_norm: largest value; since the fold starts from 0, the result is
   never below 0 (an empty map gives 0 for both norms). *)
let map_norm style map =
  let accumulate =
    match style with
    | L1_norm -> (+)
    | Max_norm -> max in
  float (IntMap.fold (fun _k v acc -> accumulate v acc) map 0)
(* Read one molecule from a .ph4 file: the header line, then as many
   feature lines as the header announces. Each line is returned followed
   by '\n'.
   If the input ends early, whatever was read so far is returned.
   Raises End_of_file if nothing at all could be read. *)
let read_one (input: in_channel): string =
  let buff = Buffer.create 2048 in
  let add_line l =
    Buffer.add_string buff l;
    Buffer.add_char buff '\n' in
  (try
     let header = input_line input in
     let num_feats, _name = parse_header_line header in
     add_line header;
     for _i = 1 to num_feats do
       add_line (input_line input)
     done
   with End_of_file -> () (* keep what was read, if anything *)
  );
  match Buffer.contents buff with
  | "" -> raise End_of_file
  | res -> res
(* Parse one line made of an integer index followed by a one-character
   type, and build the corresponding atom with [create].
   Raises Failure, with the offending line in the message, if the line
   cannot be parsed. *)
let of_ph4_line (l: string): t =
  let parse_error reason =
    failwith (sprintf "Ph4_atom.of_ph4_line: could not parse: %s: %s" l reason)
  in
  try Scanf.sscanf l "%d %c"
        (fun idx char -> create idx (Ph4.of_char char))
  with
  | Scanf.Scan_failure msg -> parse_error msg
  (* a number that cannot be converted *)
  | Failure msg -> parse_error msg
  (* empty or truncated line: must not escape as End_of_file, which a
     reading loop would mistake for the end of its input *)
  | End_of_file -> parse_error "unexpected end of line"
(* Copy the dictionary [in_dico_fn] to [out_dico_fn], leaving out the
   features whose id is in [features_to_drop] and renumbering the
   remaining ones from 0, in file order. Comment lines (starting with
   '#') are copied unchanged.
   [features_to_drop] must not be empty (checked with Utls.enforce).
   A line which cannot be parsed is logged, then the exception is
   re-raised. *)
let prune_dico features_to_drop in_dico_fn out_dico_fn =
  let n = L.length features_to_drop in
  Utls.enforce (n > 0) "Model.prune_dico: |features_to_drop| = 0";
  let to_drop = Ht.create n in
  L.iter (fun i ->
      (* replace, not add: a feature id listed several times must be
         counted only once in the log message below *)
      Ht.replace to_drop i ()
    ) features_to_drop;
  Log.info "pruning %d features" (Ht.length to_drop);
  LO.with_out_file out_dico_fn (fun out ->
      let new_feat_id = ref 0 in
      LO.iter in_dico_fn (fun line ->
          if S.starts_with line "#" then
            (* preserve comments *)
            fprintf out "%s\n" line
          else
            try
              (* this is the format for an Atom Pairs dictionary *)
              Scanf.sscanf line "%d %s@ %d" (fun featId featStr featCount ->
                  (* drop some features and update feature ids *)
                  if not (Ht.mem to_drop featId) then
                    (fprintf out "%d %s %d\n" !new_feat_id featStr featCount;
                     incr new_feat_id)
                )
            with exn -> (Log.fatal "dico %s: cannot parse line %s"
                           in_dico_fn line;
                         raise exn)
        )
    )
(* Convert one "name,IC50,bitstring" line to a liblinear line.
   The label is "+1" if the name starts with "active", "-1" otherwise;
   it is followed by " k:1" for each bit set to '1' in the bitstring.
   The bitstring must be 881, 2048 or 16384 characters long (asserted).
   Raises Failure if the line does not have exactly three fields. *)
let liblinear_line_of_pubchem_line line =
  match String.split_on_char ',' line with
  | [name; _IC50; bitstring] ->
    let nb_bits = String.length bitstring in
    assert(List.mem nb_bits [881; 2048; 16384]);
    let label =
      if String.starts_with name "active" then "+1"
      else "-1" in
    let buff = Buffer.create 1024 in
    Buffer.add_string buff label;
    String.iteri (fun i bit ->
        (* in liblinear: feature indexes start at 1 *)
        if bit = '1' then
          Printf.bprintf buff " %d:1" (i + 1)
      ) bitstring;
    Buffer.contents buff
  | _ -> failwith ("Pubchem_decoder: invalide line: " ^ line)
41 | Log.(set_log_level INFO); 42 | Log.color_on (); 43 | let argc, args = CLI.init () in 44 | if argc = 1 then 45 | (eprintf "usage: %s\n \ 46 | -i : encoded molecules\n \ 47 | -o : decoded molecules for liblinear\n" 48 | Sys.argv.(0); 49 | exit 1); 50 | let input_fn = CLI.get_string ["-i"] args in 51 | let output_fn = CLI.get_string ["-o"] args in 52 | CLI.finalize (); 53 | let line_counter = ref 0 in 54 | LO.with_infile_outfile input_fn output_fn (fun input output -> 55 | try 56 | while true do 57 | let in_line = input_line input in 58 | incr line_counter; 59 | if !line_counter mod 1000 = 0 then 60 | eprintf "read: %d\r%!" !line_counter; 61 | let out_line = liblinear_line_of_pubchem_line in_line in 62 | fprintf output "%s\n" out_line 63 | done 64 | with End_of_file -> () 65 | ); 66 | eprintf "read: %d\n" !line_counter 67 | 68 | let () = main () 69 | -------------------------------------------------------------------------------- /src/rank.ml: -------------------------------------------------------------------------------- 1 | (* compute ranks associated to each score 2 | equal scores will be given equal ranks *) 3 | 4 | open Printf 5 | 6 | module CLI = Minicli.CLI 7 | module Ht = Hashtbl 8 | module L = BatList 9 | module Log = Dolog.Log 10 | module LO = Line_oriented 11 | module String = BatString 12 | module Utls = Molenc.Utls 13 | 14 | let main () = 15 | Log.(set_log_level INFO); 16 | Log.color_on (); 17 | let argc, args = CLI.init () in 18 | if argc = 1 then 19 | begin 20 | eprintf "usage:\n\ 21 | %s\n \ 22 | -i : input scores file\n \ 23 | -o : output rank and scores file\n \ 24 | -f : score field (>= 1)\n \ 25 | -d : field separator (default=\\t)\n \ 26 | [-r]: increasing scores order (default=decreasing)\n" 27 | Sys.argv.(0); 28 | exit 1 29 | end; 30 | let input_fn = CLI.get_string ["-i"] args in 31 | let output_fn = CLI.get_string ["-o"] args in 32 | let sep = CLI.get_char_def ["-d"] args '\t' in 33 | let field = (CLI.get_int ["-f"] args) - 1 in 34 | let 
revert = CLI.get_set_bool ["-r"] args in 35 | CLI.finalize(); 36 | let all_scores = ref [] in 37 | (* read all scores *) 38 | LO.iter input_fn (fun line -> 39 | let score_field = String.cut_on_char sep field line in 40 | let score = 41 | try Scanf.sscanf score_field "%f" (fun x -> x) 42 | with exn -> 43 | begin 44 | Log.fatal "Rank: cannot parse float: %s" score_field; 45 | raise exn 46 | end in 47 | all_scores := score :: !all_scores 48 | ); 49 | (* create the score to rank LUT *) 50 | let uniq_scores = 51 | let cmp = 52 | if revert then BatFloat.compare (* increasing sort *) 53 | else 54 | (* default: scores in decreasing order; i.e. the highest score 55 | gets the lowest rank *) 56 | (fun x y -> BatFloat.compare y x) in 57 | L.sort_uniq cmp !all_scores in 58 | let score2rank = Ht.create (L.length uniq_scores) in 59 | L.iteri (fun i score -> 60 | Ht.add score2rank score i 61 | ) uniq_scores; 62 | (* output all lines, allong with their rank *) 63 | LO.with_out_file output_fn (fun output -> 64 | LO.iter input_fn (fun line -> 65 | let score_field = String.cut_on_char sep field line in 66 | let score = 67 | try Scanf.sscanf score_field "%f" (fun x -> x) 68 | with exn -> 69 | begin 70 | Log.fatal "Rank: cannot parse float: %s" score_field; 71 | raise exn 72 | end in 73 | let rank = Ht.find score2rank score in 74 | fprintf output "%s%c%d\n" line sep rank 75 | ) 76 | ) 77 | 78 | let () = main () 79 | -------------------------------------------------------------------------------- /src/rdkit_wrapper_specs.txt: -------------------------------------------------------------------------------- 1 | 2 | val __init__: smi:string -> unit -> t 3 | 4 | val add_hydrogens: t -> unit -> t 5 | 6 | val type_atom: t -> i:int -> unit -> int array 7 | 8 | val type_EltFCaroNeighbs: t -> i:int -> unit -> int array 9 | 10 | val type_atom_simple: t -> i:int -> unit -> int array 11 | 12 | val daylight_type_heavy_atom: t -> i:int -> unit -> int array 13 | 14 | val get_num_atoms: t -> unit -> 
(* Parse a scale: "i" gives Single i, while "i:j" gives Multi (i, j).
   For the "i:j" form, 0 <= i <= j is asserted.
   Raises Failure if a radius is not an integer. *)
let of_string s =
  if not (BatString.contains s ':') then
    Single (int_of_string s)
  else
    let start_str, stop_str = BatString.split s ~by:":" in
    let start = int_of_string start_str in
    let stop = int_of_string stop_str in
    assert(0 <= start && 0 <= stop && start <= stop);
    Multi (start, stop)
(* Read one molecule from a .sdf file: all the lines up to and including
   the "$$$$" end-of-molecule line, each one followed by '\n'.
   If the input ends before "$$$$", the lines read so far are returned.
   Raises End_of_file if nothing at all could be read. *)
let read_one (input: in_channel): t =
  let buff = Buffer.create 10240 in
  let rec loop () =
    match input_line input with
    | exception End_of_file -> ()
    | line ->
      Buffer.add_string buff line;
      Buffer.add_char buff '\n';
      (* "$$$$": end of molecule in SDF format *)
      if line <> "$$$$" then loop ()
  in
  loop ();
  match Buffer.contents buff with
  | "" -> raise End_of_file
  | res -> res
(* Dump the .sdf file given as first command line argument to stdout.
   For each molecule: its name, then one line per atom (x, y, z and
   element symbol), then one line per (source atom, connected atom)
   entry, with atom indexes printed starting from 1. *)
let main () =
  (* without this check, a missing argument only shows up as an
     "index out of bounds" exception *)
  if Array.length Sys.argv < 2 then
    (eprintf "usage: %s input.sdf\n" Sys.argv.(0);
     exit 1);
  let input_fn = Sys.argv.(1) in
  LO.with_in_file input_fn (fun input ->
      try
        while true do
          let mol = Sdf_3D.read_one_molecule input in
          let name = Sdf_3D.(mol.name) in
          let elts = Sdf_3D.(mol.elements) in
          let coords = Sdf_3D.(mol.coords) in
          let bonds = Sdf_3D.(mol.bonds) in
          printf "%s\n" name;
          A.iter2 (fun xyz anum ->
              let (x, y, z) = V3.to_triplet xyz in
              let elt = Sdf_3D.symbol_of_anum anum in
              printf "%10.4f%10.4f%10.4f %s\n" x y z elt
            ) coords elts;
          A.iteri (fun src_a connected_atoms ->
              L.iter (fun dst_a ->
                  printf "%3d%3d\n" (1 + src_a) (1 + dst_a)
                ) connected_atoms
            ) bonds
        done
      with End_of_file -> () (* no more molecules to read *)
    )
(* Shannon entropy (in bits) of the counts stored in [val_counts]:
   minus the sum of p_i * log2 p_i, with p_i = count_i / n.
   [n] is the (float) total used to turn counts into probabilities. *)
let shannon_entropy n val_counts =
  let sum_p_log_p =
    Ht.fold (fun _value count acc ->
        let p_i = (float count) /. n in
        acc +. (p_i *. log2 p_i)
      ) val_counts 0.0 in
  -1.0 *. sum_p_log_p
+ v 71 | ) acc 72 | done; 73 | Log.info "total: %d" !total; 74 | let n = float !total in 75 | (* entropy of each feature *) 76 | let feat_ent = ref [] in 77 | for i = 0 to nb_features - 1 do 78 | let acc = Ht.find ht i in 79 | let ent = shannon_entropy n acc in 80 | if ent > 0.0 then 81 | feat_ent := (i, ent) :: !feat_ent 82 | done; 83 | (* sort features by decreasing entropy *) 84 | let feat_encr_decr = 85 | L.sort (fun (_i, ei) (_j, ej) -> 86 | BatFloat.compare ej ei 87 | ) !feat_ent in 88 | let cumulated = ref 0.0 in 89 | LO.with_out_file output_fn (fun out -> 90 | L.iter (fun (feat, ent) -> 91 | fprintf out "%d %f %f\n" feat ent !cumulated; 92 | cumulated := !cumulated +. ent 93 | ) feat_encr_decr 94 | ) 95 | 96 | let () = main () 97 | -------------------------------------------------------------------------------- /src/shuf.ml: -------------------------------------------------------------------------------- 1 | (* replacement for UNIX's shuf command, but seedable for reproducibility *) 2 | 3 | open Printf 4 | 5 | module A = BatArray 6 | module CLI = Minicli.CLI 7 | module LO = Line_oriented 8 | module Log = Dolog.Log 9 | module RNG = BatRandom.State 10 | 11 | exception Early_stop 12 | 13 | let main () = 14 | Log.(set_log_level INFO); 15 | Log.color_on (); 16 | let argc, args = CLI.init () in 17 | (if argc = 1 then 18 | begin 19 | eprintf "usage:\n\ 20 | %s\n \ 21 | -i : input file\n \ 22 | -o : output file\n \ 23 | [-n ]: output at most N lines (default=all)\n \ 24 | [-s ]: random seed (default=none)\n" 25 | Sys.argv.(0); 26 | exit 1 27 | end 28 | ); 29 | let input_fn = CLI.get_string ["-i"] args in 30 | let output_fn = CLI.get_string ["-o"] args in 31 | let maybe_n = CLI.get_int_opt ["-n"] args in 32 | let maybe_seed = CLI.get_int_opt ["-s"] args in 33 | CLI.finalize (); (* ------------------------------------------------------ *) 34 | (* input *) 35 | let all_lines = A.of_list (LO.lines_of_file input_fn) in 36 | let count = A.length all_lines in 37 | (* 
output all or not? *) 38 | let output_n = match maybe_n with 39 | | None -> count 40 | | Some m -> min count m in 41 | let rng = match maybe_seed with 42 | | None -> RNG.make_self_init () 43 | | Some s -> RNG.make [|s|] in 44 | (* shuffle *) 45 | A.shuffle ~state:rng all_lines; 46 | (* output *) 47 | LO.with_out_file output_fn (fun out -> 48 | try 49 | A.iteri (fun i line -> 50 | if i < output_n then 51 | fprintf out "%s\n" line 52 | else 53 | raise Early_stop 54 | ) all_lines 55 | with Early_stop -> () 56 | ) 57 | 58 | let () = main () 59 | -------------------------------------------------------------------------------- /src/smi.ml: -------------------------------------------------------------------------------- 1 | 2 | (* read one molecule from a SMILES file *) 3 | let read_one (input: in_channel): string = 4 | let line = input_line input in 5 | (* strip protects against trailing '\r' 6 | we append '\n' because all other formats 7 | end molecules with a '\n' *) 8 | (BatString.strip line) ^ "\n" 9 | 10 | let get_name smiles_line = 11 | let _smiles, name = 12 | try BatString.split smiles_line ~by:"\t" 13 | with Not_found -> failwith "Smi.get_name: smiles file not using tabs" in 14 | (* all other file formats expect molecule name only, 15 | * not molecule name followed by EOL *) 16 | if BatString.ends_with name "\r\n" then 17 | BatString.rchop ~n:2 name 18 | else if BatString.ends_with name "\n" then 19 | BatString.rchop ~n:1 name 20 | else 21 | name 22 | -------------------------------------------------------------------------------- /src/syb_atom.ml: -------------------------------------------------------------------------------- 1 | (* Copyright (C) 2020, Francois Berenger 2 | 3 | Yamanishi laboratory, 4 | Department of Bioscience and Bioinformatics, 5 | Faculty of Computer Science and Systems Engineering, 6 | Kyushu Institute of Technology, 7 | 680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. 
*) 8 | 9 | (* very small atom module to help compute atom environments (a la molprint2d) 10 | from MOL2 files *) 11 | 12 | open Printf 13 | 14 | type t = { idx: int; 15 | typ: Sybyl.t } 16 | 17 | let create (idx: int) (typ: Sybyl.t): t = 18 | (* indexes start at 1 in MOL2 files *) 19 | { idx = idx - 1; typ } 20 | 21 | let dummy = create (-1) Sybyl.Du 22 | 23 | (* example line (output of OpenBabel): 24 | " 1 S -0.0218 1.7554 0.0117 S.2 1 LIG1 -0.0637" *) 25 | let of_mol2_line (l: string): t = 26 | try Scanf.sscanf l " %d %s %f %f %f %s@ %s@ %s@ %s" 27 | (fun idx _name _x _y _z typ _bs0 _bs1 _bs2 -> 28 | create idx (Sybyl.of_string typ)) 29 | with Scanf.Scan_failure msg -> 30 | failwith (sprintf "Atom.of_mol2_line: could not parse: %s: %s" l msg) 31 | 32 | let to_string (a: t): string = 33 | sprintf "%d %s" a.idx (Sybyl.to_string a.typ) 34 | -------------------------------------------------------------------------------- /src/test_RS.ml: -------------------------------------------------------------------------------- 1 | 2 | (* initial tests of random coordinates sub-sampling wihout replacement 3 | (for fast but approximate Jaccard computation) *) 4 | 5 | open Printf 6 | 7 | module A = BatArray 8 | module CLI = Minicli.CLI 9 | module Fp = Molenc.Fingerprint 10 | module FpMol = Molenc.FpMol 11 | module Ht = Hashtbl 12 | module L = BatList 13 | module Log = Dolog.Log 14 | module Utls = Molenc.Utls 15 | 16 | let main () = 17 | Log.color_on (); 18 | Log.set_log_level Log.INFO; 19 | let argc, args = CLI.init () in 20 | if argc = 1 then 21 | (eprintf "usage:\n\ 22 | %s -p -i [-n ]\n" 23 | Sys.argv.(0); 24 | exit 1); 25 | let input_fn = CLI.get_string ["-i"] args in 26 | let drop_p = CLI.get_float ["-p"] args in 27 | assert(drop_p > 0.0 && drop_p < 1.0); 28 | let nb_iter = CLI.get_int_def ["-n"] args 10_000 in 29 | CLI.finalize (); 30 | let alpha = 1.0 /. (1.0 -. 
drop_p) in 31 | (* read all molecules *) 32 | let molecules = FpMol.molecules_of_file input_fn in 33 | let nb_mols = L.length molecules in 34 | Log.info "nb_mols: %d" nb_mols; 35 | let fingerprints = A.of_list (L.map FpMol.get_fp molecules) in 36 | let nb_features = L.max (L.map FpMol.nb_features molecules) in 37 | Log.info "nb_features: %d" nb_features; 38 | let feat_id_max = nb_features - 1 in 39 | let rand_feat_ids = 40 | let all_features = L.range 0 `To feat_id_max in 41 | L.shuffle all_features in 42 | let truncated = 43 | let n = Utls.ceili (drop_p *. (float nb_features)) in 44 | let to_drop = Ht.create n in 45 | let candidates = L.take n rand_feat_ids in 46 | L.iter (fun i -> 47 | Ht.add to_drop i () 48 | ) candidates; 49 | A.map (Fp.drop_features to_drop) fingerprints in 50 | for _ = 1 to nb_iter do 51 | let i = Random.int nb_mols in 52 | let j = Random.int nb_mols in 53 | let fp_i = fingerprints.(i) in 54 | let fp_j = fingerprints.(j) in 55 | let sum_min, sum_max = Fp.sum_min_max fp_i fp_j in 56 | let exact_tani = Fp.tanimoto fp_i fp_j in 57 | let tfp_i = truncated.(i) in 58 | let tfp_j = truncated.(j) in 59 | let est_tani = Fp.tanimoto tfp_i tfp_j in 60 | let est_sum_min, est_sum_max = Fp.sum_min_max tfp_i tfp_j in 61 | let x = float sum_min /. float est_sum_min in 62 | let y = float sum_max /. float est_sum_max in 63 | printf "Tani:\t%.3f\t%.3f\t%.3f" exact_tani est_tani 64 | (abs_float (exact_tani -. est_tani)); 65 | printf "\tn: %d\t%d\t%.3f\t%.3f\t%.3f" sum_min est_sum_min 66 | x alpha ((x -. alpha) /. x); 67 | printf "\tu: %d\t%d\t%.3f\t%.3f\t%.3f\n" sum_max est_sum_max 68 | y alpha ((y -. alpha) /. 
y) 69 | done 70 | 71 | (* FBR: on stderr, print the average absolute error in each case *) 72 | 73 | (* FBR: write unit test *) 74 | 75 | let () = main () 76 | -------------------------------------------------------------------------------- /src/to_dense.ml: -------------------------------------------------------------------------------- 1 | 2 | (* read a molenc output file (.txt) and output it in dense csv format for R *) 3 | 4 | open Printf 5 | 6 | module A = BatArray 7 | module Bloom = Molenc.Bloom 8 | module CLI = Minicli.CLI 9 | module Log = Dolog.Log 10 | module LO = Line_oriented 11 | module Fp = Molenc.Fingerprint 12 | module Utls = Molenc.Utls 13 | 14 | let expand_line nb_features maybe_bloom line = 15 | try 16 | Scanf.sscanf line "%s@,%f,%s" 17 | (fun _name pIC50 fp_str -> 18 | printf "%f" pIC50; 19 | let fp = Fp.of_string fp_str in 20 | (match maybe_bloom with 21 | | None -> Fp.to_dense_printf nb_features fp; 22 | | Some params -> 23 | let bloom = Bloom.encode params fp in 24 | A.iter (printf " %d") bloom 25 | ); 26 | printf "\n" 27 | ) 28 | with exn -> 29 | (Log.fatal "cannot parse: %s" line; 30 | raise exn) 31 | 32 | let main () = 33 | Log.color_on (); 34 | Log.set_log_level Log.INFO; 35 | Log.info "start"; 36 | let argc, args = CLI.init () in 37 | let show_help = CLI.get_set_bool ["-h";"--help"] args in 38 | if argc = 1 || show_help then 39 | (eprintf "usage:\n\ 40 | %s [-np ] -i -n \n\ 41 | [--bloom ,]: k,m counted Bloom filter params\n" 42 | Sys.argv.(0); 43 | exit 1); 44 | let _nprocs = CLI.get_int_def ["-np"] args 1 in 45 | let input_fn = CLI.get_string ["-i"] args in 46 | let input_features = CLI.get_int ["-n"] args in 47 | let output_features, maybe_bloom = 48 | match CLI.get_string_opt ["--bloom"] args with 49 | | None -> (input_features, None) 50 | | Some k_m -> 51 | Scanf.sscanf k_m "%d,%d" (fun k m -> 52 | Utls.enforce (m < input_features) "m >= input_features"; 53 | (m, Some (Bloom.init input_features k m)) 54 | ) in 55 | CLI.finalize (); 
56 | (* CSV header made of column numbers: IC50 in first column then features *) 57 | printf "0"; 58 | for i = 1 to output_features do 59 | printf " %d" i 60 | done; 61 | printf "\n"; 62 | (* dense data lines, with optional counted Bloom filter encoding *) 63 | LO.iter input_fn (expand_line input_features maybe_bloom) 64 | 65 | let () = main () 66 | -------------------------------------------------------------------------------- /src/uniq.ml: -------------------------------------------------------------------------------- 1 | 2 | (* uniq filter: keep only line if given field was never seen before *) 3 | 4 | open Printf 5 | 6 | module CLI = Minicli.CLI 7 | module Db = Dokeysto.Db.RW 8 | module Ht = Hashtbl 9 | module Log = Dolog.Log 10 | module LO = Line_oriented 11 | module String = BatString 12 | module Utls = Molenc.Utls 13 | 14 | module type HT = sig 15 | type t 16 | val create: string -> t 17 | val mem: t -> string -> bool 18 | val add: t -> string -> unit 19 | val close: t -> unit 20 | end 21 | 22 | module HtOnDisk: HT = struct 23 | 24 | type t = Dokeysto.Db.RW.t 25 | 26 | let create input_fn = 27 | Db.create (input_fn ^ ".uniq.db") 28 | 29 | let mem db field = 30 | Db.mem db field 31 | 32 | let add db field = 33 | Db.add db field "" 34 | 35 | let close db = 36 | Db.close db 37 | end 38 | 39 | module HtInRAM: HT = struct 40 | 41 | type t = (string, unit) Ht.t 42 | 43 | let create _input_fn = 44 | (* DO NOT ever try to read the whole input file in case of --in-RAM: 45 | * we want to be able to read molecules from a UNIX pipe *) 46 | Ht.create 1_000_000 47 | 48 | let mem db field = 49 | Ht.mem db field 50 | 51 | let add db field = 52 | Ht.add db field () 53 | 54 | let close _db = 55 | () 56 | end 57 | 58 | let main () = 59 | Log.(set_log_level INFO); 60 | Log.color_on (); 61 | let argc, args = CLI.init () in 62 | if argc = 1 then 63 | begin 64 | eprintf "usage:\n\ 65 | %s\n \ 66 | -i : input file\n \ 67 | -d : field separator (default=\\t)\n \ 68 | -f : field to 
filter on\n \ 69 | [--force]: erase index files, if any\n \ 70 | [--sorted]: file already sorted on that field\n \ 71 | [--in-RAM]: Ht in RAM rather than on disk\n" 72 | Sys.argv.(0); 73 | exit 1 74 | end; 75 | let mod_db = 76 | if CLI.get_set_bool ["--in-RAM"] args then 77 | (module HtInRAM: HT) 78 | else (module HtOnDisk: HT) in 79 | let module DB = (val mod_db: HT) in 80 | let sorted = CLI.get_set_bool ["--sorted"] args in 81 | let force = CLI.get_set_bool ["--force"] args in 82 | let input_fn = CLI.get_string ["-i"] args in 83 | (if force then 84 | (Utls.rm_file (input_fn ^ ".uniq.db"); 85 | Utls.rm_file (input_fn ^ ".uniq.db.idx")) 86 | ); 87 | let db = DB.create input_fn in 88 | let prev_field = ref "" in 89 | let uniq_field_check, register_field = 90 | if sorted then 91 | ((fun field -> !prev_field <> field), 92 | (fun field -> prev_field := field)) 93 | else 94 | ((fun field -> not (DB.mem db field)), 95 | (fun field -> DB.add db field)) 96 | in 97 | let sep = CLI.get_char_def ["-d"] args '\t' in 98 | let field_num = (CLI.get_int ["-f"] args) - 1 in 99 | let count = ref 0 in 100 | LO.with_in_file input_fn (fun input -> 101 | try 102 | while true do 103 | let line = input_line input in 104 | let field_str = String.cut_on_char sep field_num line in 105 | (if uniq_field_check field_str then 106 | (register_field field_str; 107 | printf "%s\n" line) 108 | ); 109 | incr count; 110 | (if !count mod 1000 = 0 then 111 | eprintf "done: %d\r%!" 
!count 112 | ) 113 | done 114 | with End_of_file -> DB.close db 115 | ) 116 | 117 | let () = main () 118 | -------------------------------------------------------------------------------- /src/wmh_bench.ml: -------------------------------------------------------------------------------- 1 | 2 | open Printf 3 | 4 | module A = BatArray 5 | module CLI = Minicli.CLI 6 | module Fp = Molenc.Fingerprint 7 | module FpMol = Molenc.FpMol 8 | module L = BatList 9 | module Log = Dolog.Log 10 | module Utls = Molenc.Utls 11 | module WMH = Molenc.WMH 12 | 13 | let main () = 14 | Log.color_on (); 15 | Log.set_log_level Log.INFO; 16 | let argc, args = CLI.init () in 17 | if argc = 1 then 18 | (eprintf "usage:\n\ 19 | %s -i encoded_molecules.txt\n" Sys.argv.(0); 20 | exit 1); 21 | let input_fn = CLI.get_string ["-i"] args in 22 | (* read all molecules *) 23 | let molecules = FpMol.molecules_of_file input_fn in 24 | let nb_features = L.max (L.map FpMol.nb_features molecules) in 25 | let sparse_fingerprints = A.of_list (L.map FpMol.get_fp molecules) in 26 | let bounds = WMH.bounds nb_features sparse_fingerprints in 27 | let idx2feat = WMH.lookup_table bounds in 28 | let rand_bound = A.length idx2feat in 29 | let feat2acc_bound = WMH.acc_bounds_table bounds in 30 | let dense_fingerprints = A.map (WMH.to_dense nb_features) sparse_fingerprints in 31 | let n = A.length sparse_fingerprints in 32 | Log.info "read %d molecules" n; 33 | let ks = [40] in 34 | (* bench hashing and scoring speeds *) 35 | L.iter (fun k -> 36 | (* hash them (and compute hashing rate) *) 37 | let seeds = WMH.get_seeds k in 38 | let rands = WMH.gen_rands seeds rand_bound in 39 | let dt0, hashes = Utls.time_it (fun () -> 40 | A.map (WMH.hash rands idx2feat feat2acc_bound) dense_fingerprints 41 | ) in 42 | Log.info "k: %d hashing-rate: %.2f" k (float n /. 
dt0); 43 | (* compute estimated tani for the same pairs (and compute scoring rate) *) 44 | let dt2, _est_dists = Utls.time_it (fun () -> 45 | let res = A.make n 0.0 in 46 | for i = 0 to n - 1 do 47 | let i1 = Random.int n in 48 | let i2 = Random.int n in 49 | let m1 = A.get hashes i1 in 50 | let m2 = A.get hashes i2 in 51 | let tani = WMH.estimate_jaccard m1 m2 in 52 | A.set res i tani 53 | done; 54 | res) in 55 | let est_tani_rate = (float n) /. dt2 in 56 | Log.info "k: %d est-Tani-rate: %.2f" k est_tani_rate 57 | ) ks 58 | 59 | let () = main () 60 | -------------------------------------------------------------------------------- /src/wmh_test.ml: -------------------------------------------------------------------------------- 1 | 2 | open Printf 3 | 4 | module A = BatArray 5 | module CLI = Minicli.CLI 6 | module Fp = Molenc.Fingerprint 7 | module FpMol = Molenc.FpMol 8 | module L = BatList 9 | module Log = Dolog.Log 10 | module Utls = Molenc.Utls 11 | module WMH = Molenc.WMH 12 | 13 | let main () = 14 | Log.color_on (); 15 | Log.set_log_level Log.INFO; 16 | let argc, args = CLI.init () in 17 | if argc = 1 then 18 | (eprintf "usage:\n\ 19 | %s -i encoded_molecules.txt\n" Sys.argv.(0); 20 | exit 1); 21 | let input_fn = CLI.get_string ["-i"] args in 22 | (* read all molecules *) 23 | let molecules = FpMol.molecules_of_file input_fn in 24 | let nb_features = L.max (L.map FpMol.nb_features molecules) in 25 | let sparse_fingerprints = A.of_list (L.map FpMol.get_fp molecules) in 26 | let bounds = WMH.bounds nb_features sparse_fingerprints in 27 | let idx2feat = WMH.lookup_table bounds in 28 | let rand_bound = A.length idx2feat in 29 | let feat2acc_bound = WMH.acc_bounds_table bounds in 30 | let dense_fingerprints = A.map (WMH.to_dense nb_features) sparse_fingerprints in 31 | let n = A.length sparse_fingerprints in 32 | Log.info "read %d molecules" n; 33 | (* compute Tani for many pairs (and compute scoring rate) *) 34 | Random.init 12345; (* seed PRNG *) 35 | let pairs 
= A.init 10_000 (fun _i -> (Random.int n, Random.int n)) in 36 | let dists = A.make 10_000 0.0 in 37 | let dt1, () = Utls.time_it (fun () -> 38 | A.iteri (fun i (i1, i2) -> 39 | let tani = 40 | Fp.tanimoto 41 | (A.unsafe_get sparse_fingerprints i1) 42 | (A.unsafe_get sparse_fingerprints i2) in 43 | A.unsafe_set dists i tani 44 | ) pairs 45 | ) in 46 | let tani_rate = (float n) /. dt1 in 47 | Log.info "Tani-rate: %.2f" tani_rate; 48 | let ks = [10; 20; 30; 40; 50; 100; 200; 500] in 49 | (* test the correctness and bench hashing and scoring speeds 50 | as a function of k (the number of hashes) *) 51 | L.iter (fun k -> 52 | let data_fn = sprintf "k_%03d.data" k in 53 | Utls.with_out_file data_fn (fun out -> 54 | (* hash them (and compute hashing rate) *) 55 | let seeds = WMH.get_seeds k in 56 | let rands = WMH.gen_rands seeds rand_bound in 57 | let dt0, hashes = Utls.time_it (fun () -> 58 | A.map (WMH.hash rands idx2feat feat2acc_bound) dense_fingerprints 59 | ) in 60 | Log.info "k: %d hashing-rate: %11.2f" k (float n /. dt0); 61 | (* compute estimated tani for the same pairs 62 | (and compute scoring rate) *) 63 | let est_dists = A.make 10_000 0.0 in 64 | let dt2, () = Utls.time_it (fun () -> 65 | A.iteri (fun i (i1, i2) -> 66 | let m1 = A.unsafe_get hashes i1 in 67 | let m2 = A.unsafe_get hashes i2 in 68 | let tani = WMH.estimate_jaccard m1 m2 in 69 | A.unsafe_set est_dists i tani 70 | ) pairs 71 | ) in 72 | let est_tani_rate = (float n) /. dt2 in 73 | (if est_tani_rate <= tani_rate 74 | then Log.warn 75 | else Log.info) "k: %d est-Tani-rate: %.2f accel: %.2f" 76 | k est_tani_rate (est_tani_rate /. tani_rate); 77 | A.iteri (fun i exact_dist -> 78 | let abs_error = abs_float (exact_dist -. est_dists.(i)) in 79 | fprintf out "%f %f %f\n" exact_dist est_dists.(i) abs_error 80 | ) dists; 81 | (* output maximum Tani error *) 82 | let diffs = 83 | A.map2 (fun d1 d2 -> abs_float (d1 -. 
d2)) dists est_dists in 84 | let max_error = A.max diffs in 85 | let avg_error = A.favg diffs in 86 | let med_error = Utls.list_medianf (A.to_list diffs) in 87 | Log.info "k: %d error(max, avg, med): %.2f %.2f %.2f" 88 | k max_error avg_error med_error 89 | ) 90 | ) ks; 91 | Utls.run_command "gnuplot -persist tani_est.gpl" 92 | 93 | let () = main () 94 | -------------------------------------------------------------------------------- /src/wmh_unit_test.ml: -------------------------------------------------------------------------------- 1 | 2 | open Printf 3 | 4 | module A = BatArray 5 | module CLI = Minicli.CLI 6 | module Fp = Molenc.Fingerprint 7 | module FpMol = Molenc.FpMol 8 | module L = BatList 9 | module Log = Dolog.Log 10 | module Utls = Molenc.Utls 11 | module WMH = Molenc.WMH 12 | 13 | let print_array title a = 14 | printf "%s:" title; 15 | for i = 0 to (A.length a) - 1 do 16 | printf " %d" a.(i) 17 | done; 18 | printf "\n" 19 | 20 | let printi_array title a = 21 | printf "%s:" title; 22 | for i = 0 to (A.length a) - 1 do 23 | printf " %d:%d" i a.(i) 24 | done; 25 | printf "\n" 26 | 27 | let main () = 28 | Log.color_on (); 29 | Log.set_log_level Log.INFO; 30 | (* read all molecules *) 31 | let molecules = 32 | L.mapi FpMol.parse_one 33 | ["m0,0.0,[2:1;3:1]"; 34 | "m1,0.0,[0:2;1:2;2:2;3:2]"] in 35 | let nb_features = L.max (L.map FpMol.nb_features molecules) in 36 | printf "nb_features: %d\n" nb_features; 37 | let sparse_fingerprints = A.of_list (L.map FpMol.get_fp molecules) in 38 | let bounds = WMH.bounds nb_features sparse_fingerprints in 39 | print_array "bounds" bounds; 40 | let idx2feat = WMH.lookup_table bounds in 41 | printi_array "idx2feat" idx2feat; 42 | let rand_bound = A.length idx2feat in 43 | printf "bound: %d\n" rand_bound; 44 | let feat2acc_bound = WMH.acc_bounds_table bounds in 45 | print_array "feat2acc_bound" feat2acc_bound; 46 | let dense_fingerprints = A.map (WMH.to_dense nb_features) sparse_fingerprints in 47 | (* k = 1 *) 48 | let 
rands = [|0;1;2;3;5;7;4|] in (* only last one should hit *) 49 | let hash = WMH.hash [|rands|] idx2feat feat2acc_bound dense_fingerprints.(0) in 50 | let hash_val = hash.(0) in 51 | assert(hash_val = 6); 52 | let rands = [|0;1;2;3;7;5;6|] in (* only last one should hit *) 53 | let hash = WMH.hash [|rands|] idx2feat feat2acc_bound dense_fingerprints.(0) in 54 | let hash_val = hash.(0) in 55 | assert(hash_val = 6); 56 | let rands = [|4|] in (* 1st one should hit *) 57 | let hash = WMH.hash [|rands|] idx2feat feat2acc_bound dense_fingerprints.(0) in 58 | let hash_val = hash.(0) in 59 | assert(hash_val = 0); 60 | let rands = [|6|] in (* 1st one should hit *) 61 | let hash = WMH.hash [|rands|] idx2feat feat2acc_bound dense_fingerprints.(0) in 62 | let hash_val = hash.(0) in 63 | assert(hash_val = 0) 64 | 65 | let () = main () 66 | -------------------------------------------------------------------------------- /tani_est.gpl: -------------------------------------------------------------------------------- 1 | 2 | set xrange [0:1] 3 | set yrange [0:1] 4 | 5 | set size square 6 | 7 | set xlabel 'Exact Tanimoto' 8 | set ylabel 'Estimated Tanimoto' 9 | 10 | set key outside 11 | 12 | f(x) = x 13 | 14 | plot f(x) not, \ 15 | 'k_010.data' u 1:2 t 'k=10' , \ 16 | 'k_020.data' u 1:2 t 'k=20' , \ 17 | 'k_030.data' u 1:2 t 'k=30' , \ 18 | 'k_040.data' u 1:2 t 'k=40' , \ 19 | 'k_050.data' u 1:2 t 'k=50' , \ 20 | 'k_500.data' u 1:2 t 'k=500' 21 | 22 | # 'k_100.data' u 1:2 t 'k=100' 23 | # 'k_200.data' u 1:2 t 'k=200' 24 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #set -x # DEBUG 4 | 5 | # encoding an SDF or a SMILES file is the same 6 | # and it is the one we expect 7 | diff <(./bin/molenc_type_atoms.py data/caff_coca.sdf) data/caff_coca_types.ref 8 | diff <(./bin/molenc_type_atoms.py data/caff_coca.smi) data/caff_coca_types.ref 9 | 
10 | # ph4 features are the same as the ones extracted by ShowFeats.py 11 | # (that were checked by hand and stored in a reference file) 12 | diff <(./bin/molenc_ph4_type_atoms.py data/caff_coca.sdf) data/caff_coca_feats.ref 13 | # PubChem decoder: its output must match the reference file 14 | diff <(_build/default/src/pubchem_decoder.exe -i data/test_in.pbc -o /dev/stdout) data/test_out.ref 15 | 16 | # atom pairs encoder tests 17 | rm -f data/AP_test.smi.dix data/AP_test.txt # clean any previous run 18 | molenc.sh --pairs -i data/AP_test.smi -o data/AP_test.txt 19 | diff data/AP_test.smi.dix data/AP_test.smi.dix.ref 20 | diff data/AP_test.txt data/AP_test.txt.ref 21 | -------------------------------------------------------------------------------- /test_BBAD.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #set -x # DEBUG 4 | 5 | # check some properties of the AP-BBAD 6 | 7 | # 1) the BBAD of a single molecule is the encoded single molecule 8 | rm -f caffeine_AP_BBAD.txt 9 | _build/default/src/AP_BBAD.exe -i data/caffeine.smi -o caffeine_AP_BBAD.txt 10 | awk -v sum=0 -F' ' '{sum += $2}END{if(sum == 105){print "|features| OK"}}' caffeine_AP_BBAD.txt 11 | # NOTE(review): the awk check above prints nothing on a mismatch, so a failure is silent 12 | # 2) the BBAD computed in parallel is the same as the sequential one 13 | rm -f seq_AD.txt par_AD.txt 14 | _build/default/src/AP_BBAD.exe -i data/chembl1868_std.smi -o seq_AD.txt -np 1 15 | nprocs=`getconf _NPROCESSORS_ONLN` 16 | _build/default/src/AP_BBAD.exe -i data/chembl1868_std.smi -o par_AD.txt -np ${nprocs} 17 | diff seq_AD.txt par_AD.txt 18 | 19 | # 3) compute a simple BBAD by hand; check this is the one we obtain 20 | rm -f data/alcools.AD.curr 21 | _build/default/src/AP_BBAD.exe -i data/alcools.smi -o data/alcools.AD.curr 22 | diff data/alcools.AD.curr data/alcools.AD.ref 23 | 24 | # 4) the BBAD of some molecules doesn't filter out any of those molecules 25 | rm -f filtered.txt 26 | _build/default/src/AP_BBAD.exe --bbad seq_AD.txt -i data/chembl1868_std.smi -o filtered.txt -np ${nprocs} 27 | diff 
<(cat data/chembl1868_std.smi | wc -l) <(cat filtered.txt | wc -l) 28 | 29 | # 5) the BBAD union for two sets of molecules should be the same as the AD for the union of the sets 30 | rm -f head_AD.txt tail_AD.txt head_tail_AD_union.txt head_tail_AD.txt 31 | _build/default/src/AP_BBAD.exe -i <(head data/chembl1868_std.smi) -o head_AD.txt -np ${nprocs} 32 | _build/default/src/AP_BBAD.exe -i <(tail data/chembl1868_std.smi) -o tail_AD.txt -np ${nprocs} 33 | _build/default/src/AP_BBAD.exe --bbad head_AD.txt,tail_AD.txt -o head_tail_AD_union.txt 34 | _build/default/src/AP_BBAD.exe -i <(head data/chembl1868_std.smi; tail data/chembl1868_std.smi) \ 35 | -o head_tail_AD.txt 36 | diff head_tail_AD_union.txt head_tail_AD.txt 37 | -------------------------------------------------------------------------------- /test_sdf_read.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x # DEBUG 4 | 5 | # check that we correctly parse 10 3D molecules in a .sdf file 6 | rm -f data/chembl30_10mols.txt.curr 7 | _build/default/src/sdf_read.exe data/chembl30_10mols.sdf \ 8 | > data/chembl30_10mols.txt.curr 9 | diff data/chembl30_10mols.txt.ref data/chembl30_10mols.txt.curr 10 | -------------------------------------------------------------------------------- /test_uhd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # regression test for the UHD fingerprint 4 | make 5 | 6 | # clean up any prior run 7 | \rm -f data/ethanol.uhd data/ethanol.smi.dix 8 | 9 | # run 10 | _build/default/src/molenc_UHD.exe -f -i data/ethanol.smi -o data/ethanol.uhd 11 | 12 | # check against the reference files 13 | diff data/ethanol.uhd data/ethanol.uhd.ref 14 | diff data/ethanol.smi.dix data/ethanol.uhd.dix.ref 15 | --------------------------------------------------------------------------------