├── model ├── __init__.py ├── input_fn.py ├── utils.py ├── mlp_model_fn.py ├── train_fn.py ├── ae_model_fn.py └── evaluate_fn.py ├── .gitignore ├── experiments ├── mlp_model │ └── params.json └── ae_mlp_model │ └── params.json ├── README.md ├── synthesize_results.py ├── scrap.py ├── train.py └── prepare_load_dataset.py /model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/cv_* 2 | **/*.log 3 | **/.pyc 4 | *.ipynb* 5 | __pycache__/ 6 | species.txt 7 | *data/ -------------------------------------------------------------------------------- /experiments/mlp_model/params.json: -------------------------------------------------------------------------------- 1 | {"mlp": 2 | {"num_fc_layers":2, 3 | "fc_hidden_units": [200, 150], 4 | "activation": "tf.nn.relu", 5 | "dropout_probs": [0.45, 0.15], 6 | "batch_size": 64, 7 | "learning_rate" : 0.00015, 8 | "num_epochs": 100, 9 | "threshold" : 0.5, 10 | "best_model_metric" : "loss", 11 | "save_frequency": 5}, 12 | "preprocess": 13 | {"method" : "linear", 14 | "order" : 1 15 | }, 16 | "n_splits" : 5, 17 | "train_ae" : 0 18 | } 19 | -------------------------------------------------------------------------------- /experiments/ae_mlp_model/params.json: -------------------------------------------------------------------------------- 1 | {"mlp": 2 | {"num_fc_layers":2, 3 | "fc_hidden_units": [200, 150], 4 | "activation": "tf.nn.relu", 5 | "dropout_probs": [0.45, 0.15], 6 | "batch_size": 64, 7 | "learning_rate" : 0.00015, 8 | "num_epochs": 100, 9 | "threshold" : 0.5, 10 | "best_model_metric" : "loss", 11 | "save_frequency": 5}, 12 | "ae": 13 | {"num_fc_layers":1, 14 | "fc_hidden_units": [256], 15 | "activation": "tf.nn.relu", 16 | "is_denoising" : 0, 17 | "denoise_inputs" : 0.05, 18 | "batch_size": 64, 19 | "learning_rate" : 0.001, 20 | "num_epochs": 100, 21 | "best_model_metric" : "loss", 22 | "save_frequency": 5}, 23 | "preprocess": 24 | {"method" : "linear", 25 | "order" : 1 26 | }, 27 | "n_splits" : 5, 28 | "train_ae" : 1 29 | } 30 | -------------------------------------------------------------------------------- /model/input_fn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def input_fn(is_training, data, params): 5 | ''' Input function for spectra dataset 6 | 7 | Args: 8 | is_training: (bool) Whether it is training or not 9 | data: (tuple) containing (spectra, target) arrays. 
10 |         params: (dict) Hyperparameters of the model
11 | 
12 |     Returns:
13 |         inputs: (dict) Contains the iterator and data to be fed to the model
14 | 
15 |     '''
16 | 
17 |     #Shuffle training dataset (buffer covers all samples; data is a (spectra, target) tuple)
18 |     if is_training:
19 |         dataset = tf.data.Dataset.from_tensor_slices(data)\
20 |                     .shuffle(data[0].shape[0])\
21 |                     .batch(params['batch_size'])\
22 |                     .prefetch(1)
23 |     else:
24 |         dataset = tf.data.Dataset.from_tensor_slices(data)\
25 |                     .batch(params['batch_size'])\
26 |                     .prefetch(1)
27 | 
28 |     #Create initializable iterator to re-feed data after every epoch
29 |     iterator = dataset.make_initializable_iterator()
30 |     spectra_data, target = iterator.get_next()
31 | 
32 |     iterator_initializer_op = iterator.initializer
33 | 
34 |     inputs = {'spectra_data' : spectra_data, 'target' : target, 'iterator_initializer': iterator_initializer_op}
35 | 
36 |     return inputs
37 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CANDIY-spectrum
2 | 
3 | Human analysis of chemical spectra such as Mass Spectra (MS), Infra-Red Spectra (FTIR), and Nuclear Magnetic Resonance (NMR) is both time-consuming and potentially inaccurate. This project aims to develop a set of methodologies that combine these spectra to predict chemical functional groups and structures.
4 | 
5 | This project is a stub, but we hope that it will spur development of machine learning methods for the analysis of chemical spectra.
6 | 
7 | 
8 | ## Required Packages
9 | 1) Numpy==1.18.5
10 | 2) Rdkit==2020.03.1
11 | 3) Pandas==1.0.5
12 | 4) Jcamp
13 | 5) Tensorflow==1.15.0
14 | 6) Matplotlib==3.1.1
15 | 7) Scikit-learn==0.22.1
16 | 
17 | ## Scraping
18 | ### Manual Scraping
19 | IR and MS spectra were downloaded from the NIST WebBook: https://webbook.nist.gov/chemistry/.
20 | Scraping can be done by inserting the correct CAS number into the placeholder URL https://webbook.nist.gov/cgi/cbook.cgi?ID="insert_cas"&Units=SI and downloading the required spectra.
21 | 
22 | (Or)
23 | ### Automatic Scraping
24 | Download all the species names available in NIST from https://webbook.nist.gov/chemistry/download/. Change the path passed to --cas_list to wherever the species file is stored.
25 | ```
26 | python scrap.py --save_dir='./data/' --cas_list='species.txt' --scrap_IR=true --scrap_MS=true --scrap_InChi=true
27 | ```
28 | 
29 | ## Prepare dataset
30 | Parse all jdx files of IR and Mass spectra, standardize them, and store them in csv format. Also parse inchi.txt to create a target csv indicating the presence of functional groups.
31 | 
32 | ```
33 | python prepare_load_dataset.py --data_dir='./data/' --cas_list='species.txt'
34 | ```
35 | 
36 | ## Train the model
37 | Run train.py to train the model. The model directory should contain a params.json file listing all hyperparameters used for building the model. Optionally, weights can be restored from a pretrained model.
38 | 
39 | ```
40 | python train.py --model_dir=./experiments/mlp_model --data_dir=./data/
41 | ```
42 | 
--------------------------------------------------------------------------------
/model/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | 
4 | from sklearn.model_selection import KFold
5 | 
6 | def set_logger(model_dir, log_name):
7 |     '''Set logger to write info to terminal and save in a file.
8 | 
9 |     Args:
10 |         model_dir: (string) path to store the log file
11 | 
12 |     Returns:
13 |         logger: (logging.Logger) logger writing to the terminal and to model_dir/log_name
14 |     '''
15 |     logger = logging.getLogger()
16 |     logger.setLevel(logging.INFO)
17 | 
18 |     #Don't create redundant handlers every time set_logger is called
19 |     if not logger.handlers:
20 | 
21 |         #File handler with debug level stored in model_dir/log_name
22 |         fh = logging.FileHandler(os.path.join(model_dir, log_name))
23 |         fh.setLevel(logging.DEBUG)
24 |         fh.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s'))
25 |         logger.addHandler(fh)
26 | 
27 |         #Stream handler with info level written to terminal
28 |         sh = logging.StreamHandler()
29 |         sh.setLevel(logging.INFO)
30 |         sh.setFormatter(logging.Formatter('%(message)s'))
31 |         logger.addHandler(sh)
32 | 
33 |     return logger
34 | 
35 | def train_test_generator(X, y, n_splits):
36 |     '''
37 |     Create a generator to return the next train and test data split when called
38 | 
39 |     Args:
40 |         X: (np.array) of dimension [num_samples x features]
41 |         y: (np.array) of dimension [num_samples x target groups]
42 |         n_splits: (int) Number of cross validation folds
43 | 
44 |     Returns:
45 |         (X_train, y_train): (tuple) of np.arrays containing single fold of train data
46 |         (X_test, y_test): (tuple) of np.arrays containing single fold of test data
47 |     '''
48 | 
49 |     kfold = KFold(n_splits=n_splits, shuffle=True, random_state=4)
50 | 
51 |     for train_index, val_index in kfold.split(X, y):
52 |         X_train, X_val = X[train_index], X[val_index]
53 |         y_train, y_val = y[train_index], y[val_index]
54 | 
55 |         yield (X_train, y_train), (X_val, y_val)
56 | 
57 | 
58 | 
--------------------------------------------------------------------------------
/synthesize_results.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import numpy as np
4 | import pandas as pd
5 | from sklearn.metrics import precision_recall_curve,f1_score, accuracy_score
6 | 
7 | 
8 | def compute_thresholds(data_ls):
9 |     '''Compute dynamic thresholds for every functional group using val data
10 | 
11 |     Args:
12 |         data_ls: (list) containing val predictions and target of all folds
13 | 
14 |     Returns:
15 |         thresholds: (np.array) containing the thresholds of groups
16 |     '''
17 |     logging.info('Computing Thresholds')
18 |     #Combine all val data into a single array
19 |     data_arr = np.concatenate(data_ls, axis = 1)
20 |     data_target = data_arr[1]
21 |     data_probs = data_arr[0]
22 | 
23 |     num_func_groups = data_target.shape[1]
24 |     thresholds = np.zeros((1, num_func_groups))
25 |     eps = 1e-7
26 | 
27 |     #Find threshold resulting in maximum f1 score for each functional group
28 |     for i in range(num_func_groups):
29 |         pre,rec,thre=precision_recall_curve(data_target[:,i],data_probs[:, i])
30 |         f1 = 2*pre*rec/(pre+rec+eps)
31 |         max_ind = np.argmax(f1)
32 |         thresholds[0,i] = thre[max_ind]
33 |     return thresholds
34 | 
35 | def compute_metrics(data_ls, thresholds, func_names):
36 |     '''Compute metrics for every fold of train and val data
37 | 
38 |     Args:
39 |         data_ls: (list) containing predictions and target of all folds
40 |         thresholds: (np.array) of every functional group
41 |         func_names: (list) names of the functional groups in the target
42 | 
43 |     Returns:
44 |         mol_score_df: (pd.DataFrame) contains mean and std of mol. scores
45 |         func_f1_df: (pd.DataFrame) contains mean and std of func.
f1 score 46 | ''' 47 | 48 | logging.info('Computing func_f1, mol_f1 and mol_perfection metrics') 49 | num_folds = len(data_ls) 50 | num_groups = thresholds.shape[1] 51 | func_f1= np.zeros((num_folds, num_groups)) 52 | mol_score = np.zeros((num_folds, 2)) 53 | 54 | #Using thresholds find func_f1, mol_f1 and mol_perfection for all folds 55 | for ind, data_fold in enumerate(data_ls): 56 | fold_target = data_fold[1] 57 | fold_preds = (data_fold[0]>thresholds).astype('int') 58 | func_f1[ind,:] = f1_score(fold_target, fold_preds, average = None) 59 | mol_score[ind,1] = f1_score(fold_target, fold_preds, average = 'samples') 60 | mol_score[ind,0] = accuracy_score(fold_target, fold_preds) 61 | 62 | overall_score = np.array([np.mean(mol_score, axis = 0), np.std(mol_score,axis = 0)]) 63 | overall_f1 = np.array([np.mean(func_f1, axis = 0), np.std(func_f1, axis = 0)]) 64 | 65 | print (overall_f1.shape, overall_score.shape) 66 | #Create a dataframe with the results 67 | func_f1_df = pd.DataFrame(overall_f1, columns = func_names, index=['mean', 'std']).T 68 | mol_score_df = pd.DataFrame(overall_score, columns = ['mol. perfection', 'mol. f1'], index=['mean', 'std']).T 69 | 70 | return mol_score_df, func_f1_df 71 | 72 | 73 | def store_results(train_predictions, test_predictions, func_group_names, save_path): 74 | '''Store results in a csv file 75 | 76 | Args: 77 | data_ls: (list) containing predictions and target of all folds 78 | thresholds: (np.array) of every functional group 79 | func_names: (list) used as part of target 80 | 81 | Returns: 82 | None 83 | ''' 84 | thresholds = compute_thresholds(test_predictions) 85 | train_score_df,train_f1_df = compute_metrics(train_predictions, thresholds, func_group_names) 86 | test_score_df,test_f1_df = compute_metrics(test_predictions, thresholds, func_group_names) 87 | 88 | f1_df = pd.concat([train_f1_df, test_f1_df], keys = ['Train', 'Val'], axis = 1) 89 | perf_df = pd.concat([train_score_df, test_score_df], keys = ['Train', 'Val'], axis = 1) 90 | 91 | logging.info('Storing results in {}'.format(save_path)) 92 | f1_df.to_csv(os.path.join(save_path, 'func_f1.csv')) 93 | perf_df.to_csv(os.path.join(save_path, 'mol_score.csv')) 94 | 95 | -------------------------------------------------------------------------------- /scrap.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests, urllib 3 | import argparse 4 | import logging 5 | import pandas as pd 6 | 7 | from model.utils import set_logger 8 | 9 | nist_url = "https://webbook.nist.gov/cgi/cbook.cgi" 10 | 11 | def scrap_data(cas_ls, params, data_dir): 12 | '''Collect data from NIST database and store them in jdx format. 13 | 14 | Args: 15 | cas_ls: (list) CAS ids to download data for 16 | params: (dict) queries to be added to url 17 | data_dir: (string) path to store the data 18 | 19 | Returns: 20 | None 21 | ''' 22 | 23 | #Create directory for the relevant spetra 24 | spectra_path = os.path.join(data_dir, params['Type'].lower(), '') 25 | if not os.path.exists(spectra_path): 26 | os.makedirs(spectra_path) 27 | 28 | num_created = 0 29 | for cas_id in cas_ls: 30 | params['JCAMP'] = 'C' + cas_id 31 | response = requests.get(nist_url, params=params) 32 | 33 | if response.text == '##TITLE=Spectrum not found.\n##END=\n': 34 | continue 35 | num_created+=1 36 | logging.info('Creating {} spectra for id: {}. 
Total spectra created {}'.format(params['Type'].lower(), cas_id, num_created)) 37 | with open(spectra_path +cas_id +'.jdx', 'wb') as data: 38 | data.write(response.content) 39 | 40 | def scrap_inchi(cas_ls, params, data_dir): 41 | '''Collect Inchi keys from NIST database and store them in txt format. 42 | 43 | Args: 44 | cas_ls: (list) CAS ids to download data for 45 | params: (dict) queries to be added to url 46 | data_dir: (string) path to store the data 47 | 48 | Returns: 49 | None 50 | ''' 51 | 52 | #Create file path for storing inchi keys 53 | inchi_path = os.path.join(data_dir, 'inchi.txt') 54 | num_created = 0 55 | with open(inchi_path,'a') as file: 56 | content = '{}\t{}\n'.format('cas_id', 'inchi') 57 | file.write(content) 58 | 59 | for cas_id in cas_ls: 60 | params['GetInChI'] = 'C' + cas_id 61 | response = requests.get(nist_url, params=params) 62 | 63 | num_created+=1 64 | logging.info('Creating InChi key for id: {}. Total keys created {}'.format(cas_id, num_created)) 65 | content = '{}\t{}\n'.format(cas_id,response.content.decode("utf-8")) 66 | file.write(content) 67 | 68 | 69 | 70 | 71 | 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument('--save_dir', default= './data',\ 74 | help = "Directory path to store scrapped data") 75 | parser.add_argument('--cas_list', default= 'species.txt',\ 76 | help = "File containing CAS number and formula of molecules") 77 | parser.add_argument('--scrap_IR', default= True,\ 78 | help = "Whether to download IR or not") 79 | parser.add_argument('--scrap_MS', default= True,\ 80 | help = "Whether to download MS or not") 81 | parser.add_argument('--scrap_InChi', default= True,\ 82 | help = "Whether to download InChi or not") 83 | 84 | args = parser.parse_args() 85 | 86 | #Check if file containing CAS ids exist 87 | assert os.path.isfile(args.cas_list),"No file named {} exists".format(args.cas_list) 88 | 89 | #Create data directory to store logs and spectra 90 | data_dir = args.save_dir 91 | if not os.path.exists(data_dir): 92 | os.makedirs(data_dir) 93 | 94 | set_logger(data_dir, 'scrap.log') 95 | 96 | #Obtain CAS ids used for downloading the content from NIST 97 | logging.info('Loading CAS file') 98 | cas_df = pd.read_csv(args.cas_list, sep='\t', names = ['name', 'formula', 'cas'], header = 0) 99 | cas_df.dropna(subset=['cas'], inplace=True) 100 | cas_df.cas = cas_df.cas.str.replace('-', '') 101 | 102 | cas_ids = list(cas_df.cas) 103 | 104 | 105 | 106 | 107 | logging.info('Scrap Mass spectra') 108 | if args.scrap_MS: 109 | params = params={'JCAMP': '', 'Index': 0, 'Type': 'Mass'} 110 | scrap_data(cas_ids, params, data_dir) 111 | 112 | logging.info('Scrap IR spectra') 113 | if args.scrap_IR: 114 | params={'JCAMP': '', 'Type': 'IR', 'Index': 0} 115 | scrap_data(cas_ids, params, data_dir) 116 | 117 | logging.info('Scrap InChi keys') 118 | if args.scrap_InChi: 119 | params={} 120 | scrap_inchi(cas_ids, params, data_dir) 121 | -------------------------------------------------------------------------------- /model/mlp_model_fn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def build_mlp_model(is_training, inputs, params): 4 | '''Build forward model and compute logits 5 | 6 | Args: 7 | is_training: (tf.placeholder) indicates training or evaluation 8 | inputs: (dict) contains tensors of inputs and labels fed to the graph 9 | params: (dict) hyperparameters of the model 10 | 11 | Returns: 12 | output: (tf.tensor) logits of the model 13 | 14 | ''' 15 | 16 | #Read all 
hyperparameters 17 | num_fc_layers = params['num_fc_layers'] 18 | fc_hidden_units = params['fc_hidden_units'] 19 | activation = params['activation'] 20 | dropout_probs = params['dropout_probs'] 21 | dropout_layer = inputs 22 | output_shape = params['output_shape'] 23 | 24 | #Construct hidden layers of the forward model 25 | for layer in range(num_fc_layers): 26 | with tf.variable_scope('fc_{}'.format(layer+1)): 27 | hidden_layer = tf.layers.dense(dropout_layer, fc_hidden_units[layer]) 28 | batch_norm_layer = tf.layers.batch_normalization(hidden_layer, training = is_training) 29 | activation_layer = eval(activation)(batch_norm_layer) 30 | dropout_layer = tf.layers.dropout(activation_layer, rate = dropout_probs[layer],training = is_training) 31 | 32 | 33 | 34 | #Compute output of the model 35 | with tf.variable_scope('output'): 36 | output = tf.layers.dense(dropout_layer, output_shape, None) 37 | 38 | return output 39 | 40 | 41 | def mlp_model_fn(is_training, inputs, params): 42 | 43 | '''Define graph operations for training and evaluating 44 | 45 | Args: 46 | is_training: (bool) indicates training or evaluation 47 | inputs: (dict) contains tensors of inputs and labels fed to the graph 48 | reuse: (bool) To or not to reuse the variables with same name 49 | params: (dict) hyperparameters of the model 50 | 51 | Returns: 52 | model_spec: (dict) Contains the operations needed for training and evaluating the model 53 | 54 | ''' 55 | target = inputs['target'] 56 | spectra_data = inputs['spectra_data'] 57 | is_train_ph = tf.placeholder_with_default(is_training, shape=()) #Define a placeholder for setting mode during evaluation 58 | params['output_shape'] = target.shape[1] 59 | num_functional_groups = tf.cast(target.shape[1], tf.float64) 60 | 61 | #Compute logits and make predictions 62 | with tf.variable_scope('model', reuse = not is_training): 63 | logits = build_mlp_model(is_train_ph, spectra_data, params) 64 | pred_probs = tf.sigmoid(logits) 65 | predictions = tf.cast(tf.greater_equal(pred_probs, params['threshold']), tf.float64) 66 | 67 | #Binary cross entropy loss computed across every dimension for multi label classification 68 | loss = tf.reduce_mean(tf.losses.sigmoid_cross_entropy(target, logits)) 69 | num_correct_predictions = tf.reduce_sum(tf.cast(tf.equal(target, predictions),tf.float64), axis = 1)/num_functional_groups 70 | accuracy = tf.reduce_mean(tf.cast(tf.equal(num_correct_predictions, 1.0), tf.float64)) 71 | 72 | 73 | 74 | if is_training: 75 | optimizer = tf.train.AdamOptimizer(params['learning_rate']) 76 | global_step = tf.train.get_or_create_global_step() 77 | 78 | #Perform update_op to update moving mean and variance before minimizing the loss 79 | update_op = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 80 | with tf.control_dependencies(update_op): 81 | train_op = optimizer.minimize(loss, global_step = global_step) 82 | 83 | 84 | with tf.variable_scope('metrics'): 85 | metrics = {'loss' : tf.metrics.mean(loss), 86 | 'accuracy' : tf.metrics.mean(accuracy)} 87 | 88 | 89 | 90 | #Group all metrics update ops 91 | metrics_update_op = tf.group(*[metric[1] for _, metric in metrics.items()]) 92 | 93 | #Collect all metrics variables to initialize before every epoch 94 | metrics_variables = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="metrics") 95 | metrics_initializer_op = tf.variables_initializer(metrics_variables) 96 | 97 | tf.summary.scalar('loss', loss) 98 | tf.summary.scalar('accuracy', accuracy) 99 | 100 | model_spec = inputs 101 | model_spec['loss'] = loss 102 | 
model_spec['pred_probs'] = pred_probs 103 | model_spec['metrics'] = metrics 104 | model_spec['metric_initializer_op'] = metrics_initializer_op 105 | model_spec['metrics_update_op'] = metrics_update_op 106 | model_spec['summary_op'] = tf.summary.merge_all() 107 | model_spec['variables_init_op'] = tf.global_variables_initializer() 108 | model_spec['train_ph'] = is_train_ph 109 | 110 | 111 | if is_training: 112 | model_spec['train_op'] = train_op 113 | 114 | return model_spec -------------------------------------------------------------------------------- /model/train_fn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from tqdm import trange 4 | import tensorflow as tf 5 | 6 | from .evaluate_fn import evaluate_sess 7 | 8 | def train_sess(sess, model_spec, num_steps, writer, params): 9 | '''Train the model on the data for one epoch 10 | 11 | Args: 12 | sess: (tf.Session) indicates current session 13 | model_spec: (dict) contains all graph operations for training the model 14 | params: (dict) hyperparameters of the model 15 | num_steps: (int) Number of batches 16 | writer: (tf.summary.FileWriter) writer for storing summaries 17 | 18 | Returns: 19 | None 20 | 21 | ''' 22 | 23 | #Collect all update ops and metrics 24 | metrics_update_op = model_spec['metrics_update_op'] 25 | train_op = model_spec['train_op'] 26 | summary_op = model_spec['summary_op'] 27 | loss = model_spec['loss'] 28 | metrics = model_spec['metrics'] 29 | global_step = tf.train.get_global_step() 30 | 31 | #Initialize the dataset iterator and metrics local variables 32 | sess.run(model_spec['iterator_initializer']) 33 | sess.run(model_spec['metric_initializer_op']) 34 | 35 | progress_bar = trange(num_steps, position = 0, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') 36 | for step in progress_bar: 37 | if step%params['save_frequency']==0: 38 | _,_,summary,loss_val,global_step_val=sess.run([train_op, metrics_update_op, summary_op, loss, global_step]) 39 | writer.add_summary(summary, global_step_val) 40 | else: 41 | _,_,loss_val=sess.run([train_op, metrics_update_op, loss]) 42 | progress_bar.set_postfix(loss=round(loss_val,6)) 43 | 44 | #Compute metrics over entire training data 45 | train_metrics_values = sess.run({key : val[0] for key, val in metrics.items()}) 46 | train_metrics_string = ' '.join(['{} : {:.6f}'.format(key, val) for key, val in train_metrics_values.items()]) 47 | logging.info("- Train metrics: "+ train_metrics_string) 48 | 49 | 50 | 51 | def train_and_save(train_model_spec, eval_model_spec, model_dir, params, restore_weights = None): 52 | '''Train the model and save the weights of last 5 epochs and the best epoch 53 | 54 | Args: 55 | train_model_spec: (dict) contains all graph operations for training the model 56 | eval_model_spec: (dict) contains all graph operations for evaluating the model 57 | model_dir: (string) directory path to store weights and summaries 58 | params: (dict) hyperparameters of the model 59 | restore_weights: (string) directory path to restore weights from 60 | 61 | Returns: 62 | None 63 | ''' 64 | #Initiliaze the saver 65 | last_saver = tf.train.Saver() 66 | best_saver = tf.train.Saver(max_to_keep=1) 67 | 68 | with tf.Session() as sess: 69 | begin_epoch = 0 70 | sess.run(train_model_spec['variables_init_op']) 71 | if restore_weights is not None: 72 | #Restore weights from model_dir/restore_weights 73 | restore_dir = os.path.join(model_dir, restore_weights) 74 | logging.info('Restoring weights from 
{}'.format(restore_dir)) 75 | latest_ckpt = tf.train.latest_checkpoint(restore_dir) 76 | begin_epoch = int(latest_ckpt.split('-')[-1]) 77 | last_saver.restore(sess, latest_ckpt) 78 | 79 | #Create summary writer for training and evaluation 80 | train_writer = tf.summary.FileWriter(os.path.join(model_dir, 'train_summary'), sess.graph) 81 | eval_writer = tf.summary.FileWriter(os.path.join(model_dir, 'eval_summary'), sess.graph) 82 | 83 | if params['best_model_metric'] == 'acc': 84 | eval_name = 'accuracy' 85 | eval_comp = '>' 86 | best_eval_val = 0 87 | 88 | else : 89 | eval_name = 'loss' 90 | eval_comp = '<' 91 | best_eval_val = 1e4 92 | 93 | 94 | for epoch in range(begin_epoch, begin_epoch + params['num_epochs']): 95 | logging.info('Epoch {}/{}'.format(epoch+1, begin_epoch + params['num_epochs'])) 96 | num_steps = (params['train_size'] + params['batch_size'] - 1)//params['batch_size'] 97 | train_sess(sess, train_model_spec, num_steps, train_writer, params) 98 | 99 | num_steps = (params['eval_size'] + params['batch_size'] - 1)//params['batch_size'] 100 | eval_metrics = evaluate_sess(sess, eval_model_spec, num_steps, eval_writer) 101 | 102 | last_save_path = os.path.join(model_dir, 'last_weights', 'epoch') 103 | last_saver.save(sess, last_save_path, global_step = epoch+1) 104 | 105 | #Update the weights with current best model 106 | if eval(str(eval_metrics[eval_name]) + eval_comp + str(best_eval_val)): 107 | best_eval_val = eval_metrics[eval_name] 108 | 109 | best_save_path = os.path.join(model_dir, 'best_weights', 'epoch') 110 | best_saver.save(sess, best_save_path, global_step = epoch+1) 111 | logging.info('- Found new best {}. Saving in {}'.format(eval_name, best_save_path)) -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 3 | import logging 4 | import argparse 5 | import json 6 | 7 | import tensorflow as tf 8 | tf.get_logger().setLevel('ERROR') 9 | 10 | from model.utils import set_logger, train_test_generator 11 | from model.input_fn import input_fn 12 | from model.ae_model_fn import ae_model_fn 13 | from model.mlp_model_fn import mlp_model_fn 14 | from model.train_fn import train_and_save 15 | from model.evaluate_fn import evaluate_and_predict 16 | from prepare_load_dataset import load_dataset 17 | from synthesize_results import store_results 18 | 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--model_dir', default= './experiments/ae_mlp_model',\ 21 | help = "Directory path containing params.json and to store summary and weights") 22 | parser.add_argument('--data_dir', default= './data',\ 23 | help = "Directory path containing IR and MS spectra data") 24 | parser.add_argument('--restore_ae_from', default= None,\ 25 | help = "Restore AE weights before training the model") 26 | parser.add_argument('--restore_mlp_from', default= None,\ 27 | help = "Restore MLP weights before training the model") 28 | 29 | args = parser.parse_args() 30 | 31 | #Model directory should contain params.json file listing all hyperparameters 32 | json_path = os.path.join(args.model_dir, 'params.json') 33 | assert os.path.isfile(json_path),"No params.json found at {} path".format(args.model_dir) 34 | 35 | with open(json_path) as json_data: 36 | params = json.load(json_data) 37 | 38 | set_logger(args.model_dir, 'train.log') 39 | 40 | logging.info('Load the dataset from {}'.format(args.data_dir)) 41 | X, y, func_names 
= load_dataset(args.data_dir, True, **params['preprocess']) 42 | 43 | 44 | #Train and test generator for every fold 45 | data_generator = train_test_generator(X, y, params['n_splits']) 46 | 47 | train_predictions = [] 48 | test_predictions = [] 49 | 50 | for cv, (train_data, test_data) in enumerate(data_generator): 51 | logging.info('Starting fold {}'.format(cv+1)) 52 | train_size = train_data[0].shape[0] 53 | eval_size = test_data[0].shape[0] 54 | 55 | if params['train_ae']: 56 | tf.reset_default_graph() 57 | logging.info('Training autoencoder to compute embeddings') 58 | 59 | ae_params = params['ae'] 60 | ae_params['train_size'] = train_size 61 | ae_params['eval_size'] = eval_size 62 | 63 | logging.info('Creating the inputs for the model') 64 | train_inputs = input_fn(True, train_data, ae_params) 65 | eval_inputs = input_fn(False, test_data, ae_params) 66 | 67 | logging.info('Building the model') 68 | train_model = ae_model_fn(True, train_inputs, ae_params) 69 | eval_model = ae_model_fn(False, eval_inputs, ae_params) 70 | 71 | 72 | logging.info('Start training {} epochs'.format(params['ae']['num_epochs'])) 73 | model_dir = os.path.join(args.model_dir, 'cv_' + str(cv+1), 'ae') 74 | train_and_save(train_model, eval_model, model_dir, ae_params, restore_weights = args.restore_ae_from) 75 | 76 | #Update spectra data with embeddings computed from the model 77 | logging.info('Compute embeddings of the spectra data') 78 | emb_params = {'restore_path' :os.path.join(model_dir,'best_weights'), 'params' :ae_params,\ 79 | 'layer_name' :'embeddings', 'evaluate_model' :False} 80 | 81 | train_data = evaluate_and_predict(train_model, is_train_data = True, **emb_params) 82 | test_data = evaluate_and_predict(eval_model, is_train_data = False, **emb_params) 83 | 84 | tf.reset_default_graph() 85 | logging.info('Training MLP model') 86 | 87 | mlp_params = params['mlp'] 88 | mlp_params['train_size'] = train_size 89 | mlp_params['eval_size'] = eval_size 90 | 91 | 92 | logging.info('Creating the inputs for the model') 93 | train_inputs = input_fn(True, train_data, mlp_params) 94 | eval_inputs = input_fn(False, test_data, mlp_params) 95 | 96 | logging.info('Building the model') 97 | train_model = mlp_model_fn(True, train_inputs, mlp_params) 98 | eval_model = mlp_model_fn(False, eval_inputs, mlp_params) 99 | 100 | logging.info('Start training {} epochs'.format(params['mlp']['num_epochs'])) 101 | model_dir = os.path.join(args.model_dir, 'cv_' + str(cv+1), 'mlp') 102 | train_and_save(train_model, eval_model, model_dir, mlp_params, restore_weights = args.restore_mlp_from) 103 | 104 | logging.info('Compute prediction probabilities of the spectra data') 105 | pred_params = {'restore_path' :os.path.join(model_dir,'best_weights'), 'params' :mlp_params,\ 106 | 'layer_name' :'pred_probs', 'evaluate_model' :False} 107 | 108 | #Compute prediction probabilites of the model to compute f1 and perfection rate 109 | train_data = evaluate_and_predict(train_model, is_train_data = True, **pred_params) 110 | test_data = evaluate_and_predict(eval_model, is_train_data = False, **pred_params) 111 | 112 | train_predictions.append(train_data) 113 | test_predictions.append(test_data) 114 | 115 | #Compute and save the metrics 116 | store_results(train_predictions, test_predictions, func_names, args.model_dir) 117 | 118 | logging.info('Successfully Completed!!!!!') 119 | 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /model/ae_model_fn.py: 
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def build_ae_model(is_training, inputs, params): 4 | '''Build forward model and reconstruct the spectra 5 | 6 | Args: 7 | is_training: (tf.placeholder) indicates training or evaluation 8 | inputs: (dict) contains tensors of inputs fed to the graph 9 | params: (dict) hyperparameters of the model 10 | 11 | Returns: 12 | emb_layer: (tf.tensor) Embeddings computed by the encoder 13 | output: (tf.tensor) Reconstructed spectra 14 | 15 | ''' 16 | 17 | #Read all hyperparameters 18 | num_ae_layers = params['num_fc_layers'] 19 | ae_hidden_units = params['fc_hidden_units'] 20 | activation = params['activation'] 21 | is_denoising = params.get('is_denoising', False) 22 | denoise_prob = params.get('denoise_inputs', 0.05) 23 | hidden_layer = inputs 24 | 25 | # Randomly flip inputs to 0 with the probability of denoise_prob 26 | if is_denoising: 27 | input_shape = tf.shape(inputs) 28 | hidden_layer *= tf.where(tf.random_uniform(input_shape) > denoise_prob, tf.ones(input_shape)\ 29 | , tf.zeros(input_shape)) 30 | 31 | #Construct hidden layers of the encoder 32 | for layer in range(num_ae_layers): 33 | with tf.variable_scope('enc_{}'.format(layer+1)): 34 | hidden_layer = tf.layers.dense(hidden_layer, ae_hidden_units[layer], eval(activation)) 35 | # batch_norm_layer = tf.layers.batch_normalization(hidden_layer, training = is_training) 36 | # activation_layer = eval(activation)(batch_norm_layer) 37 | # dropout_layer = tf.layers.dropout(activation_layer, rate = dropout_probs[layer],training = is_training) 38 | 39 | 40 | 41 | 42 | emb_layer = hidden_layer 43 | 44 | #Construct hidden layers of the decoder 45 | for layer in range(num_ae_layers-2, -1, -1): 46 | with tf.variable_scope('dec_{}'.format(layer+1)): 47 | hidden_layer = tf.layers.dense(hidden_layer, ae_hidden_units[layer], eval(activation)) 48 | # batch_norm_layer = tf.layers.batch_normalization(hidden_layer, training = is_training) 49 | # activation_layer = eval(activation)(batch_norm_layer) 50 | # dropout_layer = tf.layers.dropout(activation_layer, rate = dropout_probs[layer],training = is_training) 51 | 52 | #Compute reconstructed spectra (use sigmoid as activation to get [0,1] range like input) 53 | with tf.variable_scope('dec_{}'.format(layer+1)): 54 | output = tf.layers.dense(hidden_layer, inputs.shape[-1], 'sigmoid') 55 | 56 | return emb_layer, output 57 | 58 | 59 | def ae_model_fn(is_training, inputs, params): 60 | 61 | '''Define graph operations for training and evaluating 62 | 63 | Args: 64 | is_training: (bool) indicates training or evaluation 65 | inputs: (dict) contains tensors of inputs and labels fed to the graph 66 | params: (dict) hyperparameters of the model 67 | 68 | Returns: 69 | model_spec: (dict) Contains the operations needed for training and evaluating the model 70 | 71 | ''' 72 | 73 | 74 | spectra_data = inputs['spectra_data'] 75 | is_train_ph = tf.placeholder_with_default(is_training, shape=()) #Define a placeholder for setting mode during evaluation 76 | 77 | #Compute embeddings and reconstructed data 78 | with tf.variable_scope('model', reuse = not is_training): 79 | embeddings, spectra_recon = build_ae_model(is_train_ph, spectra_data, params) 80 | 81 | #Mean squared loss between input and reconstructed spectra 82 | loss = tf.losses.mean_squared_error(spectra_data, spectra_recon) 83 | 84 | 85 | 86 | 87 | if is_training: 88 | optimizer = tf.train.AdamOptimizer(params['learning_rate']) 89 | global_step = 
tf.train.get_or_create_global_step() 90 | 91 | #Perform update_op to update moving mean and variance before minimizing the loss 92 | update_op = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 93 | with tf.control_dependencies(update_op): 94 | train_op = optimizer.minimize(loss, global_step = global_step) 95 | 96 | 97 | with tf.variable_scope('metrics'): 98 | metrics = {'loss' : tf.metrics.mean(loss)} 99 | 100 | 101 | 102 | #Group all metrics update ops 103 | metrics_update_op = tf.group(*[metric[1] for _, metric in metrics.items()]) 104 | 105 | #Collect all metrics variables to initialize before every epoch 106 | metrics_variables = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="metrics") 107 | metrics_initializer_op = tf.variables_initializer(metrics_variables) 108 | 109 | tf.summary.scalar('loss', loss) 110 | 111 | model_spec = inputs 112 | model_spec['loss'] = loss 113 | model_spec['embeddings'] = embeddings 114 | model_spec['metrics'] = metrics 115 | model_spec['metric_initializer_op'] = metrics_initializer_op 116 | model_spec['metrics_update_op'] = metrics_update_op 117 | model_spec['summary_op'] = tf.summary.merge_all() 118 | model_spec['variables_init_op'] = tf.global_variables_initializer() 119 | model_spec['train_ph'] = is_train_ph 120 | 121 | 122 | if is_training: 123 | model_spec['train_op'] = train_op 124 | 125 | return model_spec -------------------------------------------------------------------------------- /model/evaluate_fn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from tqdm import trange 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | def evaluate_sess(sess, model_spec, num_steps, writer, feed_dict = {}): 9 | '''Evaluate the model on entire data 10 | 11 | Args: 12 | sess: (tf.Session) indicates current session 13 | model_spec: (dict) contains all graph operations for evaluating the model 14 | num_steps: (int) Number of batches 15 | writer: (tf.summary.FileWriter) writer for storing summaries, can be None 16 | feed_dict: (dict) containing mode during evaluation 17 | 18 | Returns: 19 | eval_metrics_values: (string) contains evaluation metrics of data 20 | 21 | ''' 22 | 23 | #Collect all operations for evaluation 24 | metrics_update_op = model_spec['metrics_update_op'] 25 | metrics = model_spec['metrics'] 26 | global_step = tf.train.get_global_step() 27 | 28 | #Initiliaze the dataset iterator and metrics local variables 29 | sess.run(model_spec['iterator_initializer']) 30 | sess.run(model_spec['metric_initializer_op']) 31 | 32 | progress_bar = trange(num_steps, position = 0, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') 33 | for _ in progress_bar: 34 | _ = sess.run(metrics_update_op, feed_dict = feed_dict) 35 | 36 | #Compute and log evaluation metrics 37 | eval_metrics_values = sess.run({key : val[0] for key, val in metrics.items()}) 38 | eval_metrics_string = ' '.join(['{} : {:.6f}'.format(key, val) for key, val in eval_metrics_values.items()]) 39 | logging.info("- Eval metrics: "+ eval_metrics_string) 40 | 41 | #Add evaluation summaries to the writer 42 | if writer is not None: 43 | global_step_val = sess.run(global_step) 44 | for key, val in eval_metrics_values.items(): 45 | summary = tf.Summary(value = [tf.Summary.Value(tag = key, simple_value = val)]) 46 | writer.add_summary(summary, global_step = global_step_val) 47 | return eval_metrics_values 48 | 49 | 50 | def predictions_sess(sess, model_spec, size, params, layer_name = 'pred_probs', feed_dict = {}): 51 | '''Compute 
predictions of a model layer in model specification 52 | 53 | Args: 54 | sess: (tf.Session) indicates current session 55 | model_spec: (dict) contains graph operations for making prediction 56 | size: (int) dataset size 57 | params: (dict) hyperparameters of the model 58 | feed_dict: (dict) containing mode during evaluation 59 | 60 | 61 | Returns: 62 | data: (tuple) containing arrays of prediction and target 63 | 64 | ''' 65 | 66 | #Compute dimension size to create data arrays 67 | target_dim = model_spec['target'].shape[-1] 68 | pred_dim = model_spec[layer_name].shape[-1] 69 | 70 | #Initialize target and predictions array 71 | target_arr = np.zeros((size, target_dim)) 72 | pred_arr = np.zeros((size, pred_dim)) 73 | 74 | #Initiliaze the dataset iterator 75 | sess.run(model_spec['iterator_initializer']) 76 | 77 | 78 | batch_size = params['batch_size'] 79 | #Compute number of batches 80 | num_steps = (size + batch_size- 1)//batch_size 81 | 82 | #Compute batch wise target and predictions. Add it to data array. 83 | progress_bar = trange(num_steps, position = 0, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') 84 | for step in progress_bar: 85 | pred_batch, target_batch = sess.run([model_spec[layer_name], model_spec['target']], feed_dict = feed_dict) 86 | target_arr[step*batch_size: (step+1)* batch_size] = target_batch 87 | pred_arr[step*batch_size: (step+1)* batch_size] = pred_batch 88 | 89 | return pred_arr, target_arr 90 | 91 | def evaluate_and_predict(model_spec, layer_name, is_train_data,\ 92 | params, restore_path, evaluate_model = True): 93 | '''Evaluate the model and make predictions after restoring the weights 94 | 95 | Args: 96 | model_spec: (dict) contains all graph operations for evaluating the model 97 | layer_name: (string) name of the layer to compute model predictions 98 | is_train_data: (bool) whether dataset is train data or val data 99 | params: (dict) hyperparameters of the model 100 | restore_path: (string) directory path to restore weights from 101 | evaluate_model: (bool) whether or not to evaluate the model 102 | 103 | Returns: 104 | data: (tuple) containing arrays of prediction and target 105 | ''' 106 | 107 | saver = tf.train.Saver() 108 | 109 | with tf.Session() as sess: 110 | #Restore weights from model_dir/restore_weights 111 | restore_dir = os.path.join(restore_path) 112 | logging.info('Restoring weights from {}'.format(restore_dir)) 113 | latest_ckpt = tf.train.latest_checkpoint(restore_dir) 114 | saver.restore(sess, latest_ckpt) 115 | 116 | size = params['train_size'] if is_train_data else params['eval_size'] 117 | num_steps = (size + params['batch_size'] - 1)//params['batch_size'] 118 | 119 | is_train_ph = model_spec['train_ph'] 120 | feed_dict = {is_train_ph: False} 121 | 122 | if evaluate_model: 123 | _ = evaluate_sess(sess, model_spec, num_steps, None, feed_dict) 124 | 125 | 126 | data = predictions_sess(sess, model_spec, size, params, layer_name, feed_dict) 127 | return data 128 | 129 | -------------------------------------------------------------------------------- /prepare_load_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import argparse 4 | import sys 5 | 6 | import pandas as pd 7 | import numpy as np 8 | from jcamp import jcamp_read 9 | import matplotlib.pyplot as plt 10 | 11 | from rdkit import Chem, RDLogger 12 | lg = RDLogger.logger() 13 | lg.setLevel(RDLogger.CRITICAL) 14 | 15 | from model.utils import set_logger 16 | 17 | 18 | # Initialize all constants necessary for 
standardizing the spectra 19 | min_ir = 399 20 | max_ir = 4001 21 | step_ir = 3.25 22 | 23 | min_mass = 1 24 | max_mass = 650 25 | step_mass = 1 26 | 27 | eps = 1e-4 28 | 29 | # Create dictionary of functional group names and their corresponding smarts string 30 | func_grp_smarts = {'alkane':'[CX4;H0,H1,H2,H4]','methyl':'[CH3]','alkene':'[CX3]=[CX3]','alkyne':'[CX2]#C', 31 | 'alcohols':'[#6][OX2H]','amines':'[NX3;H2,H1;!$(NC=O)]', 'nitriles':'[NX1]#[CX2]', 32 | 'aromatics':'[$([cX3](:*):*),$([cX2+](:*):*)]','alkyl halides':'[#6][F,Cl,Br,I]', 33 | 'esters':'[#6][CX3](=O)[OX2H0][#6]', 'ketones':'[#6][CX3](=O)[#6]','aldehydes':'[CX3H1](=O)[#6]', 34 | 'carboxylic acids':'[CX3](=O)[OX2H1]', 'ether': '[OD2]([#6])[#6]','acyl halides':'[CX3](=[OX1])[F,Cl,Br,I]', 35 | 'amides':'[NX3][CX3](=[OX1])[#6]','nitro':'[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]'} 36 | 37 | 38 | 39 | 40 | def JCAMP_reader(filename): 41 | '''Overload function in jcamp to use latin-1 encoding instead of utf-8 42 | 43 | Args: 44 | filename: (string) jdx file containing spectra 45 | 46 | Returns: 47 | data: (dict) contains parsed information from file 48 | ''' 49 | with open(filename, 'r', encoding = 'latin-1') as filehandle: 50 | data = jcamp_read(filehandle) 51 | data['filename'] = filename 52 | return data 53 | 54 | 55 | def check_spectra_prop(mol_dict): 56 | '''Check if IR spectra satisfies certain conditions to be included in dataset 57 | 58 | Args: 59 | mol_dict: (dict) contains information about the spectra data 60 | 61 | Returns: 62 | _: (bool) whether spectra meets conditions 63 | ''' 64 | cond1 = mol_dict.get('state', r'N\A').lower() == 'gas' 65 | cond2 = mol_dict.get('xunits', r'N\A').lower() != 'micrometers' 66 | cond3 = mol_dict.get('yunits', r'N\A').lower() == 'absorbance' 67 | 68 | return all((cond1, cond2, cond3)) 69 | 70 | def add_spectra_to_df(spectra_df, file_path, bins, is_mass = False): 71 | '''Add a spectra from filepath to the dataframe after standardizing 72 | 73 | Args: 74 | spectra_df: (pd.DataFrame) contains standardized spectra 75 | file_path: (string) path containing jdx file 76 | bins: (np.array) used for standardizing 77 | is_mass: (bool) whether data being parsed is Mass or IR 78 | 79 | Returns: 80 | spectra_df: (pd.DataFrame) contains new spectrum aded to dataframe 81 | ''' 82 | 83 | mol_dict = JCAMP_reader(file_path) 84 | 85 | #if conditions are not met, don't add the data 86 | if not is_mass and not check_spectra_prop(mol_dict): 87 | return spectra_df 88 | 89 | #Standardize the new spectrum and prepare for merging 90 | mol_id = mol_dict['cas registry no'].replace('-','') 91 | mol_xvalues = mol_dict['x'] 92 | mol_yvalues = mol_dict['y'] 93 | mol_df = pd.DataFrame(data = {mol_id : mol_yvalues}, index = mol_xvalues) 94 | mol_df.index = pd.cut(mol_df.index, bins = bins) 95 | mol_df = mol_df.groupby(level=0).agg('mean') 96 | 97 | logging.info('Adding spectra with id {} to dataframe'.format(mol_id)) 98 | if spectra_df is None: 99 | spectra_df = mol_df 100 | else: 101 | spectra_df = pd.merge(spectra_df, mol_df, left_index = True, right_index = True, how='outer') 102 | 103 | return spectra_df 104 | 105 | def save_spectra_to_csv(root, files, save_path, bins, is_mass = False): 106 | '''Save the spectra dataframe as csv to path 107 | 108 | Args: 109 | root: (string) path to spectra data 110 | files: (list) jdx files present in root 111 | save_path: (string) path to store csv file 112 | bins: (np.array) used for standardizing 113 | is_mass: (bool) whether data being parsed is Mass or IR 114 | 115 | Returns: 
116 | None 117 | ''' 118 | 119 | spectra_df = None 120 | for file_name in files: 121 | file_path = os.path.join(root,file_name) 122 | spectra_df = add_spectra_to_df(spectra_df, file_path\ 123 | ,bins, is_mass) 124 | logging.info('Creating dataset in {}'.format(save_path)) 125 | spectra_df.to_csv(save_path) 126 | 127 | 128 | def identify_functional_groups(inchi): 129 | '''Identify the presence of functional groups present in molecule 130 | denoted by inchi 131 | 132 | Args: 133 | root: (string) path to spectra data 134 | files: (list) jdx files present in root 135 | save_path: (string) path to store csv file 136 | bins: (np.array) used for standardizing 137 | is_mass: (bool) whether data being parsed is Mass or IR 138 | 139 | Returns: 140 | mol_func_groups: (list) contains binary values of functional groups presence 141 | None if inchi to molecule conversion returns warning or error 142 | ''' 143 | 144 | try: 145 | #Convert inchi to molecule 146 | mol = Chem.MolFromInchi(inchi, treatWarningAsError=True) 147 | mol_func_grps = [] 148 | 149 | #populate the list with binary values 150 | for _, func_struct in func_grp_structs.items(): 151 | struct_matches = mol.GetSubstructMatches(func_struct) 152 | contains_func_grp = int(len(struct_matches)>0) 153 | mol_func_grps.append(contains_func_grp) 154 | return mol_func_grps 155 | except: 156 | 157 | return None 158 | 159 | def save_target_to_csv(cas_inchi_df, save_path): 160 | '''Save the target dataframe as csv to path 161 | 162 | Args: 163 | cas_inchi_df: (pd.DataFrame) contains CAS and Inchi of molecules 164 | save_path: (string) path to store csv file 165 | 166 | Returns: 167 | None 168 | ''' 169 | column_names = list(func_grp_structs.keys()) 170 | target_df = pd.DataFrame(index = cas_inchi_df.index, columns = column_names) 171 | 172 | #Iterate the rows, don't use df.apply since a list is being returned. 
173 | for ind, (_, row) in enumerate(cas_inchi_df.iterrows()): 174 | target_df.iloc[ind, :] = identify_functional_groups(row['inchi']) 175 | 176 | 177 | target_df.dropna(inplace = True) 178 | target_df.to_csv(save_path) 179 | 180 | def preprocess_spectra_df(spectra_df, is_mass = False, **kwargs): 181 | '''Preprocess the spectra dataframe by normalizing and interpolating 182 | 183 | Args: 184 | spectra_df: (pd.DataFrame) contains standardized spectra 185 | is_mass: (bool) whether data being parsed is Mass or IR 186 | kwargs: (dict) containing methods for interpolation 187 | 188 | Returns: 189 | spectra_df: (pd.DataFrame) contains processed spectra 190 | ''' 191 | if is_mass: 192 | 193 | #Fill NaN with zero and remove m/z ratio where all values are zero 194 | spectra_df.fillna(0, inplace = True) 195 | spectra_df = spectra_df.loc[:,spectra_df.sum(axis=0)!=0] 196 | 197 | else: 198 | 199 | #Interpolate with linear or spline based on kwargs 200 | spectra_df.reset_index(inplace = True) 201 | spectra_df.iloc[:, 1:] = spectra_df.iloc[:,1:].interpolate(**kwargs,\ 202 | limit_direction='both', axis = 0) 203 | spectra_df.set_index('index', inplace = True) 204 | 205 | #Normalize each spectra 206 | return spectra_df.div(spectra_df.max(axis=0), axis=1) 207 | 208 | 209 | 210 | def load_dataset(data_dir, include_mass = True, **params): 211 | '''Load the spectra and target dataset for training 212 | 213 | Args: 214 | data_dir: (string) contains data path for csv file 215 | include_mass: (bool) whether to include mass spectra while training 216 | params: (dict) containing methods for interpolation 217 | 218 | Returns: 219 | X: (np.array) contains processed spectra values 220 | y: (np.array) contains target values of corresponding spectra 221 | ''' 222 | 223 | #load and prepare IR data 224 | ir_path = os.path.join(data_dir, 'ir.csv') 225 | logging.info('Loading IR data from {}'.format(ir_path)) 226 | ir_df = pd.read_csv(ir_path, index_col = 0) 227 | ir_df = preprocess_spectra_df(ir_df, is_mass = False, **params).T 228 | 229 | spectra_df = ir_df 230 | 231 | if include_mass: 232 | 233 | #Load and prepare mass data 234 | mass_path = os.path.join(data_dir, 'mass.csv') 235 | logging.info('Loading mass data from {}'.format(mass_path)) 236 | mass_df = pd.read_csv(mass_path, index_col = 0).T 237 | mass_df = mass_df.loc[mass_df.index.isin(ir_df.index)] 238 | mass_df = preprocess_spectra_df(mass_df, is_mass = True) 239 | 240 | # mass_df = mass_df.reindex(ir_df.index) 241 | # spectra_df = pd.concat([spectra_df, mass_df], axis = 1) 242 | # spectra_df.dropna(inplace = True) 243 | 244 | #Merge mass data with IR 245 | spectra_df = pd.merge(spectra_df, mass_df, left_index = True, right_index = True, how = 'inner') 246 | 247 | #Prepare target data and rearrange to match the spectra 248 | spectra_df.index = spectra_df.index.astype('int') 249 | target_path = os.path.join(data_dir, 'target.csv') 250 | logging.info('Loading target data from {}'.format(target_path)) 251 | target_df = pd.read_csv(target_path, index_col = 0, dtype = np.float64) 252 | 253 | fn_groups = target_df.shape[1] 254 | total_df = pd.merge(spectra_df, target_df, left_index = True, right_index = True, how = 'inner') 255 | 256 | return total_df.values[:, :-fn_groups], total_df.values[:, -fn_groups:], list(func_grp_smarts.keys()) 257 | 258 | 259 | if __name__ == '__main__': 260 | #Parsing the data from jdx and storing it in csv 261 | 262 | parser = argparse.ArgumentParser() 263 | parser.add_argument('--data_dir', default= './data',\ 264 | help = "Directory path 
containing scrapped data") 265 | parser.add_argument('--cas_list', default= 'species.txt',\ 266 | help = "File containing CAS number and smiles of molecules") 267 | 268 | args = parser.parse_args() 269 | 270 | data_dir = args.data_dir 271 | set_logger(data_dir, 'prepare_data.log') 272 | 273 | 274 | # Create bins for IR and mass spectra 275 | logging.info('Creating bins for standardizing the spectra') 276 | ir_bins = np.arange(min_ir - eps, max_ir + eps, step_ir) 277 | mass_bins = np.arange(min_mass - eps, max_mass + eps, step_mass) 278 | 279 | # Compute structures of different molecular groups 280 | logging.info('Computing the structures of functional groups') 281 | func_grp_structs = {func_name : Chem.MolFromSmarts(func_smarts)\ 282 | for func_name, func_smarts in func_grp_smarts.items()} 283 | 284 | # Create and save csv files of spectra 285 | for root, dirs, files in os.walk(data_dir): 286 | if root == os.path.join(data_dir, 'ir'): 287 | logging.info('Starting to parse IR jdx files') 288 | ir_path = os.path.join(data_dir, 'ir.csv') 289 | save_spectra_to_csv(root, files, ir_path, ir_bins, False) 290 | 291 | if root == os.path.join(data_dir, 'mass'): 292 | logging.info('Starting to parse mass jdx files') 293 | mass_path = os.path.join(data_dir, 'mass.csv') 294 | save_spectra_to_csv(root, files, mass_path, mass_bins, True) 295 | 296 | #Load CAS data and merge with inchi 297 | logging.info('Loading CAS file from {}'.format(args.cas_list)) 298 | cas_df = pd.read_csv(args.cas_list, sep='\t', header = 0, usecols = [1,2], names = ['formula','cas']) 299 | cas_df.dropna(subset=['cas'], inplace=True) 300 | cas_df.cas = cas_df.cas.str.replace('-', '') 301 | cas_df.set_index('cas', inplace = True) 302 | 303 | 304 | inchi_path = os.path.join(data_dir, 'inchi.txt') 305 | logging.info('Loading inchi file from {}'.format(inchi_path)) 306 | inchi_df = pd.read_csv(inchi_path, sep='\t', header = 0, usecols = [0,1],\ 307 | names = ['cas','inchi'], dtype = str) 308 | inchi_df.dropna(inplace = True) 309 | inchi_df.set_index('cas', inplace = True) 310 | 311 | # Create and save csv of target 312 | cas_inchi_df = pd.merge(cas_df, inchi_df, left_index = True, right_index = True, how = 'inner') 313 | target_path = os.path.join(data_dir, 'target.csv') 314 | logging.info('Creating target csv dataset in {}'.format(target_path)) 315 | save_target_to_csv(cas_inchi_df, target_path) --------------------------------------------------------------------------------