├── model ├── __init__.py ├── input_fn.py ├── utils.py ├── mlp_model_fn.py ├── train_fn.py ├── ae_model_fn.py └── evaluate_fn.py ├── .gitignore ├── experiments ├── mlp_model │ └── params.json └── ae_mlp_model │ └── params.json ├── README.md ├── synthesize_results.py ├── scrap.py ├── train.py └── prepare_load_dataset.py /model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/cv_* 2 | **/*.log 3 | **/.pyc 4 | *.ipynb* 5 | __pycache__/ 6 | species.txt 7 | *data/ -------------------------------------------------------------------------------- /experiments/mlp_model/params.json: -------------------------------------------------------------------------------- 1 | {"mlp": 2 | {"num_fc_layers":2, 3 | "fc_hidden_units": [200, 150], 4 | "activation": "tf.nn.relu", 5 | "dropout_probs": [0.45, 0.15], 6 | "batch_size": 64, 7 | "learning_rate" : 0.00015, 8 | "num_epochs": 100, 9 | "threshold" : 0.5, 10 | "best_model_metric" : "loss", 11 | "save_frequency": 5}, 12 | "preprocess": 13 | {"method" : "linear", 14 | "order" : 1 15 | }, 16 | "n_splits" : 5, 17 | "train_ae" : 0 18 | } 19 | -------------------------------------------------------------------------------- /experiments/ae_mlp_model/params.json: -------------------------------------------------------------------------------- 1 | {"mlp": 2 | {"num_fc_layers":2, 3 | "fc_hidden_units": [200, 150], 4 | "activation": "tf.nn.relu", 5 | "dropout_probs": [0.45, 0.15], 6 | "batch_size": 64, 7 | "learning_rate" : 0.00015, 8 | "num_epochs": 100, 9 | "threshold" : 0.5, 10 | "best_model_metric" : "loss", 11 | "save_frequency": 5}, 12 | "ae": 13 | {"num_fc_layers":1, 14 | "fc_hidden_units": [256], 15 | "activation": "tf.nn.relu", 16 | "is_denoising" : 0, 17 | "denoise_inputs" : 0.05, 18 | "batch_size": 64, 19 | "learning_rate" : 0.001, 20 | "num_epochs": 100, 21 | "best_model_metric" : "loss", 22 | "save_frequency": 5}, 23 | "preprocess": 24 | {"method" : "linear", 25 | "order" : 1 26 | }, 27 | "n_splits" : 5, 28 | "train_ae" : 1 29 | } 30 | -------------------------------------------------------------------------------- /model/input_fn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def input_fn(is_training, data, params): 5 | ''' Input function for spectra dataset 6 | 7 | Args: 8 | is_training: (bool) Whether it is training or not 9 | data: (tuple) containing (spectra, target) arrays. 
10 |         params: (dict) Hyperparameters of the model
11 | 
12 |     Returns:
13 |         inputs: (dict) Contains the iterator and data to be fed to the model
14 | 
15 |     '''
16 | 
17 |     #Shuffle training dataset (buffer covers all samples; data is a (spectra, target) tuple)
18 |     if is_training:
19 |         dataset = tf.data.Dataset.from_tensor_slices(data)\
20 |                     .shuffle(data[0].shape[0])\
21 |                     .batch(params['batch_size'])\
22 |                     .prefetch(1)
23 |     else:
24 |         dataset = tf.data.Dataset.from_tensor_slices(data)\
25 |                     .batch(params['batch_size'])\
26 |                     .prefetch(1)
27 | 
28 |     #Create initializable iterator to re-feed data after every epoch
29 |     iterator = dataset.make_initializable_iterator()
30 |     spectra_data, target = iterator.get_next()
31 | 
32 |     iterator_initializer_op = iterator.initializer
33 | 
34 |     inputs = {'spectra_data' : spectra_data, 'target' : target, 'iterator_initializer': iterator_initializer_op}
35 | 
36 |     return inputs
37 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CANDIY-spectrum
2 | 
3 | Human analysis of chemical spectra such as Mass Spectra (MS), Infra-Red Spectra (FTIR), and Nuclear Magnetic Resonance (NMR) is both time-consuming and potentially inaccurate. This project aims to develop a set of methodologies that combine these spectra to predict chemical functional groups and structures.
4 | 
5 | This project is a stub, but we hope that it will spur development of machine learning methods for the analysis of chemical spectra.
6 | 
7 | 
8 | ## Required Packages
9 | 1) Numpy==1.18.5
10 | 2) Rdkit==2020.03.1
11 | 3) Pandas==1.0.5
12 | 4) Jcamp
13 | 5) Tensorflow==1.15.0
14 | 6) Matplotlib==3.1.1
15 | 7) Scikit-learn==0.22.1
16 | 
17 | ## Scraping
18 | ### Manual Scraping
19 | IR and MS spectra were downloaded from the NIST WebBook: https://webbook.nist.gov/chemistry/.
20 | Scraping can be done by inserting the correct CAS number into the placeholder URL https://webbook.nist.gov/cgi/cbook.cgi?ID="insert_cas"&Units=SI and downloading the required spectra.
21 | 
22 | (Or)
23 | ### Automatic Scraping
24 | Download all the species names available in NIST from https://webbook.nist.gov/chemistry/download/. Change the path passed to --cas_list to wherever the species file is stored.
25 | ```
26 | python scrap.py --save_dir='./data/' --cas_list='species.txt' --scrap_IR=true --scrap_MS=true --scrap_InChi=true
27 | ```
28 | 
29 | ## Prepare dataset
30 | Parse all jdx files of IR and Mass spectra, standardize them, and store them in csv format. Also parse inchi.txt to create a target csv indicating the presence of functional groups.
31 | 
32 | ```
33 | python prepare_load_dataset.py --data_dir='./data/' --cas_list='species.txt'
34 | ```
35 | 
36 | ## Train the model
37 | Run train.py to train the model. The model directory should contain a params.json file listing all hyperparameters used for building the model. Optionally, weights can be restored from a pretrained model.
38 | 
39 | ```
40 | python train.py --model_dir=./experiments/mlp_model --data_dir=./data/
41 | ```
42 | 
--------------------------------------------------------------------------------
/model/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | 
4 | from sklearn.model_selection import KFold
5 | 
6 | def set_logger(model_dir, log_name):
7 |     '''Set logger to write info to terminal and save in a file.
8 | 
9 |     Args:
10 |         model_dir: (string) path to store the log file
11 | 
12 |     Returns:
13 |         logger: (logging.Logger) logger writing to the terminal and to model_dir/log_name
14 |     '''
15 |     logger = logging.getLogger()
16 |     logger.setLevel(logging.INFO)
17 | 
18 |     #Don't create redundant handlers every time set_logger is called
19 |     if not logger.handlers:
20 | 
21 |         #File handler with debug level stored in model_dir/log_name
22 |         fh = logging.FileHandler(os.path.join(model_dir, log_name))
23 |         fh.setLevel(logging.DEBUG)
24 |         fh.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s'))
25 |         logger.addHandler(fh)
26 | 
27 |         #Stream handler with info level written to terminal
28 |         sh = logging.StreamHandler()
29 |         sh.setLevel(logging.INFO)
30 |         sh.setFormatter(logging.Formatter('%(message)s'))
31 |         logger.addHandler(sh)
32 | 
33 |     return logger
34 | 
35 | def train_test_generator(X, y, n_splits):
36 |     '''
37 |     Create a generator to return the next train and test data split when called
38 | 
39 |     Args:
40 |         X: (np.array) of dimension [num_samples x features]
41 |         y: (np.array) of dimension [num_samples x target groups]
42 |         n_splits: (int) Number of cross validation folds
43 | 
44 |     Returns:
45 |         (X_train, y_train): (tuple) of np.arrays containing single fold of train data
46 |         (X_test, y_test): (tuple) of np.arrays containing single fold of test data
47 |     '''
48 | 
49 |     kfold = KFold(n_splits=n_splits, shuffle=True, random_state=4)
50 | 
51 |     for train_index, val_index in kfold.split(X, y):
52 |         X_train, X_val = X[train_index], X[val_index]
53 |         y_train, y_val = y[train_index], y[val_index]
54 | 
55 |         yield (X_train, y_train), (X_val, y_val)
56 | 
57 | 
58 | 
--------------------------------------------------------------------------------
/synthesize_results.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import numpy as np
4 | import pandas as pd
5 | from sklearn.metrics import precision_recall_curve,f1_score, accuracy_score
6 | 
7 | 
8 | def compute_thresholds(data_ls):
9 |     '''Compute dynamic thresholds for every functional group using val data
10 | 
11 |     Args:
12 |         data_ls: (list) containing val predictions and target of all folds
13 | 
14 |     Returns:
15 |         thresholds: (np.array) containing the thresholds of groups
16 |     '''
17 |     logging.info('Computing Thresholds')
18 |     #Combine all val data into a single array
19 |     data_arr = np.concatenate(data_ls, axis = 1)
20 |     data_target = data_arr[1]
21 |     data_probs = data_arr[0]
22 | 
23 |     num_func_groups = data_target.shape[1]
24 |     thresholds = np.zeros((1, num_func_groups))
25 |     eps = 1e-7
26 | 
27 |     #Find threshold resulting in maximum f1 score for each functional group
28 |     for i in range(num_func_groups):
29 |         pre,rec,thre=precision_recall_curve(data_target[:,i],data_probs[:, i])
30 |         f1 = 2*pre*rec/(pre+rec+eps)
31 |         max_ind = np.argmax(f1)
32 |         thresholds[0,i] = thre[max_ind]
33 |     return thresholds
34 | 
35 | def compute_metrics(data_ls, thresholds, func_names):
36 |     '''Compute metrics for every fold of train and val data
37 | 
38 |     Args:
39 |         data_ls: (list) containing predictions and target of all folds
40 |         thresholds: (np.array) of every functional group
41 |         func_names: (list) names of the functional groups in the target
42 | 
43 |     Returns:
44 |         mol_score_df: (pd.DataFrame) contains mean and std of mol. scores
45 |         func_f1_df: (pd.DataFrame) contains mean and std of func.
f1 score 46 | ''' 47 | 48 | logging.info('Computing func_f1, mol_f1 and mol_perfection metrics') 49 | num_folds = len(data_ls) 50 | num_groups = thresholds.shape[1] 51 | func_f1= np.zeros((num_folds, num_groups)) 52 | mol_score = np.zeros((num_folds, 2)) 53 | 54 | #Using thresholds find func_f1, mol_f1 and mol_perfection for all folds 55 | for ind, data_fold in enumerate(data_ls): 56 | fold_target = data_fold[1] 57 | fold_preds = (data_fold[0]>thresholds).astype('int') 58 | func_f1[ind,:] = f1_score(fold_target, fold_preds, average = None) 59 | mol_score[ind,1] = f1_score(fold_target, fold_preds, average = 'samples') 60 | mol_score[ind,0] = accuracy_score(fold_target, fold_preds) 61 | 62 | overall_score = np.array([np.mean(mol_score, axis = 0), np.std(mol_score,axis = 0)]) 63 | overall_f1 = np.array([np.mean(func_f1, axis = 0), np.std(func_f1, axis = 0)]) 64 | 65 | print (overall_f1.shape, overall_score.shape) 66 | #Create a dataframe with the results 67 | func_f1_df = pd.DataFrame(overall_f1, columns = func_names, index=['mean', 'std']).T 68 | mol_score_df = pd.DataFrame(overall_score, columns = ['mol. perfection', 'mol. f1'], index=['mean', 'std']).T 69 | 70 | return mol_score_df, func_f1_df 71 | 72 | 73 | def store_results(train_predictions, test_predictions, func_group_names, save_path): 74 | '''Store results in a csv file 75 | 76 | Args: 77 | data_ls: (list) containing predictions and target of all folds 78 | thresholds: (np.array) of every functional group 79 | func_names: (list) used as part of target 80 | 81 | Returns: 82 | None 83 | ''' 84 | thresholds = compute_thresholds(test_predictions) 85 | train_score_df,train_f1_df = compute_metrics(train_predictions, thresholds, func_group_names) 86 | test_score_df,test_f1_df = compute_metrics(test_predictions, thresholds, func_group_names) 87 | 88 | f1_df = pd.concat([train_f1_df, test_f1_df], keys = ['Train', 'Val'], axis = 1) 89 | perf_df = pd.concat([train_score_df, test_score_df], keys = ['Train', 'Val'], axis = 1) 90 | 91 | logging.info('Storing results in {}'.format(save_path)) 92 | f1_df.to_csv(os.path.join(save_path, 'func_f1.csv')) 93 | perf_df.to_csv(os.path.join(save_path, 'mol_score.csv')) 94 | 95 | -------------------------------------------------------------------------------- /scrap.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests, urllib 3 | import argparse 4 | import logging 5 | import pandas as pd 6 | 7 | from model.utils import set_logger 8 | 9 | nist_url = "https://webbook.nist.gov/cgi/cbook.cgi" 10 | 11 | def scrap_data(cas_ls, params, data_dir): 12 | '''Collect data from NIST database and store them in jdx format. 13 | 14 | Args: 15 | cas_ls: (list) CAS ids to download data for 16 | params: (dict) queries to be added to url 17 | data_dir: (string) path to store the data 18 | 19 | Returns: 20 | None 21 | ''' 22 | 23 | #Create directory for the relevant spetra 24 | spectra_path = os.path.join(data_dir, params['Type'].lower(), '') 25 | if not os.path.exists(spectra_path): 26 | os.makedirs(spectra_path) 27 | 28 | num_created = 0 29 | for cas_id in cas_ls: 30 | params['JCAMP'] = 'C' + cas_id 31 | response = requests.get(nist_url, params=params) 32 | 33 | if response.text == '##TITLE=Spectrum not found.\n##END=\n': 34 | continue 35 | num_created+=1 36 | logging.info('Creating {} spectra for id: {}. 
Total spectra created {}'.format(params['Type'].lower(), cas_id, num_created)) 37 | with open(spectra_path +cas_id +'.jdx', 'wb') as data: 38 | data.write(response.content) 39 | 40 | def scrap_inchi(cas_ls, params, data_dir): 41 | '''Collect Inchi keys from NIST database and store them in txt format. 42 | 43 | Args: 44 | cas_ls: (list) CAS ids to download data for 45 | params: (dict) queries to be added to url 46 | data_dir: (string) path to store the data 47 | 48 | Returns: 49 | None 50 | ''' 51 | 52 | #Create file path for storing inchi keys 53 | inchi_path = os.path.join(data_dir, 'inchi.txt') 54 | num_created = 0 55 | with open(inchi_path,'a') as file: 56 | content = '{}\t{}\n'.format('cas_id', 'inchi') 57 | file.write(content) 58 | 59 | for cas_id in cas_ls: 60 | params['GetInChI'] = 'C' + cas_id 61 | response = requests.get(nist_url, params=params) 62 | 63 | num_created+=1 64 | logging.info('Creating InChi key for id: {}. Total keys created {}'.format(cas_id, num_created)) 65 | content = '{}\t{}\n'.format(cas_id,response.content.decode("utf-8")) 66 | file.write(content) 67 | 68 | 69 | 70 | 71 | 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument('--save_dir', default= './data',\ 74 | help = "Directory path to store scrapped data") 75 | parser.add_argument('--cas_list', default= 'species.txt',\ 76 | help = "File containing CAS number and formula of molecules") 77 | parser.add_argument('--scrap_IR', default= True,\ 78 | help = "Whether to download IR or not") 79 | parser.add_argument('--scrap_MS', default= True,\ 80 | help = "Whether to download MS or not") 81 | parser.add_argument('--scrap_InChi', default= True,\ 82 | help = "Whether to download InChi or not") 83 | 84 | args = parser.parse_args() 85 | 86 | #Check if file containing CAS ids exist 87 | assert os.path.isfile(args.cas_list),"No file named {} exists".format(args.cas_list) 88 | 89 | #Create data directory to store logs and spectra 90 | data_dir = args.save_dir 91 | if not os.path.exists(data_dir): 92 | os.makedirs(data_dir) 93 | 94 | set_logger(data_dir, 'scrap.log') 95 | 96 | #Obtain CAS ids used for downloading the content from NIST 97 | logging.info('Loading CAS file') 98 | cas_df = pd.read_csv(args.cas_list, sep='\t', names = ['name', 'formula', 'cas'], header = 0) 99 | cas_df.dropna(subset=['cas'], inplace=True) 100 | cas_df.cas = cas_df.cas.str.replace('-', '') 101 | 102 | cas_ids = list(cas_df.cas) 103 | 104 | 105 | 106 | 107 | logging.info('Scrap Mass spectra') 108 | if args.scrap_MS: 109 | params = params={'JCAMP': '', 'Index': 0, 'Type': 'Mass'} 110 | scrap_data(cas_ids, params, data_dir) 111 | 112 | logging.info('Scrap IR spectra') 113 | if args.scrap_IR: 114 | params={'JCAMP': '', 'Type': 'IR', 'Index': 0} 115 | scrap_data(cas_ids, params, data_dir) 116 | 117 | logging.info('Scrap InChi keys') 118 | if args.scrap_InChi: 119 | params={} 120 | scrap_inchi(cas_ids, params, data_dir) 121 | -------------------------------------------------------------------------------- /model/mlp_model_fn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def build_mlp_model(is_training, inputs, params): 4 | '''Build forward model and compute logits 5 | 6 | Args: 7 | is_training: (tf.placeholder) indicates training or evaluation 8 | inputs: (dict) contains tensors of inputs and labels fed to the graph 9 | params: (dict) hyperparameters of the model 10 | 11 | Returns: 12 | output: (tf.tensor) logits of the model 13 | 14 | ''' 15 | 16 | #Read all 
hyperparameters 17 | num_fc_layers = params['num_fc_layers'] 18 | fc_hidden_units = params['fc_hidden_units'] 19 | activation = params['activation'] 20 | dropout_probs = params['dropout_probs'] 21 | dropout_layer = inputs 22 | output_shape = params['output_shape'] 23 | 24 | #Construct hidden layers of the forward model 25 | for layer in range(num_fc_layers): 26 | with tf.variable_scope('fc_{}'.format(layer+1)): 27 | hidden_layer = tf.layers.dense(dropout_layer, fc_hidden_units[layer]) 28 | batch_norm_layer = tf.layers.batch_normalization(hidden_layer, training = is_training) 29 | activation_layer = eval(activation)(batch_norm_layer) 30 | dropout_layer = tf.layers.dropout(activation_layer, rate = dropout_probs[layer],training = is_training) 31 | 32 | 33 | 34 | #Compute output of the model 35 | with tf.variable_scope('output'): 36 | output = tf.layers.dense(dropout_layer, output_shape, None) 37 | 38 | return output 39 | 40 | 41 | def mlp_model_fn(is_training, inputs, params): 42 | 43 | '''Define graph operations for training and evaluating 44 | 45 | Args: 46 | is_training: (bool) indicates training or evaluation 47 | inputs: (dict) contains tensors of inputs and labels fed to the graph 48 | reuse: (bool) To or not to reuse the variables with same name 49 | params: (dict) hyperparameters of the model 50 | 51 | Returns: 52 | model_spec: (dict) Contains the operations needed for training and evaluating the model 53 | 54 | ''' 55 | target = inputs['target'] 56 | spectra_data = inputs['spectra_data'] 57 | is_train_ph = tf.placeholder_with_default(is_training, shape=()) #Define a placeholder for setting mode during evaluation 58 | params['output_shape'] = target.shape[1] 59 | num_functional_groups = tf.cast(target.shape[1], tf.float64) 60 | 61 | #Compute logits and make predictions 62 | with tf.variable_scope('model', reuse = not is_training): 63 | logits = build_mlp_model(is_train_ph, spectra_data, params) 64 | pred_probs = tf.sigmoid(logits) 65 | predictions = tf.cast(tf.greater_equal(pred_probs, params['threshold']), tf.float64) 66 | 67 | #Binary cross entropy loss computed across every dimension for multi label classification 68 | loss = tf.reduce_mean(tf.losses.sigmoid_cross_entropy(target, logits)) 69 | num_correct_predictions = tf.reduce_sum(tf.cast(tf.equal(target, predictions),tf.float64), axis = 1)/num_functional_groups 70 | accuracy = tf.reduce_mean(tf.cast(tf.equal(num_correct_predictions, 1.0), tf.float64)) 71 | 72 | 73 | 74 | if is_training: 75 | optimizer = tf.train.AdamOptimizer(params['learning_rate']) 76 | global_step = tf.train.get_or_create_global_step() 77 | 78 | #Perform update_op to update moving mean and variance before minimizing the loss 79 | update_op = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 80 | with tf.control_dependencies(update_op): 81 | train_op = optimizer.minimize(loss, global_step = global_step) 82 | 83 | 84 | with tf.variable_scope('metrics'): 85 | metrics = {'loss' : tf.metrics.mean(loss), 86 | 'accuracy' : tf.metrics.mean(accuracy)} 87 | 88 | 89 | 90 | #Group all metrics update ops 91 | metrics_update_op = tf.group(*[metric[1] for _, metric in metrics.items()]) 92 | 93 | #Collect all metrics variables to initialize before every epoch 94 | metrics_variables = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="metrics") 95 | metrics_initializer_op = tf.variables_initializer(metrics_variables) 96 | 97 | tf.summary.scalar('loss', loss) 98 | tf.summary.scalar('accuracy', accuracy) 99 | 100 | model_spec = inputs 101 | model_spec['loss'] = loss 102 | 
model_spec['pred_probs'] = pred_probs 103 | model_spec['metrics'] = metrics 104 | model_spec['metric_initializer_op'] = metrics_initializer_op 105 | model_spec['metrics_update_op'] = metrics_update_op 106 | model_spec['summary_op'] = tf.summary.merge_all() 107 | model_spec['variables_init_op'] = tf.global_variables_initializer() 108 | model_spec['train_ph'] = is_train_ph 109 | 110 | 111 | if is_training: 112 | model_spec['train_op'] = train_op 113 | 114 | return model_spec -------------------------------------------------------------------------------- /model/train_fn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from tqdm import trange 4 | import tensorflow as tf 5 | 6 | from .evaluate_fn import evaluate_sess 7 | 8 | def train_sess(sess, model_spec, num_steps, writer, params): 9 | '''Train the model on the data for one epoch 10 | 11 | Args: 12 | sess: (tf.Session) indicates current session 13 | model_spec: (dict) contains all graph operations for training the model 14 | params: (dict) hyperparameters of the model 15 | num_steps: (int) Number of batches 16 | writer: (tf.summary.FileWriter) writer for storing summaries 17 | 18 | Returns: 19 | None 20 | 21 | ''' 22 | 23 | #Collect all update ops and metrics 24 | metrics_update_op = model_spec['metrics_update_op'] 25 | train_op = model_spec['train_op'] 26 | summary_op = model_spec['summary_op'] 27 | loss = model_spec['loss'] 28 | metrics = model_spec['metrics'] 29 | global_step = tf.train.get_global_step() 30 | 31 | #Initialize the dataset iterator and metrics local variables 32 | sess.run(model_spec['iterator_initializer']) 33 | sess.run(model_spec['metric_initializer_op']) 34 | 35 | progress_bar = trange(num_steps, position = 0, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') 36 | for step in progress_bar: 37 | if step%params['save_frequency']==0: 38 | _,_,summary,loss_val,global_step_val=sess.run([train_op, metrics_update_op, summary_op, loss, global_step]) 39 | writer.add_summary(summary, global_step_val) 40 | else: 41 | _,_,loss_val=sess.run([train_op, metrics_update_op, loss]) 42 | progress_bar.set_postfix(loss=round(loss_val,6)) 43 | 44 | #Compute metrics over entire training data 45 | train_metrics_values = sess.run({key : val[0] for key, val in metrics.items()}) 46 | train_metrics_string = ' '.join(['{} : {:.6f}'.format(key, val) for key, val in train_metrics_values.items()]) 47 | logging.info("- Train metrics: "+ train_metrics_string) 48 | 49 | 50 | 51 | def train_and_save(train_model_spec, eval_model_spec, model_dir, params, restore_weights = None): 52 | '''Train the model and save the weights of last 5 epochs and the best epoch 53 | 54 | Args: 55 | train_model_spec: (dict) contains all graph operations for training the model 56 | eval_model_spec: (dict) contains all graph operations for evaluating the model 57 | model_dir: (string) directory path to store weights and summaries 58 | params: (dict) hyperparameters of the model 59 | restore_weights: (string) directory path to restore weights from 60 | 61 | Returns: 62 | None 63 | ''' 64 | #Initiliaze the saver 65 | last_saver = tf.train.Saver() 66 | best_saver = tf.train.Saver(max_to_keep=1) 67 | 68 | with tf.Session() as sess: 69 | begin_epoch = 0 70 | sess.run(train_model_spec['variables_init_op']) 71 | if restore_weights is not None: 72 | #Restore weights from model_dir/restore_weights 73 | restore_dir = os.path.join(model_dir, restore_weights) 74 | logging.info('Restoring weights from 
{}'.format(restore_dir)) 75 | latest_ckpt = tf.train.latest_checkpoint(restore_dir) 76 | begin_epoch = int(latest_ckpt.split('-')[-1]) 77 | last_saver.restore(sess, latest_ckpt) 78 | 79 | #Create summary writer for training and evaluation 80 | train_writer = tf.summary.FileWriter(os.path.join(model_dir, 'train_summary'), sess.graph) 81 | eval_writer = tf.summary.FileWriter(os.path.join(model_dir, 'eval_summary'), sess.graph) 82 | 83 | if params['best_model_metric'] == 'acc': 84 | eval_name = 'accuracy' 85 | eval_comp = '>' 86 | best_eval_val = 0 87 | 88 | else : 89 | eval_name = 'loss' 90 | eval_comp = '<' 91 | best_eval_val = 1e4 92 | 93 | 94 | for epoch in range(begin_epoch, begin_epoch + params['num_epochs']): 95 | logging.info('Epoch {}/{}'.format(epoch+1, begin_epoch + params['num_epochs'])) 96 | num_steps = (params['train_size'] + params['batch_size'] - 1)//params['batch_size'] 97 | train_sess(sess, train_model_spec, num_steps, train_writer, params) 98 | 99 | num_steps = (params['eval_size'] + params['batch_size'] - 1)//params['batch_size'] 100 | eval_metrics = evaluate_sess(sess, eval_model_spec, num_steps, eval_writer) 101 | 102 | last_save_path = os.path.join(model_dir, 'last_weights', 'epoch') 103 | last_saver.save(sess, last_save_path, global_step = epoch+1) 104 | 105 | #Update the weights with current best model 106 | if eval(str(eval_metrics[eval_name]) + eval_comp + str(best_eval_val)): 107 | best_eval_val = eval_metrics[eval_name] 108 | 109 | best_save_path = os.path.join(model_dir, 'best_weights', 'epoch') 110 | best_saver.save(sess, best_save_path, global_step = epoch+1) 111 | logging.info('- Found new best {}. Saving in {}'.format(eval_name, best_save_path)) -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 3 | import logging 4 | import argparse 5 | import json 6 | 7 | import tensorflow as tf 8 | tf.get_logger().setLevel('ERROR') 9 | 10 | from model.utils import set_logger, train_test_generator 11 | from model.input_fn import input_fn 12 | from model.ae_model_fn import ae_model_fn 13 | from model.mlp_model_fn import mlp_model_fn 14 | from model.train_fn import train_and_save 15 | from model.evaluate_fn import evaluate_and_predict 16 | from prepare_load_dataset import load_dataset 17 | from synthesize_results import store_results 18 | 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--model_dir', default= './experiments/ae_mlp_model',\ 21 | help = "Directory path containing params.json and to store summary and weights") 22 | parser.add_argument('--data_dir', default= './data',\ 23 | help = "Directory path containing IR and MS spectra data") 24 | parser.add_argument('--restore_ae_from', default= None,\ 25 | help = "Restore AE weights before training the model") 26 | parser.add_argument('--restore_mlp_from', default= None,\ 27 | help = "Restore MLP weights before training the model") 28 | 29 | args = parser.parse_args() 30 | 31 | #Model directory should contain params.json file listing all hyperparameters 32 | json_path = os.path.join(args.model_dir, 'params.json') 33 | assert os.path.isfile(json_path),"No params.json found at {} path".format(args.model_dir) 34 | 35 | with open(json_path) as json_data: 36 | params = json.load(json_data) 37 | 38 | set_logger(args.model_dir, 'train.log') 39 | 40 | logging.info('Load the dataset from {}'.format(args.data_dir)) 41 | X, y, func_names 
= load_dataset(args.data_dir, True, **params['preprocess']) 42 | 43 | 44 | #Train and test generator for every fold 45 | data_generator = train_test_generator(X, y, params['n_splits']) 46 | 47 | train_predictions = [] 48 | test_predictions = [] 49 | 50 | for cv, (train_data, test_data) in enumerate(data_generator): 51 | logging.info('Starting fold {}'.format(cv+1)) 52 | train_size = train_data[0].shape[0] 53 | eval_size = test_data[0].shape[0] 54 | 55 | if params['train_ae']: 56 | tf.reset_default_graph() 57 | logging.info('Training autoencoder to compute embeddings') 58 | 59 | ae_params = params['ae'] 60 | ae_params['train_size'] = train_size 61 | ae_params['eval_size'] = eval_size 62 | 63 | logging.info('Creating the inputs for the model') 64 | train_inputs = input_fn(True, train_data, ae_params) 65 | eval_inputs = input_fn(False, test_data, ae_params) 66 | 67 | logging.info('Building the model') 68 | train_model = ae_model_fn(True, train_inputs, ae_params) 69 | eval_model = ae_model_fn(False, eval_inputs, ae_params) 70 | 71 | 72 | logging.info('Start training {} epochs'.format(params['ae']['num_epochs'])) 73 | model_dir = os.path.join(args.model_dir, 'cv_' + str(cv+1), 'ae') 74 | train_and_save(train_model, eval_model, model_dir, ae_params, restore_weights = args.restore_ae_from) 75 | 76 | #Update spectra data with embeddings computed from the model 77 | logging.info('Compute embeddings of the spectra data') 78 | emb_params = {'restore_path' :os.path.join(model_dir,'best_weights'), 'params' :ae_params,\ 79 | 'layer_name' :'embeddings', 'evaluate_model' :False} 80 | 81 | train_data = evaluate_and_predict(train_model, is_train_data = True, **emb_params) 82 | test_data = evaluate_and_predict(eval_model, is_train_data = False, **emb_params) 83 | 84 | tf.reset_default_graph() 85 | logging.info('Training MLP model') 86 | 87 | mlp_params = params['mlp'] 88 | mlp_params['train_size'] = train_size 89 | mlp_params['eval_size'] = eval_size 90 | 91 | 92 | logging.info('Creating the inputs for the model') 93 | train_inputs = input_fn(True, train_data, mlp_params) 94 | eval_inputs = input_fn(False, test_data, mlp_params) 95 | 96 | logging.info('Building the model') 97 | train_model = mlp_model_fn(True, train_inputs, mlp_params) 98 | eval_model = mlp_model_fn(False, eval_inputs, mlp_params) 99 | 100 | logging.info('Start training {} epochs'.format(params['mlp']['num_epochs'])) 101 | model_dir = os.path.join(args.model_dir, 'cv_' + str(cv+1), 'mlp') 102 | train_and_save(train_model, eval_model, model_dir, mlp_params, restore_weights = args.restore_mlp_from) 103 | 104 | logging.info('Compute prediction probabilities of the spectra data') 105 | pred_params = {'restore_path' :os.path.join(model_dir,'best_weights'), 'params' :mlp_params,\ 106 | 'layer_name' :'pred_probs', 'evaluate_model' :False} 107 | 108 | #Compute prediction probabilites of the model to compute f1 and perfection rate 109 | train_data = evaluate_and_predict(train_model, is_train_data = True, **pred_params) 110 | test_data = evaluate_and_predict(eval_model, is_train_data = False, **pred_params) 111 | 112 | train_predictions.append(train_data) 113 | test_predictions.append(test_data) 114 | 115 | #Compute and save the metrics 116 | store_results(train_predictions, test_predictions, func_names, args.model_dir) 117 | 118 | logging.info('Successfully Completed!!!!!') 119 | 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /model/ae_model_fn.py: 
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def build_ae_model(is_training, inputs, params): 4 | '''Build forward model and reconstruct the spectra 5 | 6 | Args: 7 | is_training: (tf.placeholder) indicates training or evaluation 8 | inputs: (dict) contains tensors of inputs fed to the graph 9 | params: (dict) hyperparameters of the model 10 | 11 | Returns: 12 | emb_layer: (tf.tensor) Embeddings computed by the encoder 13 | output: (tf.tensor) Reconstructed spectra 14 | 15 | ''' 16 | 17 | #Read all hyperparameters 18 | num_ae_layers = params['num_fc_layers'] 19 | ae_hidden_units = params['fc_hidden_units'] 20 | activation = params['activation'] 21 | is_denoising = params.get('is_denoising', False) 22 | denoise_prob = params.get('denoise_inputs', 0.05) 23 | hidden_layer = inputs 24 | 25 | # Randomly flip inputs to 0 with the probability of denoise_prob 26 | if is_denoising: 27 | input_shape = tf.shape(inputs) 28 | hidden_layer *= tf.where(tf.random_uniform(input_shape) > denoise_prob, tf.ones(input_shape)\ 29 | , tf.zeros(input_shape)) 30 | 31 | #Construct hidden layers of the encoder 32 | for layer in range(num_ae_layers): 33 | with tf.variable_scope('enc_{}'.format(layer+1)): 34 | hidden_layer = tf.layers.dense(hidden_layer, ae_hidden_units[layer], eval(activation)) 35 | # batch_norm_layer = tf.layers.batch_normalization(hidden_layer, training = is_training) 36 | # activation_layer = eval(activation)(batch_norm_layer) 37 | # dropout_layer = tf.layers.dropout(activation_layer, rate = dropout_probs[layer],training = is_training) 38 | 39 | 40 | 41 | 42 | emb_layer = hidden_layer 43 | 44 | #Construct hidden layers of the decoder 45 | for layer in range(num_ae_layers-2, -1, -1): 46 | with tf.variable_scope('dec_{}'.format(layer+1)): 47 | hidden_layer = tf.layers.dense(hidden_layer, ae_hidden_units[layer], eval(activation)) 48 | # batch_norm_layer = tf.layers.batch_normalization(hidden_layer, training = is_training) 49 | # activation_layer = eval(activation)(batch_norm_layer) 50 | # dropout_layer = tf.layers.dropout(activation_layer, rate = dropout_probs[layer],training = is_training) 51 | 52 | #Compute reconstructed spectra (use sigmoid as activation to get [0,1] range like input) 53 | with tf.variable_scope('dec_{}'.format(layer+1)): 54 | output = tf.layers.dense(hidden_layer, inputs.shape[-1], 'sigmoid') 55 | 56 | return emb_layer, output 57 | 58 | 59 | def ae_model_fn(is_training, inputs, params): 60 | 61 | '''Define graph operations for training and evaluating 62 | 63 | Args: 64 | is_training: (bool) indicates training or evaluation 65 | inputs: (dict) contains tensors of inputs and labels fed to the graph 66 | params: (dict) hyperparameters of the model 67 | 68 | Returns: 69 | model_spec: (dict) Contains the operations needed for training and evaluating the model 70 | 71 | ''' 72 | 73 | 74 | spectra_data = inputs['spectra_data'] 75 | is_train_ph = tf.placeholder_with_default(is_training, shape=()) #Define a placeholder for setting mode during evaluation 76 | 77 | #Compute embeddings and reconstructed data 78 | with tf.variable_scope('model', reuse = not is_training): 79 | embeddings, spectra_recon = build_ae_model(is_train_ph, spectra_data, params) 80 | 81 | #Mean squared loss between input and reconstructed spectra 82 | loss = tf.losses.mean_squared_error(spectra_data, spectra_recon) 83 | 84 | 85 | 86 | 87 | if is_training: 88 | optimizer = tf.train.AdamOptimizer(params['learning_rate']) 89 | global_step = 
tf.train.get_or_create_global_step() 90 | 91 | #Perform update_op to update moving mean and variance before minimizing the loss 92 | update_op = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 93 | with tf.control_dependencies(update_op): 94 | train_op = optimizer.minimize(loss, global_step = global_step) 95 | 96 | 97 | with tf.variable_scope('metrics'): 98 | metrics = {'loss' : tf.metrics.mean(loss)} 99 | 100 | 101 | 102 | #Group all metrics update ops 103 | metrics_update_op = tf.group(*[metric[1] for _, metric in metrics.items()]) 104 | 105 | #Collect all metrics variables to initialize before every epoch 106 | metrics_variables = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="metrics") 107 | metrics_initializer_op = tf.variables_initializer(metrics_variables) 108 | 109 | tf.summary.scalar('loss', loss) 110 | 111 | model_spec = inputs 112 | model_spec['loss'] = loss 113 | model_spec['embeddings'] = embeddings 114 | model_spec['metrics'] = metrics 115 | model_spec['metric_initializer_op'] = metrics_initializer_op 116 | model_spec['metrics_update_op'] = metrics_update_op 117 | model_spec['summary_op'] = tf.summary.merge_all() 118 | model_spec['variables_init_op'] = tf.global_variables_initializer() 119 | model_spec['train_ph'] = is_train_ph 120 | 121 | 122 | if is_training: 123 | model_spec['train_op'] = train_op 124 | 125 | return model_spec -------------------------------------------------------------------------------- /model/evaluate_fn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from tqdm import trange 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | def evaluate_sess(sess, model_spec, num_steps, writer, feed_dict = {}): 9 | '''Evaluate the model on entire data 10 | 11 | Args: 12 | sess: (tf.Session) indicates current session 13 | model_spec: (dict) contains all graph operations for evaluating the model 14 | num_steps: (int) Number of batches 15 | writer: (tf.summary.FileWriter) writer for storing summaries, can be None 16 | feed_dict: (dict) containing mode during evaluation 17 | 18 | Returns: 19 | eval_metrics_values: (string) contains evaluation metrics of data 20 | 21 | ''' 22 | 23 | #Collect all operations for evaluation 24 | metrics_update_op = model_spec['metrics_update_op'] 25 | metrics = model_spec['metrics'] 26 | global_step = tf.train.get_global_step() 27 | 28 | #Initiliaze the dataset iterator and metrics local variables 29 | sess.run(model_spec['iterator_initializer']) 30 | sess.run(model_spec['metric_initializer_op']) 31 | 32 | progress_bar = trange(num_steps, position = 0, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') 33 | for _ in progress_bar: 34 | _ = sess.run(metrics_update_op, feed_dict = feed_dict) 35 | 36 | #Compute and log evaluation metrics 37 | eval_metrics_values = sess.run({key : val[0] for key, val in metrics.items()}) 38 | eval_metrics_string = ' '.join(['{} : {:.6f}'.format(key, val) for key, val in eval_metrics_values.items()]) 39 | logging.info("- Eval metrics: "+ eval_metrics_string) 40 | 41 | #Add evaluation summaries to the writer 42 | if writer is not None: 43 | global_step_val = sess.run(global_step) 44 | for key, val in eval_metrics_values.items(): 45 | summary = tf.Summary(value = [tf.Summary.Value(tag = key, simple_value = val)]) 46 | writer.add_summary(summary, global_step = global_step_val) 47 | return eval_metrics_values 48 | 49 | 50 | def predictions_sess(sess, model_spec, size, params, layer_name = 'pred_probs', feed_dict = {}): 51 | '''Compute 
predictions of a model layer in model specification 52 | 53 | Args: 54 | sess: (tf.Session) indicates current session 55 | model_spec: (dict) contains graph operations for making prediction 56 | size: (int) dataset size 57 | params: (dict) hyperparameters of the model 58 | feed_dict: (dict) containing mode during evaluation 59 | 60 | 61 | Returns: 62 | data: (tuple) containing arrays of prediction and target 63 | 64 | ''' 65 | 66 | #Compute dimension size to create data arrays 67 | target_dim = model_spec['target'].shape[-1] 68 | pred_dim = model_spec[layer_name].shape[-1] 69 | 70 | #Initialize target and predictions array 71 | target_arr = np.zeros((size, target_dim)) 72 | pred_arr = np.zeros((size, pred_dim)) 73 | 74 | #Initiliaze the dataset iterator 75 | sess.run(model_spec['iterator_initializer']) 76 | 77 | 78 | batch_size = params['batch_size'] 79 | #Compute number of batches 80 | num_steps = (size + batch_size- 1)//batch_size 81 | 82 | #Compute batch wise target and predictions. Add it to data array. 83 | progress_bar = trange(num_steps, position = 0, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') 84 | for step in progress_bar: 85 | pred_batch, target_batch = sess.run([model_spec[layer_name], model_spec['target']], feed_dict = feed_dict) 86 | target_arr[step*batch_size: (step+1)* batch_size] = target_batch 87 | pred_arr[step*batch_size: (step+1)* batch_size] = pred_batch 88 | 89 | return pred_arr, target_arr 90 | 91 | def evaluate_and_predict(model_spec, layer_name, is_train_data,\ 92 | params, restore_path, evaluate_model = True): 93 | '''Evaluate the model and make predictions after restoring the weights 94 | 95 | Args: 96 | model_spec: (dict) contains all graph operations for evaluating the model 97 | layer_name: (string) name of the layer to compute model predictions 98 | is_train_data: (bool) whether dataset is train data or val data 99 | params: (dict) hyperparameters of the model 100 | restore_path: (string) directory path to restore weights from 101 | evaluate_model: (bool) whether or not to evaluate the model 102 | 103 | Returns: 104 | data: (tuple) containing arrays of prediction and target 105 | ''' 106 | 107 | saver = tf.train.Saver() 108 | 109 | with tf.Session() as sess: 110 | #Restore weights from model_dir/restore_weights 111 | restore_dir = os.path.join(restore_path) 112 | logging.info('Restoring weights from {}'.format(restore_dir)) 113 | latest_ckpt = tf.train.latest_checkpoint(restore_dir) 114 | saver.restore(sess, latest_ckpt) 115 | 116 | size = params['train_size'] if is_train_data else params['eval_size'] 117 | num_steps = (size + params['batch_size'] - 1)//params['batch_size'] 118 | 119 | is_train_ph = model_spec['train_ph'] 120 | feed_dict = {is_train_ph: False} 121 | 122 | if evaluate_model: 123 | _ = evaluate_sess(sess, model_spec, num_steps, None, feed_dict) 124 | 125 | 126 | data = predictions_sess(sess, model_spec, size, params, layer_name, feed_dict) 127 | return data 128 | 129 | -------------------------------------------------------------------------------- /prepare_load_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import argparse 4 | import sys 5 | 6 | import pandas as pd 7 | import numpy as np 8 | from jcamp import jcamp_read 9 | import matplotlib.pyplot as plt 10 | 11 | from rdkit import Chem, RDLogger 12 | lg = RDLogger.logger() 13 | lg.setLevel(RDLogger.CRITICAL) 14 | 15 | from model.utils import set_logger 16 | 17 | 18 | # Initialize all constants necessary for 
standardizing the spectra 19 | min_ir = 399 20 | max_ir = 4001 21 | step_ir = 3.25 22 | 23 | min_mass = 1 24 | max_mass = 650 25 | step_mass = 1 26 | 27 | eps = 1e-4 28 | 29 | # Create dictionary of functional group names and their corresponding smarts string 30 | func_grp_smarts = {'alkane':'[CX4;H0,H1,H2,H4]','methyl':'[CH3]','alkene':'[CX3]=[CX3]','alkyne':'[CX2]#C', 31 | 'alcohols':'[#6][OX2H]','amines':'[NX3;H2,H1;!$(NC=O)]', 'nitriles':'[NX1]#[CX2]', 32 | 'aromatics':'[$([cX3](:*):*),$([cX2+](:*):*)]','alkyl halides':'[#6][F,Cl,Br,I]', 33 | 'esters':'[#6][CX3](=O)[OX2H0][#6]', 'ketones':'[#6][CX3](=O)[#6]','aldehydes':'[CX3H1](=O)[#6]', 34 | 'carboxylic acids':'[CX3](=O)[OX2H1]', 'ether': '[OD2]([#6])[#6]','acyl halides':'[CX3](=[OX1])[F,Cl,Br,I]', 35 | 'amides':'[NX3][CX3](=[OX1])[#6]','nitro':'[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]'} 36 | 37 | 38 | 39 | 40 | def JCAMP_reader(filename): 41 | '''Overload function in jcamp to use latin-1 encoding instead of utf-8 42 | 43 | Args: 44 | filename: (string) jdx file containing spectra 45 | 46 | Returns: 47 | data: (dict) contains parsed information from file 48 | ''' 49 | with open(filename, 'r', encoding = 'latin-1') as filehandle: 50 | data = jcamp_read(filehandle) 51 | data['filename'] = filename 52 | return data 53 | 54 | 55 | def check_spectra_prop(mol_dict): 56 | '''Check if IR spectra satisfies certain conditions to be included in dataset 57 | 58 | Args: 59 | mol_dict: (dict) contains information about the spectra data 60 | 61 | Returns: 62 | _: (bool) whether spectra meets conditions 63 | ''' 64 | cond1 = mol_dict.get('state', r'N\A').lower() == 'gas' 65 | cond2 = mol_dict.get('xunits', r'N\A').lower() != 'micrometers' 66 | cond3 = mol_dict.get('yunits', r'N\A').lower() == 'absorbance' 67 | 68 | return all((cond1, cond2, cond3)) 69 | 70 | def add_spectra_to_df(spectra_df, file_path, bins, is_mass = False): 71 | '''Add a spectra from filepath to the dataframe after standardizing 72 | 73 | Args: 74 | spectra_df: (pd.DataFrame) contains standardized spectra 75 | file_path: (string) path containing jdx file 76 | bins: (np.array) used for standardizing 77 | is_mass: (bool) whether data being parsed is Mass or IR 78 | 79 | Returns: 80 | spectra_df: (pd.DataFrame) contains new spectrum aded to dataframe 81 | ''' 82 | 83 | mol_dict = JCAMP_reader(file_path) 84 | 85 | #if conditions are not met, don't add the data 86 | if not is_mass and not check_spectra_prop(mol_dict): 87 | return spectra_df 88 | 89 | #Standardize the new spectrum and prepare for merging 90 | mol_id = mol_dict['cas registry no'].replace('-','') 91 | mol_xvalues = mol_dict['x'] 92 | mol_yvalues = mol_dict['y'] 93 | mol_df = pd.DataFrame(data = {mol_id : mol_yvalues}, index = mol_xvalues) 94 | mol_df.index = pd.cut(mol_df.index, bins = bins) 95 | mol_df = mol_df.groupby(level=0).agg('mean') 96 | 97 | logging.info('Adding spectra with id {} to dataframe'.format(mol_id)) 98 | if spectra_df is None: 99 | spectra_df = mol_df 100 | else: 101 | spectra_df = pd.merge(spectra_df, mol_df, left_index = True, right_index = True, how='outer') 102 | 103 | return spectra_df 104 | 105 | def save_spectra_to_csv(root, files, save_path, bins, is_mass = False): 106 | '''Save the spectra dataframe as csv to path 107 | 108 | Args: 109 | root: (string) path to spectra data 110 | files: (list) jdx files present in root 111 | save_path: (string) path to store csv file 112 | bins: (np.array) used for standardizing 113 | is_mass: (bool) whether data being parsed is Mass or IR 114 | 115 | Returns: 
116 | None 117 | ''' 118 | 119 | spectra_df = None 120 | for file_name in files: 121 | file_path = os.path.join(root,file_name) 122 | spectra_df = add_spectra_to_df(spectra_df, file_path\ 123 | ,bins, is_mass) 124 | logging.info('Creating dataset in {}'.format(save_path)) 125 | spectra_df.to_csv(save_path) 126 | 127 | 128 | def identify_functional_groups(inchi): 129 | '''Identify the presence of functional groups present in molecule 130 | denoted by inchi 131 | 132 | Args: 133 | root: (string) path to spectra data 134 | files: (list) jdx files present in root 135 | save_path: (string) path to store csv file 136 | bins: (np.array) used for standardizing 137 | is_mass: (bool) whether data being parsed is Mass or IR 138 | 139 | Returns: 140 | mol_func_groups: (list) contains binary values of functional groups presence 141 | None if inchi to molecule conversion returns warning or error 142 | ''' 143 | 144 | try: 145 | #Convert inchi to molecule 146 | mol = Chem.MolFromInchi(inchi, treatWarningAsError=True) 147 | mol_func_grps = [] 148 | 149 | #populate the list with binary values 150 | for _, func_struct in func_grp_structs.items(): 151 | struct_matches = mol.GetSubstructMatches(func_struct) 152 | contains_func_grp = int(len(struct_matches)>0) 153 | mol_func_grps.append(contains_func_grp) 154 | return mol_func_grps 155 | except: 156 | 157 | return None 158 | 159 | def save_target_to_csv(cas_inchi_df, save_path): 160 | '''Save the target dataframe as csv to path 161 | 162 | Args: 163 | cas_inchi_df: (pd.DataFrame) contains CAS and Inchi of molecules 164 | save_path: (string) path to store csv file 165 | 166 | Returns: 167 | None 168 | ''' 169 | column_names = list(func_grp_structs.keys()) 170 | target_df = pd.DataFrame(index = cas_inchi_df.index, columns = column_names) 171 | 172 | #Iterate the rows, don't use df.apply since a list is being returned. 
173 | for ind, (_, row) in enumerate(cas_inchi_df.iterrows()): 174 | target_df.iloc[ind, :] = identify_functional_groups(row['inchi']) 175 | 176 | 177 | target_df.dropna(inplace = True) 178 | target_df.to_csv(save_path) 179 | 180 | def preprocess_spectra_df(spectra_df, is_mass = False, **kwargs): 181 | '''Preprocess the spectra dataframe by normalizing and interpolating 182 | 183 | Args: 184 | spectra_df: (pd.DataFrame) contains standardized spectra 185 | is_mass: (bool) whether data being parsed is Mass or IR 186 | kwargs: (dict) containing methods for interpolation 187 | 188 | Returns: 189 | spectra_df: (pd.DataFrame) contains processed spectra 190 | ''' 191 | if is_mass: 192 | 193 | #Fill NaN with zero and remove m/z ratio where all values are zero 194 | spectra_df.fillna(0, inplace = True) 195 | spectra_df = spectra_df.loc[:,spectra_df.sum(axis=0)!=0] 196 | 197 | else: 198 | 199 | #Interpolate with linear or spline based on kwargs 200 | spectra_df.reset_index(inplace = True) 201 | spectra_df.iloc[:, 1:] = spectra_df.iloc[:,1:].interpolate(**kwargs,\ 202 | limit_direction='both', axis = 0) 203 | spectra_df.set_index('index', inplace = True) 204 | 205 | #Normalize each spectra 206 | return spectra_df.div(spectra_df.max(axis=0), axis=1) 207 | 208 | 209 | 210 | def load_dataset(data_dir, include_mass = True, **params): 211 | '''Load the spectra and target dataset for training 212 | 213 | Args: 214 | data_dir: (string) contains data path for csv file 215 | include_mass: (bool) whether to include mass spectra while training 216 | params: (dict) containing methods for interpolation 217 | 218 | Returns: 219 | X: (np.array) contains processed spectra values 220 | y: (np.array) contains target values of corresponding spectra 221 | ''' 222 | 223 | #load and prepare IR data 224 | ir_path = os.path.join(data_dir, 'ir.csv') 225 | logging.info('Loading IR data from {}'.format(ir_path)) 226 | ir_df = pd.read_csv(ir_path, index_col = 0) 227 | ir_df = preprocess_spectra_df(ir_df, is_mass = False, **params).T 228 | 229 | spectra_df = ir_df 230 | 231 | if include_mass: 232 | 233 | #Load and prepare mass data 234 | mass_path = os.path.join(data_dir, 'mass.csv') 235 | logging.info('Loading mass data from {}'.format(mass_path)) 236 | mass_df = pd.read_csv(mass_path, index_col = 0).T 237 | mass_df = mass_df.loc[mass_df.index.isin(ir_df.index)] 238 | mass_df = preprocess_spectra_df(mass_df, is_mass = True) 239 | 240 | # mass_df = mass_df.reindex(ir_df.index) 241 | # spectra_df = pd.concat([spectra_df, mass_df], axis = 1) 242 | # spectra_df.dropna(inplace = True) 243 | 244 | #Merge mass data with IR 245 | spectra_df = pd.merge(spectra_df, mass_df, left_index = True, right_index = True, how = 'inner') 246 | 247 | #Prepare target data and rearrange to match the spectra 248 | spectra_df.index = spectra_df.index.astype('int') 249 | target_path = os.path.join(data_dir, 'target.csv') 250 | logging.info('Loading target data from {}'.format(target_path)) 251 | target_df = pd.read_csv(target_path, index_col = 0, dtype = np.float64) 252 | 253 | fn_groups = target_df.shape[1] 254 | total_df = pd.merge(spectra_df, target_df, left_index = True, right_index = True, how = 'inner') 255 | 256 | return total_df.values[:, :-fn_groups], total_df.values[:, -fn_groups:], list(func_grp_smarts.keys()) 257 | 258 | 259 | if __name__ == '__main__': 260 | #Parsing the data from jdx and storing it in csv 261 | 262 | parser = argparse.ArgumentParser() 263 | parser.add_argument('--data_dir', default= './data',\ 264 | help = "Directory path 
containing scrapped data") 265 | parser.add_argument('--cas_list', default= 'species.txt',\ 266 | help = "File containing CAS number and smiles of molecules") 267 | 268 | args = parser.parse_args() 269 | 270 | data_dir = args.data_dir 271 | set_logger(data_dir, 'prepare_data.log') 272 | 273 | 274 | # Create bins for IR and mass spectra 275 | logging.info('Creating bins for standardizing the spectra') 276 | ir_bins = np.arange(min_ir - eps, max_ir + eps, step_ir) 277 | mass_bins = np.arange(min_mass - eps, max_mass + eps, step_mass) 278 | 279 | # Compute structures of different molecular groups 280 | logging.info('Computing the structures of functional groups') 281 | func_grp_structs = {func_name : Chem.MolFromSmarts(func_smarts)\ 282 | for func_name, func_smarts in func_grp_smarts.items()} 283 | 284 | # Create and save csv files of spectra 285 | for root, dirs, files in os.walk(data_dir): 286 | if root == os.path.join(data_dir, 'ir'): 287 | logging.info('Starting to parse IR jdx files') 288 | ir_path = os.path.join(data_dir, 'ir.csv') 289 | save_spectra_to_csv(root, files, ir_path, ir_bins, False) 290 | 291 | if root == os.path.join(data_dir, 'mass'): 292 | logging.info('Starting to parse mass jdx files') 293 | mass_path = os.path.join(data_dir, 'mass.csv') 294 | save_spectra_to_csv(root, files, mass_path, mass_bins, True) 295 | 296 | #Load CAS data and merge with inchi 297 | logging.info('Loading CAS file from {}'.format(args.cas_list)) 298 | cas_df = pd.read_csv(args.cas_list, sep='\t', header = 0, usecols = [1,2], names = ['formula','cas']) 299 | cas_df.dropna(subset=['cas'], inplace=True) 300 | cas_df.cas = cas_df.cas.str.replace('-', '') 301 | cas_df.set_index('cas', inplace = True) 302 | 303 | 304 | inchi_path = os.path.join(data_dir, 'inchi.txt') 305 | logging.info('Loading inchi file from {}'.format(inchi_path)) 306 | inchi_df = pd.read_csv(inchi_path, sep='\t', header = 0, usecols = [0,1],\ 307 | names = ['cas','inchi'], dtype = str) 308 | inchi_df.dropna(inplace = True) 309 | inchi_df.set_index('cas', inplace = True) 310 | 311 | # Create and save csv of target 312 | cas_inchi_df = pd.merge(cas_df, inchi_df, left_index = True, right_index = True, how = 'inner') 313 | target_path = os.path.join(data_dir, 'target.csv') 314 | logging.info('Creating target csv dataset in {}'.format(target_path)) 315 | save_target_to_csv(cas_inchi_df, target_path) --------------------------------------------------------------------------------