├── GPU_encode_smiles.py ├── refine_code_17012023_valid.json ├── README.md ├── composite_math_v2_nist.py ├── composite_math_v2.py ├── refine_code_17012023_train.json ├── cddd_decode.py ├── tcn_seq_train_used.py ├── tcn_seq_train_hp.py ├── mass2smiles_transformer.py ├── molecularformula.ipynb └── preprocessing_onlin-v3_mgf.ipynb /GPU_encode_smiles.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from cddd.inference import InferenceModel 4 | from cddd.preprocessing import preprocess_smiles 5 | 6 | ames_df = pd.read_csv("/home2020/home/ibmp/delser/cddd/nist/all_HRMS_train_24012023_cddd_refine_s.tsv", index_col="spectrum_id",sep="\t") 7 | ames_df["smiles"] = ames_df.smiles_preprocessed.map(preprocess_smiles) 8 | ames_df = ames_df.dropna() 9 | smiles_list = ames_df["smiles"].tolist() 10 | 11 | inference_model = InferenceModel() 12 | print("Encoding now!") 13 | smiles_embedding = inference_model.seq_to_emb(smiles_list) 14 | print("Saving file") 15 | np.save('/home2020/home/ibmp/delser/cddd/nist/cddd_all_HRMS_train_24012023_cddd_refine.npy', smiles_embedding) 16 | 17 | 18 | print("Done!") 19 | 20 | 21 | ames_df = pd.read_csv("/home2020/home/ibmp/delser/cddd/nist/all_HRMS_valid_24012023_cddd_refine_s.tsv", index_col="spectrum_id",sep="\t") 22 | ames_df["smiles"] = ames_df.smiles_preprocessed.map(preprocess_smiles) 23 | ames_df = ames_df.dropna() 24 | smiles_list = ames_df["smiles"].tolist() 25 | 26 | inference_model = InferenceModel() 27 | print("Encoding now!") 28 | smiles_embedding = inference_model.seq_to_emb(smiles_list) 29 | print("Saving file") 30 | np.save('/home2020/home/ibmp/delser/cddd/nist/cddd_all_HRMS_valid_24012023_cddd_refine.npy', smiles_embedding) 31 | 32 | print("Done!") 33 | -------------------------------------------------------------------------------- /refine_code_17012023_valid.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "op": "core/mass-edit", 4 | "engineConfig": { 5 | "facets": [], 6 | "mode": "row-based" 7 | }, 8 | "columnName": "adduct", 9 | "expression": "value", 10 | "edits": [ 11 | { 12 | "from": [ 13 | "[Cat]+" 14 | ], 15 | "fromBlank": false, 16 | "fromError": false, 17 | "to": "[M]+" 18 | } 19 | ], 20 | "description": "Mass edit cells in column adduct" 21 | }, 22 | { 23 | "op": "core/mass-edit", 24 | "engineConfig": { 25 | "facets": [], 26 | "mode": "row-based" 27 | }, 28 | "columnName": "adduct", 29 | "expression": "value", 30 | "edits": [ 31 | { 32 | "from": [ 33 | "[M+H-2H2O]+" 34 | ], 35 | "fromBlank": false, 36 | "fromError": false, 37 | "to": "[M-2H2O+H]+" 38 | } 39 | ], 40 | "description": "Mass edit cells in column adduct" 41 | }, 42 | { 43 | "op": "core/mass-edit", 44 | "engineConfig": { 45 | "facets": [], 46 | "mode": "row-based" 47 | }, 48 | "columnName": "adduct", 49 | "expression": "value", 50 | "edits": [ 51 | { 52 | "from": [ 53 | "[M+H-H2O]+" 54 | ], 55 | "fromBlank": false, 56 | "fromError": false, 57 | "to": "[M-H2O+H]+" 58 | } 59 | ], 60 | "description": "Mass edit cells in column adduct" 61 | }, 62 | { 63 | "op": "core/mass-edit", 64 | "engineConfig": { 65 | "facets": [], 66 | "mode": "row-based" 67 | }, 68 | "columnName": "adduct", 69 | "expression": "value", 70 | "edits": [ 71 | { 72 | "from": [ 73 | "[M+H-NH3]+" 74 | ], 75 | "fromBlank": false, 76 | "fromError": false, 77 | "to": "[M-NH3+H]+" 78 | } 79 | ], 80 | "description": "Mass edit cells in column adduct" 81 | } 82 | ] 
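The JSON above is an OpenRefine operation history that harmonizes adduct notation in the `adduct` column of the validation table (a companion file, `refine_code_17012023_train.json`, is listed for the training table). For reference, here is a minimal pandas sketch, not part of the published pipeline, that applies the same four replacements outside OpenRefine; the TSV file name in the usage comment is only an example taken from the scripts in this repository:

```python
import pandas as pd

# Same from -> to pairs as in refine_code_17012023_valid.json
ADDUCT_EDITS = {
    "[Cat]+": "[M]+",
    "[M+H-2H2O]+": "[M-2H2O+H]+",
    "[M+H-H2O]+": "[M-H2O+H]+",
    "[M+H-NH3]+": "[M-NH3+H]+",
}

def harmonize_adducts(df: pd.DataFrame, column: str = "adduct") -> pd.DataFrame:
    """Apply the OpenRefine mass edits to one column (exact value matches only)."""
    df[column] = df[column].replace(ADDUCT_EDITS)
    return df

# Example usage (file name as used elsewhere in this repo):
# df = pd.read_csv("all_HRMS_valid_24012023_cddd_refine_s.tsv", sep="\t")
# df = harmonize_adducts(df)
```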
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 |
2 | ![logo](https://github.com/volvox292/mass2smiles/assets/63146629/7e5b37dc-534b-4780-b310-45f197283709)
3 |
4 | Mass2SMILES is an open-source, Python-based deep learning approach for structure and functional group prediction from mass spectrometry (MS/MS) data. Spectral data can be provided as MGF files (GNPS-style), and model inference is most efficiently performed via the provided Docker container.
5 |
6 |
7 | Supplementary data with the container and model are available at the DOI below (you must have a valid licence for NIST): [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7883491.svg)](https://doi.org/10.5281/zenodo.7883491)
8 |
9 | A recent update adds Dockerfiles for building two separate containers; adjust them to your needs. The Mass2SMILES model container uses the GPU, whereas cddd does not seem to work with newer CUDA drivers and is therefore
10 | built with TensorFlow CPU; it can be sped up by increasing the number of cores, e.g. InferenceModel(cpu_threads=128). You need to point to your input and output directories; the Mass2SMILES model is now built into the container. With this setup, inference speed is greatly improved. [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.14778327.svg)](https://doi.org/10.5281/zenodo.14778327)
11 |
12 | The pre-print is available at: https://doi.org/10.1101/2023.07.06.547963
13 |
14 | ```bash {bash, echo=T, eval=F}
15 | # the container is available as a tarball in the supplementary data or via: docker pull delser292/mass2smiles:final
16 | # unzip docker.zip; the mass2smiles folder contains the model files and scripts to execute everything, and it is important to specify the path to this folder when starting predictions.
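# note (added): the MGF file must sit inside the folder you mount, since the prediction script joins it with the /app path (see mass2smiles_transformer.py further down in this repo).
# the run writes feature_ids_dataframe.tsv, result_predict.npy, result_predict1.npy and the final predicted_results.tsv back into that same folder.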
17 | 18 | # The predictions can be started through this command: 19 | 20 | docker run -v c:/your_path/to_the_folder/mass2smiles/:/app mass2smiles:transformer_v1 conda run -n tf python app/mass2smiles_transformer.py your_mgf_file.mgf /app 21 | ``` 22 | 23 | The model architecture: 24 | 25 | ![architecture](https://github.com/volvox292/mass2smiles/assets/63146629/3e4313d8-43b2-469d-bab6-c8670a00f62d) 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /composite_math_v2_nist.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from collections import defaultdict 4 | from matchms import Spikes 5 | import math 6 | #import deepchem as dc 7 | #dc.__version__ 8 | from matchms.filtering import add_losses 9 | from matchms.filtering import add_parent_mass 10 | from matchms.filtering import default_filters 11 | from matchms.filtering import normalize_intensities 12 | from matchms.filtering import select_by_intensity 13 | from matchms.filtering import reduce_to_number_of_peaks 14 | from matchms.filtering import require_minimum_number_of_peaks 15 | from matchms.filtering import select_by_mz 16 | from matchms.importing import load_from_mgf 17 | from matchms.exporting import save_as_mgf 18 | from matchms.importing import load_from_msp 19 | 20 | 21 | from matchms.filtering import repair_inchi_inchikey_smiles 22 | from matchms.filtering import derive_inchikey_from_inchi 23 | from matchms.filtering import derive_smiles_from_inchi 24 | from matchms.filtering import derive_inchi_from_smiles 25 | from matchms.filtering import harmonize_undefined_inchi 26 | from matchms.filtering import harmonize_undefined_inchikey 27 | from matchms.filtering import harmonize_undefined_smiles 28 | 29 | import pickle 30 | path_data = "/home/delser/" 31 | outfile = os.path.join(path_data, 'nist_cache_pos.pickle') 32 | with open(outfile, 'rb') as file: 33 | spectrums = pickle.load(file) 34 | 35 | def spectrum_processing(s): 36 | """This is how one would typically design a desired pre- and post- 37 | processing pipeline.""" 38 | s = normalize_intensities(s) 39 | s = select_by_intensity(s, intensity_from=0.01) 40 | s = reduce_to_number_of_peaks(s, n_required=4, n_max=500) 41 | return s 42 | 43 | spectrums = [spectrum_processing(s) for s in spectrums] 44 | spectrums = [s for s in spectrums if s is not None] 45 | 46 | 47 | 48 | 49 | 50 | def compare_update(first_mz_intensity_dict,second_mz_intensity_dict): 51 | modified_dict=first_mz_intensity_dict.copy() 52 | for key in first_mz_intensity_dict: 53 | for key_2 in second_mz_intensity_dict: 54 | if math.isclose(key,key_2,abs_tol=0.005) == False: 55 | mzs_modified_dict=list(modified_dict.keys()) 56 | matches=[math.isclose(i,key_2,abs_tol=0.005) for i in mzs_modified_dict] 57 | if True not in matches: 58 | modified_dict[key_2]=second_mz_intensity_dict[key_2] 59 | else: 60 | if first_mz_intensity_dict[key]= end_pts[i] and oupt <= end_pts[i+1]: 59 | return i 60 | return -1 # fatal error 61 | 62 | funct=['#num_of_sugars',"#Number of aliphatic carboxylic acids", 63 | "#Number of aliphatic hydroxyl groups", 64 | "#Number of aliphatic hydroxyl groups excluding tert-OH", 65 | "#Number of N functional groups attached to aromatics", 66 | "#Number of Aromatic carboxylic acides", 67 | "#Number of aromatic nitrogens", 68 | "#Number of aromatic amines", 69 | "#Number of aromatic hydroxyl groups", 70 | "#Number of carboxylic acids", 71 | "#Number of carboxylic acids", 72 | "#Number 
of carbonyl O", 73 | "#Number of carbonyl O, excluding COOH", 74 | "#Number of thiocarbonyl", 75 | "#Number of Imines", 76 | "#Number of Tertiary amines", 77 | "#Number of Secondary amines", 78 | "#Number of Primary amines", 79 | "#Number of hydroxylamine groups", 80 | "#Number of tert-alicyclic amines (no heteroatoms, not quinine-like bridged N)", 81 | "#Number of H-pyrrole nitrogens", 82 | "#Number of thiol groups", 83 | "#Number of aldehydes", 84 | "#Number of alkyl carbamates (subject to hydrolysis)", 85 | "#Number of alkyl halides", 86 | "#Number of allylic oxidation sites excluding steroid dienone", 87 | "#Number of amides", 88 | "#Number of anilines", 89 | "#Number of aryl methyl sites for hydroxylation", 90 | "#Number of azo groups", 91 | "#Number of benzene rings", 92 | "#Bicyclic", 93 | "#Number of dihydropyridines", 94 | "#Number of epoxide rings", 95 | "#Number of esters", 96 | "#Number of ether oxygens (including phenoxy)", 97 | "#Number of furan rings", 98 | "#Number of guanidine groups", 99 | "#Number of halogens", 100 | "#Number of imidazole rings", 101 | "#Number of isothiocyanates", 102 | "#Number of ketones", 103 | "#Number of ketones excluding diaryl, a,b-unsat. dienones, heteroatom on Calpha", 104 | "#Number of beta lactams", 105 | "#Number of cyclic esters (lactones)", 106 | "#Number of methoxy groups -OCH3", 107 | "#Number of nitriles", 108 | "#Number of nitro groups", 109 | "#Number of oxazole rings", 110 | "#Number of para-hydroxylation sites", 111 | "#Number of phenols", 112 | "#Number of phosphoric acid groups", 113 | "#Number of phosphoric ester groups", 114 | "#Number of piperdine rings", 115 | "#Number of primary amides", 116 | "#Number of pyridine rings", 117 | "#Number of quaternary nitrogens", 118 | "#Number of thioether", 119 | "#Number of thiazole rings", 120 | "#Number of unbranched alkanes of at least 4 members (excludes halogenated alkanes)", 121 | "#adduct_enc","#C","#H", "#O","#N", "#S", "#I", "#Br","#Cl", "#F","#P"] 122 | 123 | all_arr=[] 124 | for sample in x1: 125 | cache=[] 126 | for i in sample: 127 | cache.append(float_oupt_to_class(i,65)) 128 | xn=np.array(cache) 129 | all_arr.append(xn) 130 | 131 | result=np.stack(all_arr) 132 | #result.shape 133 | 134 | df = pd.DataFrame(result, columns = funct) 135 | 136 | df_final = df_final.join(df, how="outer") 137 | 138 | 139 | 140 | 141 | fname10=os.path.join(sys.argv[1],"predicted_results.tsv") 142 | 143 | df_final.to_csv(fname10,sep='\t') 144 | 145 | #df_final.to_csv("/Users/delser/mass2smiles/predicted_results.tsv",sep='\t') -------------------------------------------------------------------------------- /tcn_seq_train_used.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.models import Sequential 2 | from tensorflow.keras.layers import Dense, Dropout, Flatten,MultiHeadAttention 3 | from tensorflow import keras 4 | import tensorflow as tf 5 | import numpy as np 6 | from tcn import TCN 7 | import wandb 8 | from wandb.keras import WandbCallback 9 | import pickle 10 | import os 11 | 12 | wandb.init(project="mass2smiles-tcn_seq") 13 | 14 | 15 | 16 | batch_size=16 17 | n_epochs=50 18 | 19 | 20 | ytr=np.load('/home/delser/train/tcn/cddd_all_HRMS_train_24012023_cddd_refine.npy') 21 | #ytr=np.expand_dims(ytr,-1) 22 | ytr1=np.load('/home/delser/train/tcn/y1_all_HRMS_train_24012023_cddd_mf.npy') 23 | 24 | yval=np.load('/home/delser/train/tcn/cddd_all_HRMS_valid_24012023_cddd_refine.npy') 25 | 
yval1=np.load('/home/delser/train/tcn/y1_all_HRMS_valid_24012023_cddd_mf.npy') 26 | #yval=np.expand_dims(yval,-1) 27 | 28 | xtr=np.load('/home/delser/train/tcn/tcn_train_seq_sin256_2401.npy',mmap_mode='r') 29 | xval =np.load( '/home/delser/train/tcn/tcn_valid_seq_sin256_2401.npy',mmap_mode='r') 30 | 31 | 32 | 33 | class BaseAttention(tf.keras.layers.Layer): 34 | def __init__(self, **kwargs): 35 | super().__init__() 36 | self.mha = tf.keras.layers.MultiHeadAttention(**kwargs) 37 | self.layernorm = tf.keras.layers.LayerNormalization() 38 | self.add = tf.keras.layers.Add() 39 | 40 | class FeedForward(tf.keras.layers.Layer): 41 | def __init__(self, d_model, dff, dropout_rate=0.1): 42 | super().__init__() 43 | self.seq = tf.keras.Sequential([ 44 | tf.keras.layers.Dense(dff, activation='relu'), 45 | tf.keras.layers.Dense(d_model), 46 | tf.keras.layers.Dropout(dropout_rate) 47 | ]) 48 | self.add = tf.keras.layers.Add() 49 | self.layer_norm = tf.keras.layers.LayerNormalization() 50 | 51 | def call(self, x): 52 | x = self.add([x, self.seq(x)]) 53 | x = self.layer_norm(x) 54 | return x 55 | 56 | class GlobalSelfAttention(BaseAttention): 57 | def call(self, x): 58 | attn_output = self.mha( 59 | query=x, 60 | value=x, 61 | key=x) 62 | x = self.add([x, attn_output]) 63 | x = self.layernorm(x) 64 | return x 65 | 66 | class EncoderLayer(tf.keras.layers.Layer): 67 | def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1): 68 | super().__init__() 69 | 70 | self.self_attention = GlobalSelfAttention( 71 | num_heads=num_heads, 72 | key_dim=d_model, 73 | dropout=dropout_rate) 74 | 75 | self.ffn = FeedForward(d_model, dff) 76 | 77 | def call(self, x): 78 | x = self.self_attention(x) 79 | x = self.ffn(x) 80 | return x 81 | 82 | 83 | class DataGenerator(keras.utils.Sequence): 84 | def __init__(self, x_data, y_data,y1_data, batch_size): 85 | self.x, self.y, self.y1 = x_data, y_data, y1_data, 86 | self.batch_size = batch_size 87 | self.num_batches = np.ceil(len(x_data) / batch_size) 88 | self.batch_idx = np.array_split(range(len(x_data)), self.num_batches) 89 | 90 | def __len__(self): 91 | return len(self.batch_idx) 92 | 93 | def __getitem__(self, idx): 94 | batch_x = self.x[self.batch_idx[idx]] 95 | batch_y = self.y[self.batch_idx[idx]] 96 | batch_y1 = self.y1[self.batch_idx[idx]] 97 | return batch_x, [batch_y,batch_y1] 98 | 99 | train_generator = DataGenerator(xtr, ytr,ytr1, batch_size = 16) 100 | 101 | 102 | def call_existing_code(units, heads, dropout,dense_dropout,lr,filters,num_layers): 103 | tcn=TCN( 104 | nb_filters=filters, 105 | kernel_size=8, 106 | dilations=[2 ** i for i in range(6)], 107 | use_skip_connections=True, 108 | use_layer_norm=True, 109 | kernel_initializer='glorot_uniform', 110 | go_backwards=True,) 111 | print(f'TCN.receptive_field: {tcn.receptive_field}.') 112 | input0=tf.keras.Input(shape=(501,257)) 113 | input1=tf.keras.layers.Masking(mask_value=10,input_shape=(501,257))(input0) 114 | att = Sequential([ 115 | EncoderLayer(d_model=257, num_heads=heads, dff=units,dropout_rate=dropout) 116 | for _ in range(num_layers)])(input1) 117 | hd_tcn=tcn(att) 118 | output_b=Sequential([ 119 | Dropout(rate=dense_dropout), 120 | Dense(128, activation='tanh'), 121 | Dropout(rate=dense_dropout), 122 | Dense(71, activation='sigmoid')],name="funct_groups")(hd_tcn) 123 | output_a = Sequential([ 124 | Dropout(rate=dense_dropout), 125 | Dense(512, activation='relu'), 126 | Dropout(rate=dense_dropout), 127 | Dense(512, activation='linear') 128 | ],name="smiles")(hd_tcn) 129 | model= 
tf.keras.Model(inputs=input0, outputs=[output_a,output_b]) 130 | model.compile(loss={"smiles":'mean_absolute_error',"funct_groups":'mse'}, optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),metrics={"smiles":'mean_squared_error',"funct_groups":'mean_absolute_error'}) 131 | 132 | 133 | return model 134 | 135 | def build_model(): 136 | num_layers = 5 137 | units = 2048 138 | heads = 16 139 | dropout = 0.1 140 | dense_dropout = 0.1 141 | filters=256 142 | #activation = hp.Choice("activation", ["relu", "tanh"]) 143 | #dropout = hp.Boolean("dropout") 144 | #lr = hp.Float("lr", min_value=1e-6, max_value=1e-4, sampling="log") 145 | # call existing model-building code with the hyperparameter values. 146 | #model = call_existing_code(units=units, heads=heads, dropout=dropout,dense_dropout=dense_dropout,lr=lr,filters=filters) 147 | model = call_existing_code(units=units, heads=heads, dropout=dropout,dense_dropout=dense_dropout,lr=1e-4,filters=filters, num_layers =num_layers) 148 | return model 149 | 150 | 151 | # Compile and train. 152 | 153 | model= build_model() 154 | model.summary() 155 | model.fit(train_generator,validation_data=(xval,[yval,yval1]), epochs=n_epochs,shuffle=False,batch_size=None,validation_batch_size=16,callbacks=[WandbCallback(log_batch_frequency=1)]) 156 | model.save_weights('/home/delser/train/tcn/model') 157 | del model 158 | model= build_model() 159 | model.load_weights('/home/delser/train/tcn/model') 160 | result= model.predict(xval) 161 | #np.save("/home/delser/train/tcn/val_predict.npy", result) 162 | np.save("/home/delser/train/tcn/val_predict.npy", result[0]) 163 | np.save("/home/delser/train/tcn/val_predict1.npy", result[1]) 164 | 165 | print('done!') -------------------------------------------------------------------------------- /tcn_seq_train_hp.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.models import Sequential 2 | from tensorflow.keras.layers import Dense, Dropout, Flatten,MultiHeadAttention 3 | from tensorflow import keras 4 | import tensorflow as tf 5 | import numpy as np 6 | from tcn import TCN 7 | #import wandb 8 | #from wandb.keras import WandbCallback 9 | import pickle 10 | import os 11 | import keras_tuner 12 | 13 | #wandb.init(project="mass2smiles-tcn_seq") 14 | 15 | # Run before every test for reproducibility 16 | 17 | def seed_all(): 18 | 19 | np.random.seed(123) 20 | 21 | tf.random.set_seed(123) 22 | 23 | seed_all() 24 | 25 | 26 | batch_size=16 27 | #n_epochs=100 28 | 29 | 30 | ytr=np.load('/home/delser/train/tcn/cddd_all_HRMS_train_24012023_cddd_refine.npy') 31 | #ytr=np.expand_dims(ytr,-1) 32 | #ytr1=np.load('/home/delser/train/tcn/y1_all_HRMS_train_24012023_cddd.npy') 33 | 34 | yval=np.load('/home/delser/train/tcn/cddd_all_HRMS_valid_24012023_cddd_refine.npy') 35 | #yval1=np.load('/home/delser/train/tcn/y1_all_HRMS_valid_24012023_cddd.npy') 36 | #yval=np.expand_dims(yval,-1) 37 | 38 | xtr=np.load('/home/delser/train/tcn/tcn_train_seq_sin256_2401.npy',mmap_mode='r') 39 | xval =np.load( '/home/delser/train/tcn/tcn_valid_seq_sin256_2401.npy',mmap_mode='r') 40 | 41 | 42 | 43 | class BaseAttention(tf.keras.layers.Layer): 44 | def __init__(self, **kwargs): 45 | super().__init__() 46 | self.mha = tf.keras.layers.MultiHeadAttention(**kwargs) 47 | self.layernorm = tf.keras.layers.LayerNormalization() 48 | self.add = tf.keras.layers.Add() 49 | 50 | class FeedForward(tf.keras.layers.Layer): 51 | def __init__(self, d_model, dff, dropout_rate=0.1): 52 | super().__init__() 53 | self.seq = 
tf.keras.Sequential([ 54 | tf.keras.layers.Dense(dff, activation='relu'), 55 | tf.keras.layers.Dense(d_model), 56 | tf.keras.layers.Dropout(dropout_rate) 57 | ]) 58 | self.add = tf.keras.layers.Add() 59 | self.layer_norm = tf.keras.layers.LayerNormalization() 60 | 61 | def call(self, x): 62 | x = self.add([x, self.seq(x)]) 63 | x = self.layer_norm(x) 64 | return x 65 | 66 | class GlobalSelfAttention(BaseAttention): 67 | def call(self, x): 68 | attn_output = self.mha( 69 | query=x, 70 | value=x, 71 | key=x) 72 | x = self.add([x, attn_output]) 73 | x = self.layernorm(x) 74 | return x 75 | 76 | class EncoderLayer(tf.keras.layers.Layer): 77 | def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1): 78 | super().__init__() 79 | 80 | self.self_attention = GlobalSelfAttention( 81 | num_heads=num_heads, 82 | key_dim=d_model, 83 | dropout=dropout_rate) 84 | 85 | self.ffn = FeedForward(d_model, dff) 86 | 87 | def call(self, x): 88 | x = self.self_attention(x) 89 | x = self.ffn(x) 90 | return x 91 | 92 | class DataGenerator(keras.utils.Sequence): 93 | def __init__(self, x_data, y_data, batch_size): 94 | self.x, self.y = x_data, y_data 95 | self.batch_size = batch_size 96 | self.num_batches = np.ceil(len(x_data) / batch_size) 97 | self.batch_idx = np.array_split(range(len(x_data)), self.num_batches) 98 | 99 | def __len__(self): 100 | return len(self.batch_idx) 101 | 102 | def __getitem__(self, idx): 103 | batch_x = self.x[self.batch_idx[idx]] 104 | batch_y = self.y[self.batch_idx[idx]] 105 | return batch_x, batch_y 106 | 107 | train_generator = DataGenerator(xtr, ytr, batch_size = 16) 108 | 109 | 110 | 111 | def call_existing_code(units, heads, dropout,dense_dropout,lr,filters,num_layers): 112 | tcn=TCN( 113 | nb_filters=filters, 114 | kernel_size=8, 115 | dilations=[2 ** i for i in range(6)], 116 | use_skip_connections=True, 117 | use_layer_norm=True, 118 | kernel_initializer='glorot_uniform', 119 | go_backwards=True,) 120 | print(f'TCN.receptive_field: {tcn.receptive_field}.') 121 | input0=tf.keras.Input(shape=(501,257)) 122 | input1=tf.keras.layers.Masking(mask_value=10,input_shape=(501,257))(input0) 123 | att = Sequential([ 124 | EncoderLayer(d_model=257, num_heads=heads, dff=units,dropout_rate=dropout) 125 | for _ in range(num_layers)])(input1) 126 | hd_tcn=tcn(att) 127 | output_a = Sequential([ 128 | Dropout(rate=dense_dropout), 129 | Dense(512, activation='relu'), 130 | Dropout(rate=dense_dropout), 131 | Dense(512, activation='linear') 132 | ],name="smiles")(hd_tcn) 133 | model= tf.keras.Model(inputs=input0, outputs=output_a) 134 | model.compile(loss={"smiles":'mean_absolute_error'}, optimizer=tf.keras.optimizers.Adam(learning_rate=lr),metrics={"smiles":'mean_squared_error'}) 135 | 136 | return model 137 | 138 | def build_model(hp): 139 | num_layers = hp.Int("num_layers", min_value=2, max_value=6, step=2) 140 | units = hp.Int("units", min_value=512, max_value=2048, step=512) 141 | heads = hp.Int("heads", min_value=8, max_value=32, step=8) 142 | dropout = hp.Float("dropout", min_value=0.1, max_value=0.5, step=0.1) 143 | dense_dropout = hp.Float("dense_dropout", min_value=0.1, max_value=0.5, step=0.1) 144 | filters=hp.Int("filters", min_value=128, max_value=512, step=128) 145 | #activation = hp.Choice("activation", ["relu", "tanh"]) 146 | #dropout = hp.Boolean("dropout") 147 | #lr = hp.Float("lr", min_value=1e-6, max_value=1e-4, sampling="log") 148 | # call existing model-building code with the hyperparameter values. 
149 | #model = call_existing_code(units=units, heads=heads, dropout=dropout,dense_dropout=dense_dropout,lr=lr,filters=filters) 150 | model = call_existing_code(units=units, heads=heads, dropout=dropout,dense_dropout=dense_dropout,lr=1e-4,filters=filters, num_layers =num_layers) 151 | return model 152 | 153 | tuner = keras_tuner.RandomSearch( 154 | hypermodel=build_model, 155 | objective=keras_tuner.Objective("val_mean_squared_error", direction="min"), 156 | max_trials=99, 157 | executions_per_trial=1, 158 | overwrite=True, 159 | directory="my_dir", 160 | project_name="helloworld", 161 | ) 162 | 163 | tuner.search_space_summary() 164 | tuner.search(train_generator, epochs=4, validation_data=(xval,yval),batch_size=None,validation_batch_size=16) 165 | #model.compile(loss={"smiles":'mean_absolute_error',"funct_groups":'mse'}, optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),metrics={"smiles":'mean_squared_error',"funct_groups":'mean_absolute_error'}) 166 | #model.summary() 167 | #model.fit(train_generator,validation_data=(xval,[yval,yval1]), epochs=n_epochs,shuffle=False,batch_size=None,validation_batch_size=16,callbacks=[WandbCallback(log_batch_frequency=1)]) 168 | #result= model.predict(xval) 169 | #np.save("/home/delser/train/tcn/val_predict.npy", result[0]) 170 | #np.save("/home/delser/train/tcn/val_predict1.npy", result[1]) 171 | 172 | #model.save('/home/delser/train/tcn') 173 | 174 | print('done!') -------------------------------------------------------------------------------- /mass2smiles_transformer.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | import json 4 | import sys 5 | import os 6 | import subprocess 7 | import time 8 | start_time = time.time() 9 | import numpy as np 10 | import pandas as pd 11 | import pickle 12 | from matchms import set_matchms_logger_level 13 | import pandas as pd 14 | set_matchms_logger_level("ERROR") 15 | from matchms.filtering import add_losses 16 | from matchms.filtering import add_parent_mass 17 | from matchms.filtering import default_filters 18 | from matchms.filtering import normalize_intensities 19 | from matchms.filtering import select_by_intensity 20 | from matchms.filtering import reduce_to_number_of_peaks 21 | from matchms.filtering import require_minimum_number_of_peaks 22 | from matchms.filtering import select_by_mz 23 | from matchms.importing import load_from_mgf 24 | from matchms.exporting import save_as_mgf 25 | from matchms.importing import load_from_msp 26 | from matchms.filtering import repair_inchi_inchikey_smiles 27 | from matchms.filtering import derive_inchikey_from_inchi 28 | from matchms.filtering import derive_smiles_from_inchi 29 | from matchms.filtering import derive_inchi_from_smiles 30 | from matchms.filtering import harmonize_undefined_inchi 31 | from matchms.filtering import harmonize_undefined_inchikey 32 | from matchms.filtering import harmonize_undefined_smiles 33 | from tensorflow.keras.models import Sequential 34 | from tensorflow.keras.layers import Dense, Dropout 35 | import tensorflow as tf 36 | from tcn import TCN 37 | from tensorflow import keras 38 | from keras.initializers import glorot_uniform 39 | ##################### parse mfg and convert to df ######################## 40 | print('parsing specs now') 41 | def spectrum_processing(s): 42 | """This is how one would typically design a desired pre- and post- 43 | processing pipeline.""" 44 | s = default_filters(s) 45 | s = add_parent_mass(s) 46 | s = normalize_intensities(s) 47 | s = 
select_by_intensity(s, intensity_from=0.01) 48 | s = reduce_to_number_of_peaks(s, n_required=5, n_max=250) 49 | s = select_by_mz(s, mz_from=15, mz_to=2000) 50 | s = add_losses(s, loss_mz_from=15.0, loss_mz_to=350.0) 51 | s = require_minimum_number_of_peaks(s, n_required=5) 52 | return s 53 | 54 | 55 | 56 | def metadata_processing(spectrum): 57 | spectrum = default_filters(spectrum) 58 | spectrum = repair_inchi_inchikey_smiles(spectrum) 59 | spectrum = derive_inchi_from_smiles(spectrum) 60 | spectrum = derive_smiles_from_inchi(spectrum) 61 | spectrum = derive_inchikey_from_inchi(spectrum) 62 | spectrum = harmonize_undefined_smiles(spectrum) 63 | spectrum = harmonize_undefined_inchi(spectrum) 64 | spectrum = harmonize_undefined_inchikey(spectrum) 65 | return spectrum 66 | # Load data from MGF file and apply filters 67 | 68 | 69 | #path_data = # enter path to downloaded mgf file 70 | file_mgf = os.path.join(sys.argv[2], 71 | sys.argv[1]) 72 | spectrums = list(load_from_mgf(file_mgf)) 73 | 74 | spectrums = [metadata_processing(s) for s in spectrums] 75 | spectrums = [spectrum_processing(s) for s in spectrums] 76 | #spectrums = [spectrum_processing(s) for s in load_from_mgf("/Users/delser/Desktop/PhD/Phytochemistry/NP-Databases/CFM-4_DB/TOTAL_COMPOUNDS_DB.energies_merged_name.mgf")] 77 | #spectrums = [spectrum_processing(s) for s in load_from_mgf("/Users/delser/Desktop/PhD/Phytochemistry/FBMN/alltissues/altissues15072021-py.mgf")] 78 | # Omit spectrums that didn't qualify for analysis 79 | spectrums = [s for s in spectrums if s is not None] 80 | 81 | precs = [] 82 | IDs = [] 83 | mzs=[] 84 | ints=[] 85 | loss_mzs=[] 86 | loss_ints=[] 87 | 88 | 89 | for spec in spectrums: 90 | IDs.append(spec.get("feature_id")) 91 | precs.append(spec.get("precursor_mz")) 92 | mzs.append(list(spec.peaks.mz)) 93 | ints.append(list(spec.peaks.intensities)) 94 | loss_mzs.append(list(spec.losses.mz)) 95 | loss_ints.append(list(spec.losses.intensities)) 96 | 97 | metadata = pd.DataFrame(list(zip(IDs, precs,mzs,ints,loss_mzs,loss_ints)), columns=["feature_id", "precursor_mz","mzs","intensities","loss_mzs","loss_intensities" ]) 98 | fname2=os.path.join(sys.argv[2],'feature_ids_dataframe.tsv') 99 | metadata.to_csv(fname2,sep='\t') 100 | print("done!") 101 | ##################### encode specs ######################## 102 | print('encoding specs now') 103 | def positional_encoding(max_position, d_model, min_freq=1e-6): 104 | position = np.arange(max_position) 105 | freqs = min_freq**(2*(np.arange(d_model)//2)/d_model) 106 | pos_enc = position.reshape(-1,1)*freqs.reshape(1,-1) 107 | pos_enc[:, ::2] = np.cos(pos_enc[:, ::2]) 108 | pos_enc[:, 1::2] = np.sin(pos_enc[:, 1::2]) 109 | return pos_enc 110 | 111 | def trun_n_d(n,d): 112 | return ( n if not n.find('.')+1 else n[:n.find('.')+d+1] ) 113 | 114 | P=positional_encoding(200000,256, min_freq=1e2) 115 | 116 | def prepro_specs_train(df): 117 | valid=[] 118 | precs=df['precursor_mz'].to_list() 119 | mzs=df['mzs'].to_list() 120 | ints=df['intensities'].to_list() 121 | loss_mzs=df['loss_mzs'].to_list() 122 | loss_ints=df['loss_intensities'].to_list() 123 | for one_pre,one_mzs,one_ints,one_loss,one_loss_ints in zip(precs,mzs,ints,loss_mzs,loss_ints): 124 | mz_list=[round(float(trun_n_d(str(one_pre),2))*100)] # add precursor mz 125 | intes_list=[2.0] # add precursor int 126 | res = dict(zip(one_mzs+one_loss, one_ints+one_loss_ints)) # order by mzs 127 | res=dict(sorted(res.items())) 128 | for m,i in zip(list(res.keys()), list(res.values())): # change this from mgf from matchms 129 
| mz=round(float(trun_n_d(str(m),2))*100) 130 | mz_list.append(mz) 131 | intens=round(i,4) 132 | intes_list.append(intens) 133 | int_mzs=[intes_list,mz_list] 134 | valid.append(int_mzs) # put intesities at first 135 | return tf.ragged.constant(valid) 136 | 137 | train=prepro_specs_train(metadata) 138 | 139 | dimn=256 140 | def encoding(rag_tensor,P,dimn): 141 | to_pad=[] 142 | for sample in rag_tensor: 143 | all_dim=[sample[0].numpy().tolist()] 144 | pos_enc=[P[int(i)-1] for i in sample[1].numpy().tolist()] 145 | for dim in range(dimn): 146 | dim_n=[i[dim] for i in pos_enc] 147 | all_dim.append(dim_n) 148 | to_pad.append(all_dim) 149 | to_pad=[tf.keras.preprocessing.sequence.pad_sequences(i,maxlen=501,dtype='float32',padding='post',truncating='post',value=10) for i in to_pad] 150 | to_pad=np.stack((to_pad)) 151 | to_pad=np.swapaxes(to_pad, 1, -1) 152 | return to_pad 153 | 154 | xtrain=encoding(train,P,dimn) 155 | print("done!") 156 | #xval=np.load('/home/delser/train/tcn/casmi_specs.npy') 157 | class BaseAttention(tf.keras.layers.Layer): 158 | def __init__(self, **kwargs): 159 | super().__init__() 160 | self.mha = tf.keras.layers.MultiHeadAttention(**kwargs) 161 | self.layernorm = tf.keras.layers.LayerNormalization() 162 | self.add = tf.keras.layers.Add() 163 | 164 | class FeedForward(tf.keras.layers.Layer): 165 | def __init__(self, d_model, dff, dropout_rate=0.1): 166 | super().__init__() 167 | self.seq = tf.keras.Sequential([ 168 | tf.keras.layers.Dense(dff, activation='relu'), 169 | tf.keras.layers.Dense(d_model), 170 | tf.keras.layers.Dropout(dropout_rate) 171 | ]) 172 | self.add = tf.keras.layers.Add() 173 | self.layer_norm = tf.keras.layers.LayerNormalization() 174 | 175 | def call(self, x): 176 | x = self.add([x, self.seq(x)]) 177 | x = self.layer_norm(x) 178 | return x 179 | 180 | class GlobalSelfAttention(BaseAttention): 181 | def call(self, x): 182 | attn_output = self.mha( 183 | query=x, 184 | value=x, 185 | key=x) 186 | x = self.add([x, attn_output]) 187 | x = self.layernorm(x) 188 | return x 189 | 190 | class EncoderLayer(tf.keras.layers.Layer): 191 | def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1): 192 | super().__init__() 193 | 194 | self.self_attention = GlobalSelfAttention( 195 | num_heads=num_heads, 196 | key_dim=d_model, 197 | dropout=dropout_rate) 198 | 199 | self.ffn = FeedForward(d_model, dff) 200 | 201 | def call(self, x): 202 | x = self.self_attention(x) 203 | x = self.ffn(x) 204 | return x 205 | 206 | 207 | class DataGenerator(keras.utils.Sequence): 208 | def __init__(self, x_data, y_data,y1_data, batch_size): 209 | self.x, self.y, self.y1 = x_data, y_data, y1_data, 210 | self.batch_size = batch_size 211 | self.num_batches = np.ceil(len(x_data) / batch_size) 212 | self.batch_idx = np.array_split(range(len(x_data)), self.num_batches) 213 | 214 | def __len__(self): 215 | return len(self.batch_idx) 216 | 217 | def __getitem__(self, idx): 218 | batch_x = self.x[self.batch_idx[idx]] 219 | batch_y = self.y[self.batch_idx[idx]] 220 | batch_y1 = self.y1[self.batch_idx[idx]] 221 | return batch_x, [batch_y,batch_y1] 222 | 223 | 224 | 225 | 226 | def call_existing_code(units, heads, dropout,dense_dropout,lr,filters,num_layers): 227 | tcn=TCN( 228 | nb_filters=filters, 229 | kernel_size=8, 230 | dilations=[2 ** i for i in range(6)], 231 | use_skip_connections=True, 232 | use_layer_norm=True, 233 | kernel_initializer='glorot_uniform', 234 | go_backwards=True,) 235 | print(f'TCN.receptive_field: {tcn.receptive_field}.') 236 | 
input0=tf.keras.Input(shape=(501,257)) 237 | input1=tf.keras.layers.Masking(mask_value=10,input_shape=(501,257))(input0) 238 | att = Sequential([ 239 | EncoderLayer(d_model=257, num_heads=heads, dff=units,dropout_rate=dropout) 240 | for _ in range(num_layers)])(input1) 241 | hd_tcn=tcn(att) 242 | output_b=Sequential([ 243 | Dropout(rate=dense_dropout), 244 | Dense(128, activation='tanh'), 245 | Dropout(rate=dense_dropout), 246 | Dense(71, activation='sigmoid')],name="funct_groups")(hd_tcn) 247 | output_a = Sequential([ 248 | Dropout(rate=dense_dropout), 249 | Dense(512, activation='relu'), 250 | Dropout(rate=dense_dropout), 251 | Dense(512, activation='linear') 252 | ],name="smiles")(hd_tcn) 253 | model= tf.keras.Model(inputs=input0, outputs=[output_a,output_b]) 254 | model.compile(loss={"smiles":'mean_absolute_error',"funct_groups":'mse'}, optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),metrics={"smiles":'mean_squared_error',"funct_groups":'mean_absolute_error'}) 255 | 256 | 257 | return model 258 | 259 | def build_model(): 260 | num_layers = 5 261 | units = 2048 262 | heads = 16 263 | dropout = 0.1 264 | dense_dropout = 0.1 265 | filters=256 266 | #activation = hp.Choice("activation", ["relu", "tanh"]) 267 | #dropout = hp.Boolean("dropout") 268 | #lr = hp.Float("lr", min_value=1e-6, max_value=1e-4, sampling="log") 269 | # call existing model-building code with the hyperparameter values. 270 | #model = call_existing_code(units=units, heads=heads, dropout=dropout,dense_dropout=dense_dropout,lr=lr,filters=filters) 271 | model = call_existing_code(units=units, heads=heads, dropout=dropout,dense_dropout=dense_dropout,lr=1e-4,filters=filters, num_layers =num_layers) 272 | return model 273 | 274 | 275 | 276 | ##################### predict and decode ######################## 277 | model=build_model() 278 | model.load_weights(os.path.normpath(sys.argv[2]+"/misunderstood-fire-207/model")) 279 | #model = keras.models.load_model(os.path.normpath(sys.argv[2]+"/upbeat-puddle-198"), custom_objects={'TCN': TCN, 'GlorotUniform': glorot_uniform()}) 280 | #model.summary() 281 | result= model.predict(xtrain) 282 | np.save(os.path.join(sys.argv[2],"result_predict.npy"), result[0]) 283 | np.save(os.path.join(sys.argv[2],"result_predict1.npy"), result[1]) 284 | 285 | 286 | print('predict with transformer done!') 287 | 288 | ###### cddd decode predictions ##### 289 | print("decode embeddings now!") 290 | x=subprocess.check_output(['conda', 'run','-n', 'cddd', 'python', 'app/cddd_decode.py',sys.argv[2]]) 291 | print(x.decode('ascii')) 292 | 293 | print("done!") 294 | print("Everything was successfully predicted!") 295 | print("Everything was successfully predicted in --- %s minutes --- to the file predicted_results.tsv" % ((time.time() - start_time)/60)) 296 | -------------------------------------------------------------------------------- /molecularformula.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "53aeece7-a596-411f-9e1f-3f3e5ba0d354", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from tqdm import tqdm\n", 11 | "import tensorflow as tf\n", 12 | "import json\n", 13 | "import tensorflow_text as text\n", 14 | "import os\n", 15 | "import numpy as np\n", 16 | "import pandas as pd\n", 17 | "import pickle" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "id": "e8baf966-5ead-4951-a254-af7b4b301d5c", 24 | "metadata": {}, 25 | 
"outputs": [], 26 | "source": [ 27 | "from rdkit import DataStructs\n", 28 | "from rdkit import Chem\n", 29 | "from rdkit.Chem.rdMolDescriptors import CalcMolFormula\n", 30 | "#m_true=Chem.MolFromSmiles(df_final[\"decoded_test\"][1])\n", 31 | "#m_predict=Chem.MolFromSmiles(df_final[\"predicted\"][1])\n", 32 | "#ms=[m_true,m_predict]\n", 33 | "#fps = [Chem.RDKFingerprint(x) for x in ms]\n", 34 | "#DataStructs.FingerprintSimilarity(fps[0],fps[1])\n", 35 | "\n", 36 | "def get_mf(smiles1):\n", 37 | " try:\n", 38 | " m_true=Chem.MolFromSmiles(smiles1)\n", 39 | " mf=CalcMolFormula(m_true)\n", 40 | " return mf\n", 41 | " except:\n", 42 | " return \"no_prediction_or_error_with_parsing_by_rdkit\"\n", 43 | " " 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "id": "4c5b299b-56ac-42b3-bd6c-ec3c01b46a39", 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "text/html": [ 55 | "
\n", 56 | "\n", 69 | "\n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " 
\n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | "
Unnamed: 0spectrum_idprecursor_mzmzsintensitiesloss_mzsloss_intensitiessmiles_preprocessednum_of_sugarsNumber of aliphatic carboxylic acids...Number of phenolsNumber of phosphoric acid groupsNumber of phosphoric ester groupsNumber of piperdine ringsNumber of primary amidesNumber of pyridine ringsNumber of quaternary nitrogensNumber of thioetherNumber of thiazole ringsNumber of unbranched alkanes of at least 4 members (excludes halogenated alkanes)
003719.2538[53.0387, 55.0179, 55.0545, 57.0338, 59.0492, ...[0.017272727272727273, 0.01818181818181818, 0....[314.15709999999996][0.01818181818181818]CC1C(C(C(C(O1)OC2=C(OC3=C(C(=CC(=C3C2=O)O)O)CC...2.00...2000000000
115499.2298[67.0543, 69.0698, 81.0699, 83.0492, 83.0853, ...[0.010357142857142856, 0.04642857142857143, 0....[60.02260000000001, 118.02660000000003, 160.03...[0.060714285714285714, 0.39285714285714285, 0....CC1CC2(C(C1O)C=C(C(CC3C(C3(C)C)C=C(C2=O)C)OC(=...0.01...0000000000
2261102.5777[81.0334, 83.0491, 85.0224, 85.0284, 85.0333, ...[0.14210526315789473, 0.2236842105263158, 0.01...[][]CC1C(C(C(C(O1)OC2C(OC(C(C2O)O)OCC3C(C(C(C(O3)O...4.00...0000000000
3310472.2082[72.2736, 145.0759, 148.0868, 149.0707, 172.06...[0.010697674418604652, 0.03488372093023256, 0....[135.07919999999996, 148.0632, 165.08959999999...[0.053488372093023255, 0.10465116279069768, 0....CN1C2=C(C=C(C=C2)C(=O)N(CCC(=O)O)C3=CC=CC=N3)N...0.01...0000010000
4411657.3116[55.0542, 55.9818, 57.0334, 60.2456, 69.0335, ...[0.018260869565217393, 0.013043478260869565, 0...[158.094, 210.1282, 228.1352, 246.1485, 280.16...[0.02217391304347826, 0.06086956521739131, 0.1...CCCC(=O)OCC(C(C(CN1C2=C(C=C(C(=C2)C)C)N=C3C1=N...1.00...0000000000
..................................................................
237237490268.1541[86.0599, 109.065, 109.1013, 121.101, 123.1168...[0.10689655172413794, 0.04482758620689655, 0.0...[18.01030000000003, 36.02120000000002, 46.0054...[0.21724137931034482, 0.06206896551724138, 1.0...CC1CCC2C1C(=O)OC(C2C)NC(=O)C3C(O3)C0.00...0000000000
238238491411.3254[55.0544, 57.0701, 67.0544, 69.0699, 81.0699, ...[0.0215625, 0.053125, 0.015, 0.34375, 0.046875...[18.01060000000001, 88.08850000000001, 142.135...[0.153125, 0.02125, 0.01875, 0.02625, 0.028437...CC(C)C(C)C=CC(C)C1CCC2C1(CCC3=C2C(=O)C=C4C3(CC...0.00...0000000000
239239492430.2432[50.4192, 52.9216, 74.1355, 81.0699, 86.4846, ...[0.05555555555555555, 0.058333333333333334, 0....[98.63150000000002, 179.0794, 197.0896, 214.55...[0.06666666666666667, 0.2777777777777778, 1.0,...CC1CCC(C2(C1=CC(CC2)C(=C)C(=O)O)C)OC3C(C(C(C(O...1.01...0000000000
240240495578.2076[54.5814, 57.0338, 61.0286, 69.0335, 81.0331, ...[0.01, 0.010769230769230769, 0.038461538461538...[160.76359999999994][0.011923076923076923]COC1=C2C(=CC(=C1OC3C(C(C(C(O3)CO)O)O)OC4C(C(C(...2.01...0000000000
241241500243.1014[71.9563, 104.546, 105.0698, 107.0491, 107.056...[0.015833333333333335, 0.018333333333333333, 0...[24.813600000000008, 42.197700000000026, 94.04...[0.025833333333333333, 0.0175, 0.065, 1.0, 0.6...C1C(COC2=C1C=CC(=C2)O)C3=CC=C(C=C3)O0.00...2000000000
\n", 363 | "

242 rows × 68 columns

\n", 364 | "
" 365 | ], 366 | "text/plain": [ 367 | " Unnamed: 0 spectrum_id precursor_mz \\\n", 368 | "0 0 3 719.2538 \n", 369 | "1 1 5 499.2298 \n", 370 | "2 2 6 1102.5777 \n", 371 | "3 3 10 472.2082 \n", 372 | "4 4 11 657.3116 \n", 373 | ".. ... ... ... \n", 374 | "237 237 490 268.1541 \n", 375 | "238 238 491 411.3254 \n", 376 | "239 239 492 430.2432 \n", 377 | "240 240 495 578.2076 \n", 378 | "241 241 500 243.1014 \n", 379 | "\n", 380 | " mzs \\\n", 381 | "0 [53.0387, 55.0179, 55.0545, 57.0338, 59.0492, ... \n", 382 | "1 [67.0543, 69.0698, 81.0699, 83.0492, 83.0853, ... \n", 383 | "2 [81.0334, 83.0491, 85.0224, 85.0284, 85.0333, ... \n", 384 | "3 [72.2736, 145.0759, 148.0868, 149.0707, 172.06... \n", 385 | "4 [55.0542, 55.9818, 57.0334, 60.2456, 69.0335, ... \n", 386 | ".. ... \n", 387 | "237 [86.0599, 109.065, 109.1013, 121.101, 123.1168... \n", 388 | "238 [55.0544, 57.0701, 67.0544, 69.0699, 81.0699, ... \n", 389 | "239 [50.4192, 52.9216, 74.1355, 81.0699, 86.4846, ... \n", 390 | "240 [54.5814, 57.0338, 61.0286, 69.0335, 81.0331, ... \n", 391 | "241 [71.9563, 104.546, 105.0698, 107.0491, 107.056... \n", 392 | "\n", 393 | " intensities \\\n", 394 | "0 [0.017272727272727273, 0.01818181818181818, 0.... \n", 395 | "1 [0.010357142857142856, 0.04642857142857143, 0.... \n", 396 | "2 [0.14210526315789473, 0.2236842105263158, 0.01... \n", 397 | "3 [0.010697674418604652, 0.03488372093023256, 0.... \n", 398 | "4 [0.018260869565217393, 0.013043478260869565, 0... \n", 399 | ".. ... \n", 400 | "237 [0.10689655172413794, 0.04482758620689655, 0.0... \n", 401 | "238 [0.0215625, 0.053125, 0.015, 0.34375, 0.046875... \n", 402 | "239 [0.05555555555555555, 0.058333333333333334, 0.... \n", 403 | "240 [0.01, 0.010769230769230769, 0.038461538461538... \n", 404 | "241 [0.015833333333333335, 0.018333333333333333, 0... \n", 405 | "\n", 406 | " loss_mzs \\\n", 407 | "0 [314.15709999999996] \n", 408 | "1 [60.02260000000001, 118.02660000000003, 160.03... \n", 409 | "2 [] \n", 410 | "3 [135.07919999999996, 148.0632, 165.08959999999... \n", 411 | "4 [158.094, 210.1282, 228.1352, 246.1485, 280.16... \n", 412 | ".. ... \n", 413 | "237 [18.01030000000003, 36.02120000000002, 46.0054... \n", 414 | "238 [18.01060000000001, 88.08850000000001, 142.135... \n", 415 | "239 [98.63150000000002, 179.0794, 197.0896, 214.55... \n", 416 | "240 [160.76359999999994] \n", 417 | "241 [24.813600000000008, 42.197700000000026, 94.04... \n", 418 | "\n", 419 | " loss_intensities \\\n", 420 | "0 [0.01818181818181818] \n", 421 | "1 [0.060714285714285714, 0.39285714285714285, 0.... \n", 422 | "2 [] \n", 423 | "3 [0.053488372093023255, 0.10465116279069768, 0.... \n", 424 | "4 [0.02217391304347826, 0.06086956521739131, 0.1... \n", 425 | ".. ... \n", 426 | "237 [0.21724137931034482, 0.06206896551724138, 1.0... \n", 427 | "238 [0.153125, 0.02125, 0.01875, 0.02625, 0.028437... \n", 428 | "239 [0.06666666666666667, 0.2777777777777778, 1.0,... \n", 429 | "240 [0.011923076923076923] \n", 430 | "241 [0.025833333333333333, 0.0175, 0.065, 1.0, 0.6... \n", 431 | "\n", 432 | " smiles_preprocessed num_of_sugars \\\n", 433 | "0 CC1C(C(C(C(O1)OC2=C(OC3=C(C(=CC(=C3C2=O)O)O)CC... 2.0 \n", 434 | "1 CC1CC2(C(C1O)C=C(C(CC3C(C3(C)C)C=C(C2=O)C)OC(=... 0.0 \n", 435 | "2 CC1C(C(C(C(O1)OC2C(OC(C(C2O)O)OCC3C(C(C(C(O3)O... 4.0 \n", 436 | "3 CN1C2=C(C=C(C=C2)C(=O)N(CCC(=O)O)C3=CC=CC=N3)N... 0.0 \n", 437 | "4 CCCC(=O)OCC(C(C(CN1C2=C(C=C(C(=C2)C)C)N=C3C1=N... 1.0 \n", 438 | ".. ... ... 
\n", 439 | "237 CC1CCC2C1C(=O)OC(C2C)NC(=O)C3C(O3)C 0.0 \n", 440 | "238 CC(C)C(C)C=CC(C)C1CCC2C1(CCC3=C2C(=O)C=C4C3(CC... 0.0 \n", 441 | "239 CC1CCC(C2(C1=CC(CC2)C(=C)C(=O)O)C)OC3C(C(C(C(O... 1.0 \n", 442 | "240 COC1=C2C(=CC(=C1OC3C(C(C(C(O3)CO)O)O)OC4C(C(C(... 2.0 \n", 443 | "241 C1C(COC2=C1C=CC(=C2)O)C3=CC=C(C=C3)O 0.0 \n", 444 | "\n", 445 | " Number of aliphatic carboxylic acids ... Number of phenols \\\n", 446 | "0 0 ... 2 \n", 447 | "1 1 ... 0 \n", 448 | "2 0 ... 0 \n", 449 | "3 1 ... 0 \n", 450 | "4 0 ... 0 \n", 451 | ".. ... ... ... \n", 452 | "237 0 ... 0 \n", 453 | "238 0 ... 0 \n", 454 | "239 1 ... 0 \n", 455 | "240 1 ... 0 \n", 456 | "241 0 ... 2 \n", 457 | "\n", 458 | " Number of phosphoric acid groups Number of phosphoric ester groups \\\n", 459 | "0 0 0 \n", 460 | "1 0 0 \n", 461 | "2 0 0 \n", 462 | "3 0 0 \n", 463 | "4 0 0 \n", 464 | ".. ... ... \n", 465 | "237 0 0 \n", 466 | "238 0 0 \n", 467 | "239 0 0 \n", 468 | "240 0 0 \n", 469 | "241 0 0 \n", 470 | "\n", 471 | " Number of piperdine rings Number of primary amides \\\n", 472 | "0 0 0 \n", 473 | "1 0 0 \n", 474 | "2 0 0 \n", 475 | "3 0 0 \n", 476 | "4 0 0 \n", 477 | ".. ... ... \n", 478 | "237 0 0 \n", 479 | "238 0 0 \n", 480 | "239 0 0 \n", 481 | "240 0 0 \n", 482 | "241 0 0 \n", 483 | "\n", 484 | " Number of pyridine rings Number of quaternary nitrogens \\\n", 485 | "0 0 0 \n", 486 | "1 0 0 \n", 487 | "2 0 0 \n", 488 | "3 1 0 \n", 489 | "4 0 0 \n", 490 | ".. ... ... \n", 491 | "237 0 0 \n", 492 | "238 0 0 \n", 493 | "239 0 0 \n", 494 | "240 0 0 \n", 495 | "241 0 0 \n", 496 | "\n", 497 | " Number of thioether Number of thiazole rings \\\n", 498 | "0 0 0 \n", 499 | "1 0 0 \n", 500 | "2 0 0 \n", 501 | "3 0 0 \n", 502 | "4 0 0 \n", 503 | ".. ... ... \n", 504 | "237 0 0 \n", 505 | "238 0 0 \n", 506 | "239 0 0 \n", 507 | "240 0 0 \n", 508 | "241 0 0 \n", 509 | "\n", 510 | " Number of unbranched alkanes of at least 4 members (excludes halogenated alkanes) \n", 511 | "0 0 \n", 512 | "1 0 \n", 513 | "2 0 \n", 514 | "3 0 \n", 515 | "4 0 \n", 516 | ".. ... 
\n", 517 | "237 0 \n", 518 | "238 0 \n", 519 | "239 0 \n", 520 | "240 0 \n", 521 | "241 0 \n", 522 | "\n", 523 | "[242 rows x 68 columns]" 524 | ] 525 | }, 526 | "execution_count": 3, 527 | "metadata": {}, 528 | "output_type": "execute_result" 529 | } 530 | ], 531 | "source": [ 532 | "root='C:/Users/delser/'\n", 533 | "#name='all-HRMS-loss-header-train-refine_prepro.tsv'\n", 534 | "#name='all_HRMS_train_24012023_cddd_refine_s.tsv'\n", 535 | "#name='all_HRMS_valid_24012023_cddd_refine_s.tsv'\n", 536 | "name='casmi_func_groups_2201.tsv'\n", 537 | "\n", 538 | "os.path.join(root, name)\n", 539 | "df_valid = pd.read_csv(os.path.join(root, name), sep=\"\\t\") \n", 540 | "\n", 541 | "df_valid" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "id": "0a17727e-845e-4778-9626-d283eed2e287", 547 | "metadata": {}, 548 | "source": [ 549 | "adducts_v=df_valid['adduct'].to_list()" 550 | ] 551 | }, 552 | { 553 | "cell_type": "markdown", 554 | "id": "6092dbac-8cfd-445b-96f0-dbad382d593d", 555 | "metadata": {}, 556 | "source": [ 557 | "len(set(adducts_v+adducts))" 558 | ] 559 | }, 560 | { 561 | "cell_type": "markdown", 562 | "id": "7404a6f3-6049-4a64-b839-9deaf36761ca", 563 | "metadata": {}, 564 | "source": [ 565 | "list(set(adducts_v+adducts))" 566 | ] 567 | }, 568 | { 569 | "cell_type": "markdown", 570 | "id": "b27a4cd7-9b87-4113-b512-87df6b0c8f98", 571 | "metadata": {}, 572 | "source": [ 573 | "sorted_adducts=['[M+H-C2H4O2]+',#-60\n", 574 | " '[M-3H2O+H]+',#-54\n", 575 | " '[M-2H2O+H]+',#-36\n", 576 | " '[M-H2O+H]+',#-18\n", 577 | " '[M-NH3+H]+',#-17\n", 578 | " '[M]+',\n", 579 | " '[M+H]+',\n", 580 | " '[M+H+2i]+',\n", 581 | " '[M+NH3]+',#+17\n", 582 | " '[M+NH4]+',#+18\n", 583 | " '[M+Na]+',#+23\n", 584 | " '[M+H+CH3OH]',#+33\n", 585 | " '[M+K]+',#+39\n", 586 | " '[2M+H]+',\n", 587 | " '[2M+H+2i]+',\n", 588 | "'[2M+NH4]+',\n", 589 | "'[M-H+2Na]+',\n", 590 | " '[2M+Na]+',\n", 591 | "'[2M+K]+',]\n", 592 | "\n", 593 | "\n", 594 | "\n", 595 | "\n" 596 | ] 597 | }, 598 | { 599 | "cell_type": "markdown", 600 | "id": "d8a12ea1-c198-4f68-a908-82edd45776d3", 601 | "metadata": {}, 602 | "source": [ 603 | "sorted_adducts.index('[M-NH3+H]+')" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": 4, 609 | "id": "8cd72c14-9c64-4784-811f-744f88e54498", 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "def get_adduct_num(adduct):\n", 614 | " sorted_adducts=['[M+H-C2H4O2]+',#-60\n", 615 | " '[M-3H2O+H]+',#-54\n", 616 | " '[M-2H2O+H]+',#-36\n", 617 | " '[M-H2O+H]+',#-18\n", 618 | " '[M-NH3+H]+',#-17\n", 619 | " '[M]+',\n", 620 | " '[M+H]+',\n", 621 | " '[M+H+2i]+',\n", 622 | " '[M+NH3]+',#+17\n", 623 | " '[M+NH4]+',#+18\n", 624 | " '[M+Na]+',#+23\n", 625 | " '[M+H+CH3OH]',#+33\n", 626 | " '[M+K]+',#+39\n", 627 | " '[2M+H]+',\n", 628 | " '[2M+H+2i]+',\n", 629 | "'[2M+NH4]+',\n", 630 | "'[M-H+2Na]+',\n", 631 | " '[2M+Na]+',\n", 632 | "'[2M+K]+',]\n", 633 | " return sorted_adducts.index(adduct)\n" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": 5, 639 | "id": "44d72a58-b378-4e7c-bff9-9e2a4ea1478f", 640 | "metadata": {}, 641 | "outputs": [ 642 | { 643 | "data": { 644 | "text/plain": [ 645 | "4" 646 | ] 647 | }, 648 | "execution_count": 5, 649 | "metadata": {}, 650 | "output_type": "execute_result" 651 | } 652 | ], 653 | "source": [ 654 | "get_adduct_num('[M-NH3+H]+')" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 6, 660 | "id": "898370cd-ab79-4442-921b-e1c9490faf50", 661 | "metadata": { 662 | "collapsed": true, 
663 | "jupyter": { 664 | "outputs_hidden": true 665 | }, 666 | "tags": [] 667 | }, 668 | "outputs": [ 669 | { 670 | "ename": "AttributeError", 671 | "evalue": "'Series' object has no attribute 'adduct'", 672 | "output_type": "error", 673 | "traceback": [ 674 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 675 | "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", 676 | "Cell \u001b[1;32mIn[6], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m df_valid[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124madduct_enc\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mdf_valid\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mget_adduct_num\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43madduct\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m df_valid\n", 677 | "File \u001b[1;32m~\\Anaconda3\\envs\\tf_new\\lib\\site-packages\\pandas\\core\\frame.py:9565\u001b[0m, in \u001b[0;36mDataFrame.apply\u001b[1;34m(self, func, axis, raw, result_type, args, **kwargs)\u001b[0m\n\u001b[0;32m 9554\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapply\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m frame_apply\n\u001b[0;32m 9556\u001b[0m op \u001b[38;5;241m=\u001b[39m frame_apply(\n\u001b[0;32m 9557\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 9558\u001b[0m func\u001b[38;5;241m=\u001b[39mfunc,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 9563\u001b[0m kwargs\u001b[38;5;241m=\u001b[39mkwargs,\n\u001b[0;32m 9564\u001b[0m )\n\u001b[1;32m-> 9565\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mop\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39m__finalize__(\u001b[38;5;28mself\u001b[39m, method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mapply\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", 678 | "File \u001b[1;32m~\\Anaconda3\\envs\\tf_new\\lib\\site-packages\\pandas\\core\\apply.py:746\u001b[0m, in \u001b[0;36mFrameApply.apply\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 743\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mraw:\n\u001b[0;32m 744\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_raw()\n\u001b[1;32m--> 746\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply_standard\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", 679 | "File \u001b[1;32m~\\Anaconda3\\envs\\tf_new\\lib\\site-packages\\pandas\\core\\apply.py:873\u001b[0m, in \u001b[0;36mFrameApply.apply_standard\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 872\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mapply_standard\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m--> 873\u001b[0m results, res_index \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply_series_generator\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 875\u001b[0m \u001b[38;5;66;03m# wrap results\u001b[39;00m\n\u001b[0;32m 876\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mwrap_results(results, res_index)\n", 680 | "File \u001b[1;32m~\\Anaconda3\\envs\\tf_new\\lib\\site-packages\\pandas\\core\\apply.py:889\u001b[0m, in \u001b[0;36mFrameApply.apply_series_generator\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 886\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m option_context(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmode.chained_assignment\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[0;32m 887\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(series_gen):\n\u001b[0;32m 888\u001b[0m \u001b[38;5;66;03m# ignore SettingWithCopy here in case the user mutates\u001b[39;00m\n\u001b[1;32m--> 889\u001b[0m results[i] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mv\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 890\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(results[i], ABCSeries):\n\u001b[0;32m 891\u001b[0m \u001b[38;5;66;03m# If we have a view on v, we need to make a copy because\u001b[39;00m\n\u001b[0;32m 892\u001b[0m \u001b[38;5;66;03m# series_generator will swap out the underlying data\u001b[39;00m\n\u001b[0;32m 893\u001b[0m results[i] \u001b[38;5;241m=\u001b[39m results[i]\u001b[38;5;241m.\u001b[39mcopy(deep\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n", 681 | "Cell \u001b[1;32mIn[6], line 1\u001b[0m, in \u001b[0;36m\u001b[1;34m(x)\u001b[0m\n\u001b[1;32m----> 1\u001b[0m df_valid[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124madduct_enc\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m df_valid\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: get_adduct_num(\u001b[43mx\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43madduct\u001b[49m), axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[0;32m 2\u001b[0m df_valid\n", 682 | "File \u001b[1;32m~\\Anaconda3\\envs\\tf_new\\lib\\site-packages\\pandas\\core\\generic.py:5902\u001b[0m, in \u001b[0;36mNDFrame.__getattr__\u001b[1;34m(self, name)\u001b[0m\n\u001b[0;32m 5895\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[0;32m 5896\u001b[0m name \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_internal_names_set\n\u001b[0;32m 5897\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m name \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_metadata\n\u001b[0;32m 5898\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m name \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_accessors\n\u001b[0;32m 5899\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_info_axis\u001b[38;5;241m.\u001b[39m_can_hold_identifiers_and_holds_name(name)\n\u001b[0;32m 5900\u001b[0m ):\n\u001b[0;32m 5901\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m[name]\n\u001b[1;32m-> 5902\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mobject\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__getattribute__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n", 683 | "\u001b[1;31mAttributeError\u001b[0m: 'Series' object has no attribute 'adduct'" 684 | ] 685 | } 686 | ], 687 | "source": [ 688 | "df_valid['adduct_enc'] = df_valid.apply(lambda x: get_adduct_num(x.adduct), axis=1)\n", 689 | "df_valid" 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": null, 695 | "id": "840ef319-6b66-4127-bff3-bf1fdbf4b42d", 696 | "metadata": {}, 697 | "outputs": [], 698 | "source": [ 699 | "df_valid['mf'] = df_valid.apply(lambda x: get_mf(x.smiles_preprocessed), axis=1)" 700 | ] 701 | }, 702 | { 703 | "cell_type": "markdown", 704 | "id": "bad5061d-55c6-4939-a449-8b03db6565f7", 705 | "metadata": {}, 706 | "source": [ 707 | "formulas=df_valid['mf'].to_list()+df_train['mf'].to_list()" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": null, 713 | "id": "1341520a-a50d-4734-a514-591780f42cd8", 714 | "metadata": {}, 715 | "outputs": [], 716 | "source": [ 717 | "from molmass import Formula\n", 718 | "\n", 719 | "def number_of_atoms(form): \n", 720 | " f= Formula(form.replace(\"-\",\"\").replace(\"+\",\"\"))\n", 721 | " MF=(f._elements)\n", 722 | " try:\n", 723 | " C=MF[\"C\"][0]\n", 724 | " except:\n", 725 | " C=0\n", 726 | " try:\n", 727 | " H=MF[\"H\"][0]\n", 728 | " except:\n", 729 | " H=0\n", 730 | " try:\n", 731 | " O=MF[\"O\"][0]\n", 732 | " except:\n", 733 | " O=0\n", 734 | " try:\n", 735 | " N=MF[\"N\"][0]\n", 736 | " except:\n", 737 | " N=0\n", 738 | " try:\n", 739 | " S=MF[\"S\"][0]\n", 740 | " except:\n", 741 | " S=0\n", 742 | " try:\n", 743 | " I=MF[\"I\"][0]\n", 744 | " except:\n", 745 | " I=0\n", 746 | " try:\n", 747 | " Br=MF[\"Br\"][0]\n", 748 | " except:\n", 749 | " Br=0\n", 750 | " try:\n", 751 | " Cl=MF[\"Cl\"][0]\n", 752 | " except:\n", 753 | " Cl=0\n", 754 | " try:\n", 755 | " F=MF[\"F\"][0]\n", 756 | " except:\n", 757 | " F=0\n", 758 | " try:\n", 759 | " P=MF[\"P\"][0]\n", 760 | " except:\n", 761 | " P=0\n", 762 | " \n", 763 | " return [C,H, O, N, S, I, Br,Cl, F,P]\n", 764 | " \n", 765 | " " 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": null, 771 | "id": "5471c5f4-279e-491f-abcc-cf273ae1171a", 772 | "metadata": {}, 773 | "outputs": [], 774 | "source": [ 775 | "df_valid['elements'] = df_valid.apply(lambda x:number_of_atoms(x.mf), axis=1)\n", 776 | "df_valid" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": null, 782 | "id": "d46734f6-4172-4859-8e56-b8adbbc7b1c0", 783 | "metadata": {}, 784 | "outputs": [], 785 | "source": [ 786 | "ele=[\"C\",\"H\", \"O\",\" N\", \"S\", \"I\", \"Br\",\"Cl\", \"F\",\"P\"]\n", 787 | "df_elements= pd.DataFrame(df_valid['elements'].to_list(), columns=ele)\n", 788 | "df_elements" 789 | ] 790 | }, 791 | { 792 | "cell_type": "code", 793 | "execution_count": null, 794 | "id": "09cdb893-c1ca-40b2-a9d4-2a21752ce2d8", 795 | "metadata": {}, 796 | "outputs": [], 797 | "source": [ 798 | "df_elements.to_numpy().max()" 799 | ] 800 | }, 801 | { 802 | "cell_type": "code", 803 | "execution_count": null, 804 | "id": "fafb0ac4-acd9-4be0-89d7-30778d0d753d", 805 | "metadata": {}, 806 | "outputs": [], 807 | "source": [ 808 | "df_valid=df_valid.join(df_elements,how=\"outer\")\n", 809 | "df_valid" 810 | ] 811 | }, 812 | { 813 | 
"cell_type": "code", 814 | "execution_count": null, 815 | "id": "0658692f-8df1-41c7-bcd9-2c39982f65fb", 816 | "metadata": {}, 817 | "outputs": [], 818 | "source": [ 819 | "df_valid=df_valid.drop([\"elements\",\"mf\"], axis=1)\n", 820 | "df_valid" 821 | ] 822 | }, 823 | { 824 | "cell_type": "code", 825 | "execution_count": null, 826 | "id": "dd96e6b0-1259-41a7-abc2-b8acdb9e76d5", 827 | "metadata": {}, 828 | "outputs": [], 829 | "source": [ 830 | "group_array=df_valid.iloc[:, 10:]\n", 831 | "group_array" 832 | ] 833 | }, 834 | { 835 | "cell_type": "code", 836 | "execution_count": null, 837 | "id": "dd11efb4-b712-4aa8-a7ab-44233704b3f9", 838 | "metadata": {}, 839 | "outputs": [], 840 | "source": [ 841 | "group_array.to_numpy().max()" 842 | ] 843 | }, 844 | { 845 | "cell_type": "code", 846 | "execution_count": null, 847 | "id": "c2bb0831-02a7-4b1b-acdc-ce94b7086c5a", 848 | "metadata": {}, 849 | "outputs": [], 850 | "source": [ 851 | "group_array.to_numpy().shape" 852 | ] 853 | }, 854 | { 855 | "cell_type": "code", 856 | "execution_count": null, 857 | "id": "d4a9922c-c1e4-41de-8ee1-fff110a5cb2c", 858 | "metadata": {}, 859 | "outputs": [], 860 | "source": [ 861 | "def make_float_targets(k):\n", 862 | " targets = np.zeros(k, dtype=np.float32)\n", 863 | " start = 1.0 / (2 * k) # like 0.125\n", 864 | " delta = 1.0 / k # like 0.250\n", 865 | " for i in range(k):\n", 866 | " targets[i] = start + (i * delta) \n", 867 | " return targets" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": null, 873 | "id": "02f0a3e7-154b-4ef6-a2f3-e2667a1cc13b", 874 | "metadata": {}, 875 | "outputs": [], 876 | "source": [ 877 | "train_int=group_array.to_numpy()\n", 878 | "ordinal=make_float_targets(65)\n", 879 | "all_arr=[]\n", 880 | "for sample in train_int:\n", 881 | " cache=[]\n", 882 | " for i in sample:\n", 883 | " cache.append(ordinal[int(i)])\n", 884 | " x=np.array(cache)\n", 885 | " all_arr.append(x)\n", 886 | " \n", 887 | "train_y=np.stack(all_arr) \n", 888 | "train_y.shape " 889 | ] 890 | }, 891 | { 892 | "cell_type": "code", 893 | "execution_count": null, 894 | "id": "b5dfb35c-7230-4d0f-9cd7-19b8435f43c1", 895 | "metadata": {}, 896 | "outputs": [], 897 | "source": [ 898 | "#np.save('y1_all_HRMS_valid_24012023_cddd_mf.npy',train_y)" 899 | ] 900 | }, 901 | { 902 | "cell_type": "code", 903 | "execution_count": null, 904 | "id": "73610487-9ad9-4bdc-a6c6-18f7f72298df", 905 | "metadata": {}, 906 | "outputs": [], 907 | "source": [ 908 | "df_valid.to_csv('casmi_func_groups_2201.tsv_mf.tsv',sep='\\t')" 909 | ] 910 | } 911 | ], 912 | "metadata": { 913 | "kernelspec": { 914 | "display_name": "Python 3", 915 | "language": "python", 916 | "name": "python3" 917 | }, 918 | "language_info": { 919 | "codemirror_mode": { 920 | "name": "ipython", 921 | "version": 3 922 | }, 923 | "file_extension": ".py", 924 | "mimetype": "text/x-python", 925 | "name": "python", 926 | "nbconvert_exporter": "python", 927 | "pygments_lexer": "ipython3", 928 | "version": "3.8.13" 929 | } 930 | }, 931 | "nbformat": 4, 932 | "nbformat_minor": 5 933 | } 934 | -------------------------------------------------------------------------------- /preprocessing_onlin-v3_mgf.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "0c60b9c7-bba8-4e7f-91d8-7935ef85da08", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "\n", 11 | "from tqdm import tqdm\n", 12 | "import tensorflow as tf\n", 13 | "import 
json\n", 14 | "import tensorflow_text as text\n", 15 | "import os\n", 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import pickle\n", 19 | "from matchms import set_matchms_logger_level\n", 20 | "import pandas as pd\n", 21 | "set_matchms_logger_level(\"ERROR\")\n", 22 | "from matchms.filtering import add_losses\n", 23 | "from matchms.filtering import add_parent_mass\n", 24 | "from matchms.filtering import default_filters\n", 25 | "from matchms.filtering import normalize_intensities\n", 26 | "from matchms.filtering import select_by_intensity\n", 27 | "from matchms.filtering import reduce_to_number_of_peaks\n", 28 | "from matchms.filtering import require_minimum_number_of_peaks\n", 29 | "from matchms.filtering import select_by_mz\n", 30 | "from matchms.importing import load_from_mgf\n", 31 | "from matchms.exporting import save_as_mgf\n", 32 | "from matchms.importing import load_from_msp\n", 33 | "\n", 34 | "\n", 35 | "from matchms.filtering import repair_inchi_inchikey_smiles\n", 36 | "from matchms.filtering import derive_inchikey_from_inchi\n", 37 | "from matchms.filtering import derive_smiles_from_inchi\n", 38 | "from matchms.filtering import derive_inchi_from_smiles\n", 39 | "from matchms.filtering import harmonize_undefined_inchi\n", 40 | "from matchms.filtering import harmonize_undefined_inchikey\n", 41 | "from matchms.filtering import harmonize_undefined_smiles" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 5, 47 | "id": "30ed8f9b-215b-4fac-8d09-64317fb965e3", 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "Python 3.9.15\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "!python -V" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 2, 65 | "id": "1a88907a-9f26-4877-9356-1fe77bfed534", 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "CPU times: total: 7.12 s\n", 73 | "Wall time: 7.25 s\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "%%time\n", 79 | "\n", 80 | "#from spec2vec.model_building import train_new_word2vec_model\n", 81 | "\n", 82 | "def spectrum_processing(s):\n", 83 | " \"\"\"This is how one would typically design a desired pre- and post-\n", 84 | " processing pipeline.\"\"\"\n", 85 | " s = default_filters(s)\n", 86 | " s = add_parent_mass(s)\n", 87 | " s = normalize_intensities(s)\n", 88 | " s = select_by_intensity(s, intensity_from=0.01)\n", 89 | " s = reduce_to_number_of_peaks(s, n_required=5, n_max=250)\n", 90 | " s = select_by_mz(s, mz_from=15, mz_to=2000)\n", 91 | " s = add_losses(s, loss_mz_from=15.0, loss_mz_to=350.0)\n", 92 | " s = require_minimum_number_of_peaks(s, n_required=5)\n", 93 | " return s\n", 94 | "\n", 95 | "\n", 96 | "\n", 97 | "def metadata_processing(spectrum):\n", 98 | " spectrum = default_filters(spectrum)\n", 99 | " spectrum = repair_inchi_inchikey_smiles(spectrum)\n", 100 | " spectrum = derive_inchi_from_smiles(spectrum)\n", 101 | " spectrum = derive_smiles_from_inchi(spectrum)\n", 102 | " spectrum = derive_inchikey_from_inchi(spectrum)\n", 103 | " spectrum = harmonize_undefined_smiles(spectrum)\n", 104 | " spectrum = harmonize_undefined_inchi(spectrum)\n", 105 | " spectrum = harmonize_undefined_inchikey(spectrum)\n", 106 | " return spectrum\n", 107 | "# Load data from MGF file and apply filters\n", 108 | "\n", 109 | "import os\n", 110 | "from matchms.importing import load_from_mgf\n", 111 | "path_data = \"C:/Users/delser/mass2smiles\" # enter 
path to downloaded mgf file\n", 112 | "file_mgf = os.path.join(path_data, \n", 113 | " \"casmi_candidates_pos_casmi_id.mgf\")\n", 114 | "spectrums = list(load_from_mgf(file_mgf))\n", 115 | "\n", 116 | "spectrums = [metadata_processing(s) for s in spectrums]\n", 117 | "spectrums = [spectrum_processing(s) for s in spectrums]\n", 118 | "#spectrums = [spectrum_processing(s) for s in load_from_mgf(\"/Users/delser/Desktop/PhD/Phytochemistry/NP-Databases/CFM-4_DB/TOTAL_COMPOUNDS_DB.energies_merged_name.mgf\")]\n", 119 | "#spectrums = [spectrum_processing(s) for s in load_from_mgf(\"/Users/delser/Desktop/PhD/Phytochemistry/FBMN/alltissues/altissues15072021-py.mgf\")]\n", 120 | "# Omit spectrums that didn't qualify for analysis\n", 121 | "spectrums = [s for s in spectrums if s is not None]\n", 122 | "\n", 123 | "\n", 124 | "# Create spectrum documents\n", 125 | "#reference_documents = [SpectrumDocument(s, n_decimals=2) for s in spectrums]" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 3, 131 | "id": "89dca0b7-9f27-441c-95b4-3f074766ead8", 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "precs = []\n", 136 | "IDs = []\n", 137 | "mzs=[]\n", 138 | "ints=[]\n", 139 | "loss_mzs=[]\n", 140 | "loss_ints=[]\n", 141 | "\n", 142 | "\n", 143 | "for spec in spectrums: \n", 144 | " IDs.append(spec.get(\"feature_id\"))\n", 145 | " precs.append(spec.get(\"precursor_mz\"))\n", 146 | " mzs.append(list(spec.peaks.mz))\n", 147 | " ints.append(list(spec.peaks.intensities))\n", 148 | " loss_mzs.append(list(spec.losses.mz))\n", 149 | " loss_ints.append(list(spec.losses.intensities))\n", 150 | " " 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 27, 156 | "id": "4321c672-2b43-41fc-930d-3665db0c2608", 157 | "metadata": { 158 | "tags": [] 159 | }, 160 | "outputs": [ 161 | { 162 | "data": { 163 | "text/plain": [ 164 | "{56.604: 0.020625,\n", 165 | " 57.0337: 0.03625,\n", 166 | " 61.0286: 0.10625,\n", 167 | " 69.0336: 0.51875,\n", 168 | " 71.0492: 0.2,\n", 169 | " 74.5668: 0.018125,\n", 170 | " 81.0336: 0.075,\n", 171 | " 83.0491: 0.3,\n", 172 | " 85.0283: 0.425,\n", 173 | " 87.044: 0.1625,\n", 174 | " 95.0492: 0.024375,\n", 175 | " 97.0286: 0.125,\n", 176 | " 99.0442: 0.034375,\n", 177 | " 109.0281: 0.0325,\n", 178 | " 111.0441: 0.1625,\n", 179 | " 115.0384: 0.02625,\n", 180 | " 127.0392: 0.026875,\n", 181 | " 129.0546: 0.1125,\n", 182 | " 232.1556: 0.020625,\n", 183 | " 299.0545: 1.0,\n", 184 | " 299.0909: 0.036875,\n", 185 | " 311.0572: 0.01875,\n", 186 | " 355.1155: 0.15,\n", 187 | " 391.0793: 0.0225,\n", 188 | " 407.7533: 0.02,\n", 189 | " 498.4384: 0.02125,\n", 190 | " 206.79970000000003: 0.02125,\n", 191 | " 297.4848: 0.02,\n", 192 | " 314.15880000000004: 0.0225}" 193 | ] 194 | }, 195 | "execution_count": 27, 196 | "metadata": {}, 197 | "output_type": "execute_result" 198 | } 199 | ], 200 | "source": [ 201 | "res = dict(zip(mzs[3]+loss_mzs[3], ints[3]+loss_ints[3]))\n", 202 | "res" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 28, 208 | "id": "5cf3fe57-c57d-4913-8e1b-d1da1696486b", 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "text/plain": [ 214 | "{56.604: 0.020625,\n", 215 | " 57.0337: 0.03625,\n", 216 | " 61.0286: 0.10625,\n", 217 | " 69.0336: 0.51875,\n", 218 | " 71.0492: 0.2,\n", 219 | " 74.5668: 0.018125,\n", 220 | " 81.0336: 0.075,\n", 221 | " 83.0491: 0.3,\n", 222 | " 85.0283: 0.425,\n", 223 | " 87.044: 0.1625,\n", 224 | " 95.0492: 0.024375,\n", 225 | " 97.0286: 0.125,\n", 226 
| " 99.0442: 0.034375,\n", 227 | " 109.0281: 0.0325,\n", 228 | " 111.0441: 0.1625,\n", 229 | " 115.0384: 0.02625,\n", 230 | " 127.0392: 0.026875,\n", 231 | " 129.0546: 0.1125,\n", 232 | " 206.79970000000003: 0.02125,\n", 233 | " 232.1556: 0.020625,\n", 234 | " 297.4848: 0.02,\n", 235 | " 299.0545: 1.0,\n", 236 | " 299.0909: 0.036875,\n", 237 | " 311.0572: 0.01875,\n", 238 | " 314.15880000000004: 0.0225,\n", 239 | " 355.1155: 0.15,\n", 240 | " 391.0793: 0.0225,\n", 241 | " 407.7533: 0.02,\n", 242 | " 498.4384: 0.02125}" 243 | ] 244 | }, 245 | "execution_count": 28, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "res=dict(sorted(res.items()))\n", 252 | "res" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 32, 258 | "id": "777cd1bc-3b0e-4c29-a327-2c1fd755e258", 259 | "metadata": {}, 260 | "outputs": [ 261 | { 262 | "data": { 263 | "text/plain": [ 264 | "list" 265 | ] 266 | }, 267 | "execution_count": 32, 268 | "metadata": {}, 269 | "output_type": "execute_result" 270 | } 271 | ], 272 | "source": [ 273 | "type(list(res.keys()))" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 30, 279 | "id": "7fe78582-fb81-4480-aea3-850216adf45c", 280 | "metadata": {}, 281 | "outputs": [ 282 | { 283 | "data": { 284 | "text/plain": [ 285 | "dict_values([0.020625, 0.03625, 0.10625, 0.51875, 0.2, 0.018125, 0.075, 0.3, 0.425, 0.1625, 0.024375, 0.125, 0.034375, 0.0325, 0.1625, 0.02625, 0.026875, 0.1125, 0.02125, 0.020625, 0.02, 1.0, 0.036875, 0.01875, 0.0225, 0.15, 0.0225, 0.02, 0.02125])" 286 | ] 287 | }, 288 | "execution_count": 30, 289 | "metadata": {}, 290 | "output_type": "execute_result" 291 | } 292 | ], 293 | "source": [ 294 | "res.values()" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 4, 300 | "id": "2d4707a7-6f95-4fd3-89bc-23ac60ab019a", 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/html": [ 306 | "
\n", 307 | "\n", 320 | "\n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | "
feature_idprecursor_mzmzsintensitiesloss_mzsloss_intensities
0398235.1691[53.0389, 53.0509, 55.0181, 55.0545, 55.5835, ...[0.07166666666666667, 0.020833333333333332, 0....[19.8476, 19.87299999999999, 56.06339999999997...[0.075, 0.07083333333333333, 0.031666666666666...
1398235.1691[53.0389, 53.0509, 55.0181, 55.0545, 55.5835, ...[0.07166666666666667, 0.020833333333333332, 0....[19.8476, 19.87299999999999, 56.06339999999997...[0.075, 0.07083333333333333, 0.031666666666666...
2159485.2164[66.3952, 68.416, 81.3048, 100.9429, 121.0081,...[0.05, 0.04565217391304348, 0.0521739130434782...[18.01230000000004, 96.17940000000004, 102.068...[1.0, 0.058695652173913045, 0.0630434782608695...
3169705.2381[56.604, 57.0337, 61.0286, 69.0336, 71.0492, 7...[0.020625, 0.03625, 0.10625, 0.51875, 0.2, 0.0...[206.79970000000003, 297.4848, 314.15880000000...[0.02125, 0.02, 0.0225]
4423441.2264[62.099, 62.3016, 65.6795, 71.5638, 95.0857, 1...[0.07096774193548387, 0.08709677419354839, 0.0...[70.07659999999998, 88.08980000000003, 130.173...[1.0, 0.14516129032258066, 0.12903225806451613...
.....................
231439336.0647[55.173, 55.6729, 57.7501, 63.2992, 64.3001, 7...[0.010476190476190476, 0.010952380952380953, 0...[17.859800000000007, 72.62130000000002, 119.40...[0.04428571428571428, 0.010952380952380953, 0....
232322532.3113[53.6431, 61.0821, 67.0543, 69.0334, 81.0699, ...[0.0125, 0.012, 0.016, 0.08, 0.23, 0.285, 0.07...[197.09179999999998, 207.04439999999994, 215.1...[0.0245, 0.015, 0.205, 0.135, 0.28, 0.0205, 0....
233300478.3159[63.7647, 71.5594, 93.5641, 108.7938, 119.8244...[0.10476190476190476, 0.1, 0.11428571428571428...[16.459400000000016, 35.03750000000002, 53.050...[0.10952380952380952, 0.13333333333333333, 0.1...
234254329.2320[53.0389, 55.0182, 55.0546, 67.008, 67.0544, 6...[0.0456140350877193, 0.05087719298245614, 0.28...[89.70510000000002, 112.67550000000003, 145.15...[0.042105263157894736, 0.2631578947368421, 0.0...
235237278.1172[50.8153, 74.535, 78.0275, 93.2556, 97.6804, 1...[0.01818181818181818, 0.01818181818181818, 0.0...[67.65480000000002, 83.67750000000004, 86.4763...[0.02, 0.019090909090909092, 0.018181818181818...
\n", 434 | "

236 rows × 6 columns

\n", 435 | "
" 436 | ], 437 | "text/plain": [ 438 | " feature_id precursor_mz \\\n", 439 | "0 398 235.1691 \n", 440 | "1 398 235.1691 \n", 441 | "2 159 485.2164 \n", 442 | "3 169 705.2381 \n", 443 | "4 423 441.2264 \n", 444 | ".. ... ... \n", 445 | "231 439 336.0647 \n", 446 | "232 322 532.3113 \n", 447 | "233 300 478.3159 \n", 448 | "234 254 329.2320 \n", 449 | "235 237 278.1172 \n", 450 | "\n", 451 | " mzs \\\n", 452 | "0 [53.0389, 53.0509, 55.0181, 55.0545, 55.5835, ... \n", 453 | "1 [53.0389, 53.0509, 55.0181, 55.0545, 55.5835, ... \n", 454 | "2 [66.3952, 68.416, 81.3048, 100.9429, 121.0081,... \n", 455 | "3 [56.604, 57.0337, 61.0286, 69.0336, 71.0492, 7... \n", 456 | "4 [62.099, 62.3016, 65.6795, 71.5638, 95.0857, 1... \n", 457 | ".. ... \n", 458 | "231 [55.173, 55.6729, 57.7501, 63.2992, 64.3001, 7... \n", 459 | "232 [53.6431, 61.0821, 67.0543, 69.0334, 81.0699, ... \n", 460 | "233 [63.7647, 71.5594, 93.5641, 108.7938, 119.8244... \n", 461 | "234 [53.0389, 55.0182, 55.0546, 67.008, 67.0544, 6... \n", 462 | "235 [50.8153, 74.535, 78.0275, 93.2556, 97.6804, 1... \n", 463 | "\n", 464 | " intensities \\\n", 465 | "0 [0.07166666666666667, 0.020833333333333332, 0.... \n", 466 | "1 [0.07166666666666667, 0.020833333333333332, 0.... \n", 467 | "2 [0.05, 0.04565217391304348, 0.0521739130434782... \n", 468 | "3 [0.020625, 0.03625, 0.10625, 0.51875, 0.2, 0.0... \n", 469 | "4 [0.07096774193548387, 0.08709677419354839, 0.0... \n", 470 | ".. ... \n", 471 | "231 [0.010476190476190476, 0.010952380952380953, 0... \n", 472 | "232 [0.0125, 0.012, 0.016, 0.08, 0.23, 0.285, 0.07... \n", 473 | "233 [0.10476190476190476, 0.1, 0.11428571428571428... \n", 474 | "234 [0.0456140350877193, 0.05087719298245614, 0.28... \n", 475 | "235 [0.01818181818181818, 0.01818181818181818, 0.0... \n", 476 | "\n", 477 | " loss_mzs \\\n", 478 | "0 [19.8476, 19.87299999999999, 56.06339999999997... \n", 479 | "1 [19.8476, 19.87299999999999, 56.06339999999997... \n", 480 | "2 [18.01230000000004, 96.17940000000004, 102.068... \n", 481 | "3 [206.79970000000003, 297.4848, 314.15880000000... \n", 482 | "4 [70.07659999999998, 88.08980000000003, 130.173... \n", 483 | ".. ... \n", 484 | "231 [17.859800000000007, 72.62130000000002, 119.40... \n", 485 | "232 [197.09179999999998, 207.04439999999994, 215.1... \n", 486 | "233 [16.459400000000016, 35.03750000000002, 53.050... \n", 487 | "234 [89.70510000000002, 112.67550000000003, 145.15... \n", 488 | "235 [67.65480000000002, 83.67750000000004, 86.4763... \n", 489 | "\n", 490 | " loss_intensities \n", 491 | "0 [0.075, 0.07083333333333333, 0.031666666666666... \n", 492 | "1 [0.075, 0.07083333333333333, 0.031666666666666... \n", 493 | "2 [1.0, 0.058695652173913045, 0.0630434782608695... \n", 494 | "3 [0.02125, 0.02, 0.0225] \n", 495 | "4 [1.0, 0.14516129032258066, 0.12903225806451613... \n", 496 | ".. ... \n", 497 | "231 [0.04428571428571428, 0.010952380952380953, 0.... \n", 498 | "232 [0.0245, 0.015, 0.205, 0.135, 0.28, 0.0205, 0.... \n", 499 | "233 [0.10952380952380952, 0.13333333333333333, 0.1... \n", 500 | "234 [0.042105263157894736, 0.2631578947368421, 0.0... \n", 501 | "235 [0.02, 0.019090909090909092, 0.018181818181818... 
\n", 502 | "\n", 503 | "[236 rows x 6 columns]" 504 | ] 505 | }, 506 | "execution_count": 4, 507 | "metadata": {}, 508 | "output_type": "execute_result" 509 | } 510 | ], 511 | "source": [ 512 | "metadata = pd.DataFrame(list(zip(IDs, precs,mzs,ints,loss_mzs,loss_ints)), columns=[\"feature_id\", \"precursor_mz\",\"mzs\",\"intensities\",\"loss_mzs\",\"loss_intensities\" ])\n", 513 | "metadata" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": 5, 519 | "id": "538b75bb-ec4d-4bdd-82f6-687a89cd5276", 520 | "metadata": {}, 521 | "outputs": [ 522 | { 523 | "data": { 524 | "text/html": [ 525 | "
\n", 526 | "\n", 539 | "\n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | "
feature_idprecursor_mzmzsintensitiesloss_mzsloss_intensities
0398235.1691[53.0389, 53.0509, 55.0181, 55.0545, 55.5835, ...[0.07166666666666667, 0.020833333333333332, 0....[19.8476, 19.87299999999999, 56.06339999999997...[0.075, 0.07083333333333333, 0.031666666666666...
1398235.1691[53.0389, 53.0509, 55.0181, 55.0545, 55.5835, ...[0.07166666666666667, 0.020833333333333332, 0....[19.8476, 19.87299999999999, 56.06339999999997...[0.075, 0.07083333333333333, 0.031666666666666...
2159485.2164[66.3952, 68.416, 81.3048, 100.9429, 121.0081,...[0.05, 0.04565217391304348, 0.0521739130434782...[18.01230000000004, 96.17940000000004, 102.068...[1.0, 0.058695652173913045, 0.0630434782608695...
3169705.2381[56.604, 57.0337, 61.0286, 69.0336, 71.0492, 7...[0.020625, 0.03625, 0.10625, 0.51875, 0.2, 0.0...[206.79970000000003, 297.4848, 314.15880000000...[0.02125, 0.02, 0.0225]
4423441.2264[62.099, 62.3016, 65.6795, 71.5638, 95.0857, 1...[0.07096774193548387, 0.08709677419354839, 0.0...[70.07659999999998, 88.08980000000003, 130.173...[1.0, 0.14516129032258066, 0.12903225806451613...
.....................
231439336.0647[55.173, 55.6729, 57.7501, 63.2992, 64.3001, 7...[0.010476190476190476, 0.010952380952380953, 0...[17.859800000000007, 72.62130000000002, 119.40...[0.04428571428571428, 0.010952380952380953, 0....
232322532.3113[53.6431, 61.0821, 67.0543, 69.0334, 81.0699, ...[0.0125, 0.012, 0.016, 0.08, 0.23, 0.285, 0.07...[197.09179999999998, 207.04439999999994, 215.1...[0.0245, 0.015, 0.205, 0.135, 0.28, 0.0205, 0....
233300478.3159[63.7647, 71.5594, 93.5641, 108.7938, 119.8244...[0.10476190476190476, 0.1, 0.11428571428571428...[16.459400000000016, 35.03750000000002, 53.050...[0.10952380952380952, 0.13333333333333333, 0.1...
234254329.2320[53.0389, 55.0182, 55.0546, 67.008, 67.0544, 6...[0.0456140350877193, 0.05087719298245614, 0.28...[89.70510000000002, 112.67550000000003, 145.15...[0.042105263157894736, 0.2631578947368421, 0.0...
235237278.1172[50.8153, 74.535, 78.0275, 93.2556, 97.6804, 1...[0.01818181818181818, 0.01818181818181818, 0.0...[67.65480000000002, 83.67750000000004, 86.4763...[0.02, 0.019090909090909092, 0.018181818181818...
\n", 653 | "

236 rows × 6 columns

\n", 654 | "
" 655 | ], 656 | "text/plain": [ 657 | " feature_id precursor_mz \\\n", 658 | "0 398 235.1691 \n", 659 | "1 398 235.1691 \n", 660 | "2 159 485.2164 \n", 661 | "3 169 705.2381 \n", 662 | "4 423 441.2264 \n", 663 | ".. ... ... \n", 664 | "231 439 336.0647 \n", 665 | "232 322 532.3113 \n", 666 | "233 300 478.3159 \n", 667 | "234 254 329.2320 \n", 668 | "235 237 278.1172 \n", 669 | "\n", 670 | " mzs \\\n", 671 | "0 [53.0389, 53.0509, 55.0181, 55.0545, 55.5835, ... \n", 672 | "1 [53.0389, 53.0509, 55.0181, 55.0545, 55.5835, ... \n", 673 | "2 [66.3952, 68.416, 81.3048, 100.9429, 121.0081,... \n", 674 | "3 [56.604, 57.0337, 61.0286, 69.0336, 71.0492, 7... \n", 675 | "4 [62.099, 62.3016, 65.6795, 71.5638, 95.0857, 1... \n", 676 | ".. ... \n", 677 | "231 [55.173, 55.6729, 57.7501, 63.2992, 64.3001, 7... \n", 678 | "232 [53.6431, 61.0821, 67.0543, 69.0334, 81.0699, ... \n", 679 | "233 [63.7647, 71.5594, 93.5641, 108.7938, 119.8244... \n", 680 | "234 [53.0389, 55.0182, 55.0546, 67.008, 67.0544, 6... \n", 681 | "235 [50.8153, 74.535, 78.0275, 93.2556, 97.6804, 1... \n", 682 | "\n", 683 | " intensities \\\n", 684 | "0 [0.07166666666666667, 0.020833333333333332, 0.... \n", 685 | "1 [0.07166666666666667, 0.020833333333333332, 0.... \n", 686 | "2 [0.05, 0.04565217391304348, 0.0521739130434782... \n", 687 | "3 [0.020625, 0.03625, 0.10625, 0.51875, 0.2, 0.0... \n", 688 | "4 [0.07096774193548387, 0.08709677419354839, 0.0... \n", 689 | ".. ... \n", 690 | "231 [0.010476190476190476, 0.010952380952380953, 0... \n", 691 | "232 [0.0125, 0.012, 0.016, 0.08, 0.23, 0.285, 0.07... \n", 692 | "233 [0.10476190476190476, 0.1, 0.11428571428571428... \n", 693 | "234 [0.0456140350877193, 0.05087719298245614, 0.28... \n", 694 | "235 [0.01818181818181818, 0.01818181818181818, 0.0... \n", 695 | "\n", 696 | " loss_mzs \\\n", 697 | "0 [19.8476, 19.87299999999999, 56.06339999999997... \n", 698 | "1 [19.8476, 19.87299999999999, 56.06339999999997... \n", 699 | "2 [18.01230000000004, 96.17940000000004, 102.068... \n", 700 | "3 [206.79970000000003, 297.4848, 314.15880000000... \n", 701 | "4 [70.07659999999998, 88.08980000000003, 130.173... \n", 702 | ".. ... \n", 703 | "231 [17.859800000000007, 72.62130000000002, 119.40... \n", 704 | "232 [197.09179999999998, 207.04439999999994, 215.1... \n", 705 | "233 [16.459400000000016, 35.03750000000002, 53.050... \n", 706 | "234 [89.70510000000002, 112.67550000000003, 145.15... \n", 707 | "235 [67.65480000000002, 83.67750000000004, 86.4763... \n", 708 | "\n", 709 | " loss_intensities \n", 710 | "0 [0.075, 0.07083333333333333, 0.031666666666666... \n", 711 | "1 [0.075, 0.07083333333333333, 0.031666666666666... \n", 712 | "2 [1.0, 0.058695652173913045, 0.0630434782608695... \n", 713 | "3 [0.02125, 0.02, 0.0225] \n", 714 | "4 [1.0, 0.14516129032258066, 0.12903225806451613... \n", 715 | ".. ... \n", 716 | "231 [0.04428571428571428, 0.010952380952380953, 0.... \n", 717 | "232 [0.0245, 0.015, 0.205, 0.135, 0.28, 0.0205, 0.... \n", 718 | "233 [0.10952380952380952, 0.13333333333333333, 0.1... \n", 719 | "234 [0.042105263157894736, 0.2631578947368421, 0.0... \n", 720 | "235 [0.02, 0.019090909090909092, 0.018181818181818... 
\n", 721 | "\n", 722 | "[236 rows x 6 columns]" 723 | ] 724 | }, 725 | "execution_count": 5, 726 | "metadata": {}, 727 | "output_type": "execute_result" 728 | } 729 | ], 730 | "source": [ 731 | "df_train=metadata.dropna()\n", 732 | "df_train" 733 | ] 734 | }, 735 | { 736 | "cell_type": "markdown", 737 | "id": "17d5d4e0-789e-415e-98c0-7b357c3262bc", 738 | "metadata": {}, 739 | "source": [ 740 | "df_train=df_train.loc[df_train.loss_mzs.apply(str) != \"[]\"]\n", 741 | "df_train" 742 | ] 743 | }, 744 | { 745 | "cell_type": "markdown", 746 | "id": "b9428407-e8b2-450e-82ca-2d7a5541cdf7", 747 | "metadata": {}, 748 | "source": [ 749 | "df_wrong=metadata.loc[set(metadata.index) - set(df_train.index.values.tolist())]\n", 750 | "df_wrong" 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": 6, 756 | "id": "ac599deb-d5cc-4bec-824b-52a3ae976d8a", 757 | "metadata": {}, 758 | "outputs": [], 759 | "source": [ 760 | "df_train.to_csv('casmi_ids.tsv',sep='\\t')" 761 | ] 762 | }, 763 | { 764 | "cell_type": "code", 765 | "execution_count": 33, 766 | "id": "c9d0f5a9-2e05-4143-bf21-a62d075b273b", 767 | "metadata": {}, 768 | "outputs": [ 769 | { 770 | "data": { 771 | "text/html": [ 772 | "
\n", 773 | "\n", 786 | "\n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 
1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | "
Unnamed: 0spectrum_idprecursor_mzmzsintensitiesloss_mzsloss_intensitiessmiles_preprocessednum_of_sugarsNumber of aliphatic carboxylic acids...Number of phenolsNumber of phosphoric acid groupsNumber of phosphoric ester groupsNumber of piperdine ringsNumber of primary amidesNumber of pyridine ringsNumber of quaternary nitrogensNumber of thioetherNumber of thiazole ringsNumber of unbranched alkanes of at least 4 members (excludes halogenated alkanes)
003719.2538[53.0387, 55.0179, 55.0545, 57.0338, 59.0492, ...[0.017272727272727273, 0.01818181818181818, 0....[314.15709999999996][0.01818181818181818]CC1C(C(C(C(O1)OC2=C(OC3=C(C(=CC(=C3C2=O)O)O)CC...2.00...2000000000
115499.2298[67.0543, 69.0698, 81.0699, 83.0492, 83.0853, ...[0.010357142857142856, 0.04642857142857143, 0....[60.02260000000001, 118.02660000000003, 160.03...[0.060714285714285714, 0.39285714285714285, 0....CC1CC2(C(C1O)C=C(C(CC3C(C3(C)C)C=C(C2=O)C)OC(=...0.01...0000000000
2261102.5777[81.0334, 83.0491, 85.0224, 85.0284, 85.0333, ...[0.14210526315789473, 0.2236842105263158, 0.01...[][]CC1C(C(C(C(O1)OC2C(OC(C(C2O)O)OCC3C(C(C(C(O3)O...4.00...0000000000
3310472.2082[72.2736, 145.0759, 148.0868, 149.0707, 172.06...[0.010697674418604652, 0.03488372093023256, 0....[135.07919999999996, 148.0632, 165.08959999999...[0.053488372093023255, 0.10465116279069768, 0....CN1C2=C(C=C(C=C2)C(=O)N(CCC(=O)O)C3=CC=CC=N3)N...0.01...0000010000
4411657.3116[55.0542, 55.9818, 57.0334, 60.2456, 69.0335, ...[0.018260869565217393, 0.013043478260869565, 0...[158.094, 210.1282, 228.1352, 246.1485, 280.16...[0.02217391304347826, 0.06086956521739131, 0.1...CCCC(=O)OCC(C(C(CN1C2=C(C=C(C(=C2)C)C)N=C3C1=N...1.00...0000000000
..................................................................
237237490268.1541[86.0599, 109.065, 109.1013, 121.101, 123.1168...[0.10689655172413794, 0.04482758620689655, 0.0...[18.01030000000003, 36.02120000000002, 46.0054...[0.21724137931034482, 0.06206896551724138, 1.0...CC1CCC2C1C(=O)OC(C2C)NC(=O)C3C(O3)C0.00...0000000000
238238491411.3254[55.0544, 57.0701, 67.0544, 69.0699, 81.0699, ...[0.0215625, 0.053125, 0.015, 0.34375, 0.046875...[18.01060000000001, 88.08850000000001, 142.135...[0.153125, 0.02125, 0.01875, 0.02625, 0.028437...CC(C)C(C)C=CC(C)C1CCC2C1(CCC3=C2C(=O)C=C4C3(CC...0.00...0000000000
239239492430.2432[50.4192, 52.9216, 74.1355, 81.0699, 86.4846, ...[0.05555555555555555, 0.058333333333333334, 0....[98.63150000000002, 179.0794, 197.0896, 214.55...[0.06666666666666667, 0.2777777777777778, 1.0,...CC1CCC(C2(C1=CC(CC2)C(=C)C(=O)O)C)OC3C(C(C(C(O...1.01...0000000000
240240495578.2076[54.5814, 57.0338, 61.0286, 69.0335, 81.0331, ...[0.01, 0.010769230769230769, 0.038461538461538...[160.76359999999994][0.011923076923076923]COC1=C2C(=CC(=C1OC3C(C(C(C(O3)CO)O)O)OC4C(C(C(...2.01...0000000000
241241500243.1014[71.9563, 104.546, 105.0698, 107.0491, 107.056...[0.015833333333333335, 0.018333333333333333, 0...[24.813600000000008, 42.197700000000026, 94.04...[0.025833333333333333, 0.0175, 0.065, 1.0, 0.6...C1C(COC2=C1C=CC(=C2)O)C3=CC=C(C=C3)O0.00...2000000000
\n", 1080 | "

242 rows × 68 columns

\n", 1081 | "
" 1082 | ], 1083 | "text/plain": [ 1084 | " Unnamed: 0 spectrum_id precursor_mz \\\n", 1085 | "0 0 3 719.2538 \n", 1086 | "1 1 5 499.2298 \n", 1087 | "2 2 6 1102.5777 \n", 1088 | "3 3 10 472.2082 \n", 1089 | "4 4 11 657.3116 \n", 1090 | ".. ... ... ... \n", 1091 | "237 237 490 268.1541 \n", 1092 | "238 238 491 411.3254 \n", 1093 | "239 239 492 430.2432 \n", 1094 | "240 240 495 578.2076 \n", 1095 | "241 241 500 243.1014 \n", 1096 | "\n", 1097 | " mzs \\\n", 1098 | "0 [53.0387, 55.0179, 55.0545, 57.0338, 59.0492, ... \n", 1099 | "1 [67.0543, 69.0698, 81.0699, 83.0492, 83.0853, ... \n", 1100 | "2 [81.0334, 83.0491, 85.0224, 85.0284, 85.0333, ... \n", 1101 | "3 [72.2736, 145.0759, 148.0868, 149.0707, 172.06... \n", 1102 | "4 [55.0542, 55.9818, 57.0334, 60.2456, 69.0335, ... \n", 1103 | ".. ... \n", 1104 | "237 [86.0599, 109.065, 109.1013, 121.101, 123.1168... \n", 1105 | "238 [55.0544, 57.0701, 67.0544, 69.0699, 81.0699, ... \n", 1106 | "239 [50.4192, 52.9216, 74.1355, 81.0699, 86.4846, ... \n", 1107 | "240 [54.5814, 57.0338, 61.0286, 69.0335, 81.0331, ... \n", 1108 | "241 [71.9563, 104.546, 105.0698, 107.0491, 107.056... \n", 1109 | "\n", 1110 | " intensities \\\n", 1111 | "0 [0.017272727272727273, 0.01818181818181818, 0.... \n", 1112 | "1 [0.010357142857142856, 0.04642857142857143, 0.... \n", 1113 | "2 [0.14210526315789473, 0.2236842105263158, 0.01... \n", 1114 | "3 [0.010697674418604652, 0.03488372093023256, 0.... \n", 1115 | "4 [0.018260869565217393, 0.013043478260869565, 0... \n", 1116 | ".. ... \n", 1117 | "237 [0.10689655172413794, 0.04482758620689655, 0.0... \n", 1118 | "238 [0.0215625, 0.053125, 0.015, 0.34375, 0.046875... \n", 1119 | "239 [0.05555555555555555, 0.058333333333333334, 0.... \n", 1120 | "240 [0.01, 0.010769230769230769, 0.038461538461538... \n", 1121 | "241 [0.015833333333333335, 0.018333333333333333, 0... \n", 1122 | "\n", 1123 | " loss_mzs \\\n", 1124 | "0 [314.15709999999996] \n", 1125 | "1 [60.02260000000001, 118.02660000000003, 160.03... \n", 1126 | "2 [] \n", 1127 | "3 [135.07919999999996, 148.0632, 165.08959999999... \n", 1128 | "4 [158.094, 210.1282, 228.1352, 246.1485, 280.16... \n", 1129 | ".. ... \n", 1130 | "237 [18.01030000000003, 36.02120000000002, 46.0054... \n", 1131 | "238 [18.01060000000001, 88.08850000000001, 142.135... \n", 1132 | "239 [98.63150000000002, 179.0794, 197.0896, 214.55... \n", 1133 | "240 [160.76359999999994] \n", 1134 | "241 [24.813600000000008, 42.197700000000026, 94.04... \n", 1135 | "\n", 1136 | " loss_intensities \\\n", 1137 | "0 [0.01818181818181818] \n", 1138 | "1 [0.060714285714285714, 0.39285714285714285, 0.... \n", 1139 | "2 [] \n", 1140 | "3 [0.053488372093023255, 0.10465116279069768, 0.... \n", 1141 | "4 [0.02217391304347826, 0.06086956521739131, 0.1... \n", 1142 | ".. ... \n", 1143 | "237 [0.21724137931034482, 0.06206896551724138, 1.0... \n", 1144 | "238 [0.153125, 0.02125, 0.01875, 0.02625, 0.028437... \n", 1145 | "239 [0.06666666666666667, 0.2777777777777778, 1.0,... \n", 1146 | "240 [0.011923076923076923] \n", 1147 | "241 [0.025833333333333333, 0.0175, 0.065, 1.0, 0.6... \n", 1148 | "\n", 1149 | " smiles_preprocessed num_of_sugars \\\n", 1150 | "0 CC1C(C(C(C(O1)OC2=C(OC3=C(C(=CC(=C3C2=O)O)O)CC... 2.0 \n", 1151 | "1 CC1CC2(C(C1O)C=C(C(CC3C(C3(C)C)C=C(C2=O)C)OC(=... 0.0 \n", 1152 | "2 CC1C(C(C(C(O1)OC2C(OC(C(C2O)O)OCC3C(C(C(C(O3)O... 4.0 \n", 1153 | "3 CN1C2=C(C=C(C=C2)C(=O)N(CCC(=O)O)C3=CC=CC=N3)N... 0.0 \n", 1154 | "4 CCCC(=O)OCC(C(C(CN1C2=C(C=C(C(=C2)C)C)N=C3C1=N... 1.0 \n", 1155 | ".. ... ... 
\n", 1156 | "237 CC1CCC2C1C(=O)OC(C2C)NC(=O)C3C(O3)C 0.0 \n", 1157 | "238 CC(C)C(C)C=CC(C)C1CCC2C1(CCC3=C2C(=O)C=C4C3(CC... 0.0 \n", 1158 | "239 CC1CCC(C2(C1=CC(CC2)C(=C)C(=O)O)C)OC3C(C(C(C(O... 1.0 \n", 1159 | "240 COC1=C2C(=CC(=C1OC3C(C(C(C(O3)CO)O)O)OC4C(C(C(... 2.0 \n", 1160 | "241 C1C(COC2=C1C=CC(=C2)O)C3=CC=C(C=C3)O 0.0 \n", 1161 | "\n", 1162 | " Number of aliphatic carboxylic acids ... Number of phenols \\\n", 1163 | "0 0 ... 2 \n", 1164 | "1 1 ... 0 \n", 1165 | "2 0 ... 0 \n", 1166 | "3 1 ... 0 \n", 1167 | "4 0 ... 0 \n", 1168 | ".. ... ... ... \n", 1169 | "237 0 ... 0 \n", 1170 | "238 0 ... 0 \n", 1171 | "239 1 ... 0 \n", 1172 | "240 1 ... 0 \n", 1173 | "241 0 ... 2 \n", 1174 | "\n", 1175 | " Number of phosphoric acid groups Number of phosphoric ester groups \\\n", 1176 | "0 0 0 \n", 1177 | "1 0 0 \n", 1178 | "2 0 0 \n", 1179 | "3 0 0 \n", 1180 | "4 0 0 \n", 1181 | ".. ... ... \n", 1182 | "237 0 0 \n", 1183 | "238 0 0 \n", 1184 | "239 0 0 \n", 1185 | "240 0 0 \n", 1186 | "241 0 0 \n", 1187 | "\n", 1188 | " Number of piperdine rings Number of primary amides \\\n", 1189 | "0 0 0 \n", 1190 | "1 0 0 \n", 1191 | "2 0 0 \n", 1192 | "3 0 0 \n", 1193 | "4 0 0 \n", 1194 | ".. ... ... \n", 1195 | "237 0 0 \n", 1196 | "238 0 0 \n", 1197 | "239 0 0 \n", 1198 | "240 0 0 \n", 1199 | "241 0 0 \n", 1200 | "\n", 1201 | " Number of pyridine rings Number of quaternary nitrogens \\\n", 1202 | "0 0 0 \n", 1203 | "1 0 0 \n", 1204 | "2 0 0 \n", 1205 | "3 1 0 \n", 1206 | "4 0 0 \n", 1207 | ".. ... ... \n", 1208 | "237 0 0 \n", 1209 | "238 0 0 \n", 1210 | "239 0 0 \n", 1211 | "240 0 0 \n", 1212 | "241 0 0 \n", 1213 | "\n", 1214 | " Number of thioether Number of thiazole rings \\\n", 1215 | "0 0 0 \n", 1216 | "1 0 0 \n", 1217 | "2 0 0 \n", 1218 | "3 0 0 \n", 1219 | "4 0 0 \n", 1220 | ".. ... ... \n", 1221 | "237 0 0 \n", 1222 | "238 0 0 \n", 1223 | "239 0 0 \n", 1224 | "240 0 0 \n", 1225 | "241 0 0 \n", 1226 | "\n", 1227 | " Number of unbranched alkanes of at least 4 members (excludes halogenated alkanes) \n", 1228 | "0 0 \n", 1229 | "1 0 \n", 1230 | "2 0 \n", 1231 | "3 0 \n", 1232 | "4 0 \n", 1233 | ".. ... 
\n", 1234 | "237 0 \n", 1235 | "238 0 \n", 1236 | "239 0 \n", 1237 | "240 0 \n", 1238 | "241 0 \n", 1239 | "\n", 1240 | "[242 rows x 68 columns]" 1241 | ] 1242 | }, 1243 | "execution_count": 33, 1244 | "metadata": {}, 1245 | "output_type": "execute_result" 1246 | } 1247 | ], 1248 | "source": [ 1249 | "df_train = pd.read_csv(\"casmi_func_groups_2201.tsv\",sep=\"\\t\")\n", 1250 | "#data_df = pd.read_csv(\"/Users/delser/mass2smiles/retrain/nist/all_HRMS_validation_16122022_cddd_refine.tsv\",sep=\"\\t\")\n", 1251 | "df_train=df_train.dropna()\n", 1252 | "df_train" 1253 | ] 1254 | }, 1255 | { 1256 | "cell_type": "code", 1257 | "execution_count": 9, 1258 | "id": "0d79b609-3512-4b19-950c-be76089d8a96", 1259 | "metadata": {}, 1260 | "outputs": [], 1261 | "source": [ 1262 | "df_wrong.to_csv('loss_fail_matchms.tsv',sep='\\t')" 1263 | ] 1264 | }, 1265 | { 1266 | "cell_type": "code", 1267 | "execution_count": 34, 1268 | "id": "62e7f44e-b7fc-4154-a48c-c6f12b02ddb9", 1269 | "metadata": {}, 1270 | "outputs": [], 1271 | "source": [ 1272 | "def positional_encoding(max_position, d_model, min_freq=1e-6):\n", 1273 | " position = np.arange(max_position)\n", 1274 | " freqs = min_freq**(2*(np.arange(d_model)//2)/d_model)\n", 1275 | " pos_enc = position.reshape(-1,1)*freqs.reshape(1,-1)\n", 1276 | " pos_enc[:, ::2] = np.cos(pos_enc[:, ::2])\n", 1277 | " pos_enc[:, 1::2] = np.sin(pos_enc[:, 1::2])\n", 1278 | " return pos_enc\n", 1279 | "\n", 1280 | "def trun_n_d(n,d):\n", 1281 | " return ( n if not n.find('.')+1 else n[:n.find('.')+d+1] )" 1282 | ] 1283 | }, 1284 | { 1285 | "cell_type": "code", 1286 | "execution_count": 35, 1287 | "id": "23408430-22d9-4190-a930-54743b8a48ce", 1288 | "metadata": {}, 1289 | "outputs": [], 1290 | "source": [ 1291 | "P=positional_encoding(200000,256, min_freq=1e2)\n", 1292 | "#np.save('positions_512_1e2.npy',P)" 1293 | ] 1294 | }, 1295 | { 1296 | "cell_type": "code", 1297 | "execution_count": 11, 1298 | "id": "84cb2056-f65f-4a64-8139-ddae05cefe63", 1299 | "metadata": {}, 1300 | "outputs": [], 1301 | "source": [ 1302 | "#matchms mgf encoding\n", 1303 | "\n", 1304 | "def prepro_specs_train(df):\n", 1305 | " valid=[]\n", 1306 | " precs=df['precursor_mz'].to_list()\n", 1307 | " mzs=df['mzs'].to_list()\n", 1308 | " ints=df['intensities'].to_list()\n", 1309 | " loss_mzs=df['loss_mzs'].to_list()\n", 1310 | " loss_ints=df['loss_intensities'].to_list()\n", 1311 | " for one_pre,one_mzs,one_ints,one_loss,one_loss_ints in tqdm(zip(precs,mzs,ints,loss_mzs,loss_ints)):\n", 1312 | " mz_list=[round(float(trun_n_d(str(one_pre),2))*100)] # add precursor mz\n", 1313 | " intes_list=[2.0] # add precursor int\n", 1314 | " res = dict(zip(one_mzs+one_loss, one_ints+one_loss_ints)) # order by mzs\n", 1315 | " res=dict(sorted(res.items()))\n", 1316 | " for m,i in zip(list(res.keys()), list(res.values())): # change this from mgf from matchms\n", 1317 | " mz=round(float(trun_n_d(str(m),2))*100)\n", 1318 | " mz_list.append(mz)\n", 1319 | " intens=round(i,4)\n", 1320 | " intes_list.append(intens)\n", 1321 | " int_mzs=[intes_list,mz_list] \n", 1322 | " valid.append(int_mzs) # put intesities at first\n", 1323 | " return tf.ragged.constant(valid)\n", 1324 | " " 1325 | ] 1326 | }, 1327 | { 1328 | "cell_type": "code", 1329 | "execution_count": 36, 1330 | "id": "c1230761-5a62-431e-906b-a403f2127558", 1331 | "metadata": {}, 1332 | "outputs": [], 1333 | "source": [ 1334 | "def prepro_specs_train(df):\n", 1335 | " valid=[]\n", 1336 | " precs=df['precursor_mz'].to_list()\n", 1337 | " mzs=df['mzs'].to_list()\n", 1338 | " 
ints=df['intensities'].to_list()\n", 1339 | " loss_mzs=df['loss_mzs'].to_list()\n", 1340 | " loss_ints=df['loss_intensities'].to_list()\n", 1341 | " for one_pre,one_mzs,one_ints,one_loss,one_loss_ints in tqdm(zip(precs,mzs,ints,loss_mzs,loss_ints)):\n", 1342 | " mz_list=[round(float(trun_n_d(str(one_pre),2))*100)] # add precursor mz\n", 1343 | " intes_list=[2.0] # add precursor int\n", 1344 | " res = dict(zip(json.loads(one_mzs)+json.loads(one_loss), json.loads(one_ints)+json.loads(one_loss_ints))) # order by mzs\n", 1345 | " res=dict(sorted(res.items()))\n", 1346 | " for m,i in zip(list(res.keys()), list(res.values())): # change this from mgf from matchms\n", 1347 | " mz=round(float(trun_n_d(str(m),2))*100)\n", 1348 | " mz_list.append(mz)\n", 1349 | " intens=round(i,4)\n", 1350 | " intes_list.append(intens)\n", 1351 | " int_mzs=[intes_list,mz_list] \n", 1352 | " valid.append(int_mzs) # put intesities at first\n", 1353 | " return tf.ragged.constant(valid)" 1354 | ] 1355 | }, 1356 | { 1357 | "cell_type": "code", 1358 | "execution_count": 37, 1359 | "id": "e63172d5-e297-4760-91c1-2c4e1eae540e", 1360 | "metadata": {}, 1361 | "outputs": [ 1362 | { 1363 | "name": "stderr", 1364 | "output_type": "stream", 1365 | "text": [ 1366 | "242it [00:00, 1234.65it/s]\n" 1367 | ] 1368 | }, 1369 | { 1370 | "name": "stdout", 1371 | "output_type": "stream", 1372 | "text": [ 1373 | "CPU times: total: 516 ms\n", 1374 | "Wall time: 475 ms\n" 1375 | ] 1376 | } 1377 | ], 1378 | "source": [ 1379 | "%%time\n", 1380 | "train=prepro_specs_train(df_train)" 1381 | ] 1382 | }, 1383 | { 1384 | "cell_type": "code", 1385 | "execution_count": 17, 1386 | "id": "c7c0941a-7433-4d4c-b78e-572fe1ba72a0", 1387 | "metadata": {}, 1388 | "outputs": [ 1389 | { 1390 | "data": { 1391 | "text/plain": [ 1392 | "TensorShape([5, None, None])" 1393 | ] 1394 | }, 1395 | "execution_count": 17, 1396 | "metadata": {}, 1397 | "output_type": "execute_result" 1398 | } 1399 | ], 1400 | "source": [ 1401 | "tf.gather(train, [0, 1, 2, 3, 4]).shape" 1402 | ] 1403 | }, 1404 | { 1405 | "cell_type": "code", 1406 | "execution_count": 33, 1407 | "id": "54d9db5e-77e0-4718-8565-b9ddd4f45063", 1408 | "metadata": {}, 1409 | "outputs": [ 1410 | { 1411 | "data": { 1412 | "text/plain": [ 1413 | "TensorShape([32, None, None])" 1414 | ] 1415 | }, 1416 | "execution_count": 33, 1417 | "metadata": {}, 1418 | "output_type": "execute_result" 1419 | } 1420 | ], 1421 | "source": [ 1422 | "train[0:32].shape" 1423 | ] 1424 | }, 1425 | { 1426 | "cell_type": "code", 1427 | "execution_count": 26, 1428 | "id": "57acb8e7-faaa-42d5-a7ca-89b969e20a0f", 1429 | "metadata": {}, 1430 | "outputs": [ 1431 | { 1432 | "data": { 1433 | "text/plain": [ 1434 | "253" 1435 | ] 1436 | }, 1437 | "execution_count": 26, 1438 | "metadata": {}, 1439 | "output_type": "execute_result" 1440 | } 1441 | ], 1442 | "source": [ 1443 | "length=[i[0].shape[0] for i in train]\n", 1444 | "max(length)" 1445 | ] 1446 | }, 1447 | { 1448 | "cell_type": "code", 1449 | "execution_count": 38, 1450 | "id": "38425b96-bac0-4aac-8390-77598f6d39f4", 1451 | "metadata": {}, 1452 | "outputs": [], 1453 | "source": [ 1454 | "dimn=256\n", 1455 | "def encoding(rag_tensor,P,dimn):\n", 1456 | " to_pad=[]\n", 1457 | " for sample in rag_tensor:\n", 1458 | " all_dim=[sample[0].numpy().tolist()]\n", 1459 | " pos_enc=[P[int(i)-1] for i in sample[1].numpy().tolist()]\n", 1460 | " for dim in range(dimn):\n", 1461 | " dim_n=[i[dim] for i in pos_enc]\n", 1462 | " all_dim.append(dim_n)\n", 1463 | " to_pad.append(all_dim)\n", 1464 | " 
to_pad=[tf.keras.preprocessing.sequence.pad_sequences(i,maxlen=501,dtype='float32',padding='post',truncating='post',value=10) for i in to_pad]\n", 1465 | " to_pad=np.stack((to_pad))\n", 1466 | " to_pad=np.swapaxes(to_pad, 1, -1)\n", 1467 | " return to_pad" 1468 | ] 1469 | }, 1470 | { 1471 | "cell_type": "code", 1472 | "execution_count": 38, 1473 | "id": "6aa1bde2-2aa0-4508-bbe9-614fefa5c73b", 1474 | "metadata": {}, 1475 | "outputs": [ 1476 | { 1477 | "data": { 1478 | "text/plain": [ 1479 | "array([ 0.0717 , 0.5332156 , -0.8459794 , 0.01143887, -0.99993455,\n", 1480 | " 0.28827846, -0.95754665, 0.9954703 , 0.09507313, -0.95233387,\n", 1481 | " 0.3050577 , 0.59969896, 0.8002257 , 0.57598245, 0.8174621 ,\n", 1482 | " -0.996885 , -0.07886912, -0.17545353, 0.9844877 , -0.9999982 ,\n", 1483 | " -0.00188863, 0.1089685 , 0.9940452 , -0.98186713, -0.18957031,\n", 1484 | " -0.95267904, 0.30397803, 0.9439646 , 0.33004668, -0.80919385,\n", 1485 | " 0.58754176, -0.9424226 , -0.3344243 , -0.86887133, -0.49503803,\n", 1486 | " -0.94475156, -0.32778722, -0.9721955 , -0.2341707 , -0.7721371 ,\n", 1487 | " -0.63545597, 0.5649307 , -0.82513833, -0.4526213 , 0.89170283,\n", 1488 | " 0.68732804, 0.72634715, -0.56023455, 0.82833403, 0.93247634,\n", 1489 | " 0.36123106, -0.654053 , 0.75644875, -0.6053066 , 0.7959924 ,\n", 1490 | " 0.69717646, 0.7168996 , 0.22841789, -0.9735632 , -0.92402524,\n", 1491 | " 0.38233152, 0.37131587, 0.9285066 , 0.55711097, 0.8304381 ,\n", 1492 | " -0.9599877 , 0.28004214, 0.25998947, 0.96561146, -0.96464807,\n", 1493 | " -0.26354155, -0.84793615, -0.5300983 , -0.9946696 , 0.10311342,\n", 1494 | " -0.6722058 , 0.7403644 , -0.7557188 , 0.6548962 , -0.3556158 ,\n", 1495 | " -0.93463224, -0.9336576 , 0.3581669 , 0.2960882 , -0.9551606 ,\n", 1496 | " 0.90296847, -0.42970684, 0.9975964 , 0.06929295, 0.16034408,\n", 1497 | " 0.9870612 , 0.04580148, -0.99895054, 0.33682257, -0.94156814,\n", 1498 | " -0.90871364, -0.41742012, -0.05195929, 0.9986492 , 0.8410124 ,\n", 1499 | " 0.5410158 , -0.2589479 , 0.9658913 , 0.83173 , 0.5551804 ,\n", 1500 | " -0.07052226, -0.9975102 , -0.99015146, 0.1400004 , -0.904356 ,\n", 1501 | " -0.42677885, -0.22271346, 0.974884 , 0.82213503, -0.5692925 ,\n", 1502 | " -0.22886883, -0.9734573 , 0.99570584, 0.09257355, 0.73236376,\n", 1503 | " 0.6809136 , -0.5831401 , 0.8123716 , 0.97060597, 0.24067426,\n", 1504 | " -0.9765509 , -0.21528676, 0.3335899 , 0.94271827, -0.7904735 ,\n", 1505 | " 0.61249626, -0.99346447, -0.1141419 , 0.9764814 , -0.21560162,\n", 1506 | " 0.5799171 , 0.8146755 , -0.9991347 , -0.04159042, -0.9890586 ,\n", 1507 | " 0.1475232 , -0.989487 , -0.14462198, 0.7041925 , 0.7100091 ,\n", 1508 | " 0.19167444, -0.98145854, 0.99982166, 0.01888514, -0.46084157,\n", 1509 | " 0.8874824 , -0.33013698, 0.943933 , -0.9985991 , -0.05291281,\n", 1510 | " -0.97547275, -0.2201203 , 0.99999547, 0.00300953, -0.9642097 ,\n", 1511 | " -0.26514086, 0.4496711 , -0.8931942 , -0.98045963, -0.19672027,\n", 1512 | " -0.7123622 , 0.701812 , 0.8237492 , 0.56695443, -0.96114033,\n", 1513 | " 0.2760603 , 0.01536015, 0.99988204, -0.27245298, 0.9621691 ,\n", 1514 | " -0.94391733, 0.3301819 , -0.85027707, -0.52633536, -0.6628971 ,\n", 1515 | " -0.7487105 , -0.9762616 , -0.21659462, -0.13901651, 0.99029005,\n", 1516 | " 0.66964203, -0.742684 , -0.7097949 , 0.7044084 , 0.7883553 ,\n", 1517 | " -0.6152202 , -0.930816 , -0.36548817, -0.9543821 , -0.2985879 ,\n", 1518 | " 0.8720212 , 0.4894681 , -0.8894411 , 0.45704985, 0.7674 ,\n", 1519 | " -0.6411686 , 0.5122702 , -0.8588243 , 
-0.46367675, -0.88600445,\n", 1520 | " 0.65450615, -0.75605667, 0.88037896, -0.47427094, -0.86710006,\n", 1521 | " 0.49813405, 0.9908141 , -0.135231 , -0.26741374, 0.9635818 ,\n", 1522 | " 0.7690229 , 0.63922125, 0.03958566, 0.9992162 , -0.89686114,\n", 1523 | " -0.44231218, 0.95714754, 0.28960076, -0.3952544 , -0.9185717 ,\n", 1524 | " -0.86790514, -0.49672997, 0.8436555 , -0.53688484, 0.86669165,\n", 1525 | " -0.49884427, 0.9220753 , -0.3870106 , -0.7610874 , 0.64864933,\n", 1526 | " 0.16541438, -0.9862242 , -0.29848945, -0.95441294, 0.6250325 ,\n", 1527 | " -0.78059876, 0.9744055 , 0.22479734, 0.673075 , 0.7395742 ,\n", 1528 | " 0.6001858 , 0.7998606 , 0.35591003, 0.93452024, -0.9983589 ,\n", 1529 | " 0.05726733, -0.75771755, 0.65258265, -0.563865 , 0.82586694,\n", 1530 | " 0.9665189 , -0.2565955 ], dtype=float32)" 1531 | ] 1532 | }, 1533 | "execution_count": 38, 1534 | "metadata": {}, 1535 | "output_type": "execute_result" 1536 | } 1537 | ], 1538 | "source": [ 1539 | "xtrain[0][1]" 1540 | ] 1541 | }, 1542 | { 1543 | "cell_type": "code", 1544 | "execution_count": 39, 1545 | "id": "adbccd44-0de9-4492-8a39-ac30cd2f29fa", 1546 | "metadata": {}, 1547 | "outputs": [ 1548 | { 1549 | "name": "stdout", 1550 | "output_type": "stream", 1551 | "text": [ 1552 | "CPU times: total: 7.39 s\n", 1553 | "Wall time: 7.39 s\n" 1554 | ] 1555 | } 1556 | ], 1557 | "source": [ 1558 | "%%time\n", 1559 | "xtrain=encoding(train,P,dimn)" 1560 | ] 1561 | }, 1562 | { 1563 | "cell_type": "code", 1564 | "execution_count": 40, 1565 | "id": "56740753-1960-4e35-9cb9-08951cd8aa42", 1566 | "metadata": {}, 1567 | "outputs": [ 1568 | { 1569 | "name": "stdout", 1570 | "output_type": "stream", 1571 | "text": [ 1572 | "CPU times: total: 3.28 s\n", 1573 | "Wall time: 3.29 s\n" 1574 | ] 1575 | } 1576 | ], 1577 | "source": [ 1578 | "%%time\n", 1579 | "np.save('casmi_specs.npy',xtrain)" 1580 | ] 1581 | }, 1582 | { 1583 | "cell_type": "code", 1584 | "execution_count": null, 1585 | "id": "73610487-9ad9-4bdc-a6c6-18f7f72298df", 1586 | "metadata": {}, 1587 | "outputs": [], 1588 | "source": [] 1589 | } 1590 | ], 1591 | "metadata": { 1592 | "kernelspec": { 1593 | "display_name": "Python 3 (ipykernel)", 1594 | "language": "python", 1595 | "name": "python3" 1596 | }, 1597 | "language_info": { 1598 | "codemirror_mode": { 1599 | "name": "ipython", 1600 | "version": 3 1601 | }, 1602 | "file_extension": ".py", 1603 | "mimetype": "text/x-python", 1604 | "name": "python", 1605 | "nbconvert_exporter": "python", 1606 | "pygments_lexer": "ipython3", 1607 | "version": "3.9.15" 1608 | } 1609 | }, 1610 | "nbformat": 4, 1611 | "nbformat_minor": 5 1612 | } 1613 | --------------------------------------------------------------------------------
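The `prepro_specs_train` and `encoding` cells in `preprocessing_onlin-v3_mgf.ipynb` turn each spectrum into a pair of sequences (intensities and integer-scaled m/z values), look every m/z index up in a sinusoidal positional-encoding table, and pad the result to a fixed length before saving it as `casmi_specs.npy`. The sketch below isolates that idea in plain NumPy so it can be read without the DataFrame plumbing; the peak values, the small embedding dimension (8 instead of 256) and the short padded length (6 instead of 501) are made up for illustration, and NumPy padding stands in for the `tf.keras` `pad_sequences` call used in the notebook.

```python
import numpy as np

def positional_encoding(max_position, d_model, min_freq=1e-6):
    # Sinusoidal table with one row per integer m/z position and d_model columns
    # (same formulation as in preprocessing_onlin-v3_mgf.ipynb).
    position = np.arange(max_position)
    freqs = min_freq ** (2 * (np.arange(d_model) // 2) / d_model)
    pos_enc = position.reshape(-1, 1) * freqs.reshape(1, -1)
    pos_enc[:, ::2] = np.cos(pos_enc[:, ::2])
    pos_enc[:, 1::2] = np.sin(pos_enc[:, 1::2])
    return pos_enc

def encode_spectrum(precursor_mz, mzs, intensities, table, max_len=6):
    # Truncate each m/z to two decimals and scale by 100 to get an integer index
    # (the notebook does this via its trun_n_d helper), prepend the precursor
    # with a sentinel intensity of 2.0, and keep the peaks sorted by m/z.
    peaks = sorted(zip(mzs, intensities))
    idx = [int(precursor_mz * 100)] + [int(mz * 100) for mz, _ in peaks]
    ints = [2.0] + [round(i, 4) for _, i in peaks]
    # Look up the positional-encoding row for each m/z index and attach the
    # intensity as the first column.
    rows = np.stack([table[i - 1] for i in idx])              # (n_peaks + 1, d_model)
    feat = np.concatenate([np.array(ints)[:, None], rows], axis=1)
    # Pad (or truncate) to a fixed sequence length; 10 is the padding value the
    # notebook passes to pad_sequences.
    out = np.full((max_len, feat.shape[1]), 10.0, dtype=np.float32)
    n = min(max_len, feat.shape[0])
    out[:n] = feat[:n]
    return out

# Hypothetical three-peak spectrum, purely illustrative.
P = positional_encoding(200000, 8, min_freq=1e2)
x = encode_spectrum(235.1691, [53.0389, 55.0181, 57.0338], [0.07, 0.02, 0.18], P)
print(x.shape)  # (6, 9): one intensity column plus 8 positional-encoding dimensions
```

The first column of each padded row is the intensity (2.0 flags the precursor), and the remaining columns are the sinusoidal embedding of the integer m/z, matching the layout produced by the notebook's `encoding` function before the arrays are stacked and saved.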