├── GPU_encode_smiles.py ├── refine_code_17012023_valid.json ├── README.md ├── composite_math_v2_nist.py ├── composite_math_v2.py ├── refine_code_17012023_train.json ├── cddd_decode.py ├── tcn_seq_train_used.py ├── tcn_seq_train_hp.py ├── mass2smiles_transformer.py ├── molecularformula.ipynb └── preprocessing_onlin-v3_mgf.ipynb /GPU_encode_smiles.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from cddd.inference import InferenceModel 4 | from cddd.preprocessing import preprocess_smiles 5 | 6 | ames_df = pd.read_csv("/home2020/home/ibmp/delser/cddd/nist/all_HRMS_train_24012023_cddd_refine_s.tsv", index_col="spectrum_id",sep="\t") 7 | ames_df["smiles"] = ames_df.smiles_preprocessed.map(preprocess_smiles) 8 | ames_df = ames_df.dropna() 9 | smiles_list = ames_df["smiles"].tolist() 10 | 11 | inference_model = InferenceModel() 12 | print("Encoding now!") 13 | smiles_embedding = inference_model.seq_to_emb(smiles_list) 14 | print("Saving file") 15 | np.save('/home2020/home/ibmp/delser/cddd/nist/cddd_all_HRMS_train_24012023_cddd_refine.npy', smiles_embedding) 16 | 17 | 18 | print("Done!") 19 | 20 | 21 | ames_df = pd.read_csv("/home2020/home/ibmp/delser/cddd/nist/all_HRMS_valid_24012023_cddd_refine_s.tsv", index_col="spectrum_id",sep="\t") 22 | ames_df["smiles"] = ames_df.smiles_preprocessed.map(preprocess_smiles) 23 | ames_df = ames_df.dropna() 24 | smiles_list = ames_df["smiles"].tolist() 25 | 26 | inference_model = InferenceModel() 27 | print("Encoding now!") 28 | smiles_embedding = inference_model.seq_to_emb(smiles_list) 29 | print("Saving file") 30 | np.save('/home2020/home/ibmp/delser/cddd/nist/cddd_all_HRMS_valid_24012023_cddd_refine.npy', smiles_embedding) 31 | 32 | print("Done!") 33 | -------------------------------------------------------------------------------- /refine_code_17012023_valid.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "op": "core/mass-edit", 4 | "engineConfig": { 5 | "facets": [], 6 | "mode": "row-based" 7 | }, 8 | "columnName": "adduct", 9 | "expression": "value", 10 | "edits": [ 11 | { 12 | "from": [ 13 | "[Cat]+" 14 | ], 15 | "fromBlank": false, 16 | "fromError": false, 17 | "to": "[M]+" 18 | } 19 | ], 20 | "description": "Mass edit cells in column adduct" 21 | }, 22 | { 23 | "op": "core/mass-edit", 24 | "engineConfig": { 25 | "facets": [], 26 | "mode": "row-based" 27 | }, 28 | "columnName": "adduct", 29 | "expression": "value", 30 | "edits": [ 31 | { 32 | "from": [ 33 | "[M+H-2H2O]+" 34 | ], 35 | "fromBlank": false, 36 | "fromError": false, 37 | "to": "[M-2H2O+H]+" 38 | } 39 | ], 40 | "description": "Mass edit cells in column adduct" 41 | }, 42 | { 43 | "op": "core/mass-edit", 44 | "engineConfig": { 45 | "facets": [], 46 | "mode": "row-based" 47 | }, 48 | "columnName": "adduct", 49 | "expression": "value", 50 | "edits": [ 51 | { 52 | "from": [ 53 | "[M+H-H2O]+" 54 | ], 55 | "fromBlank": false, 56 | "fromError": false, 57 | "to": "[M-H2O+H]+" 58 | } 59 | ], 60 | "description": "Mass edit cells in column adduct" 61 | }, 62 | { 63 | "op": "core/mass-edit", 64 | "engineConfig": { 65 | "facets": [], 66 | "mode": "row-based" 67 | }, 68 | "columnName": "adduct", 69 | "expression": "value", 70 | "edits": [ 71 | { 72 | "from": [ 73 | "[M+H-NH3]+" 74 | ], 75 | "fromBlank": false, 76 | "fromError": false, 77 | "to": "[M-NH3+H]+" 78 | } 79 | ], 80 | "description": "Mass edit cells in column adduct" 81 | } 82 | ] 
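The JSON above is an OpenRefine operation history that harmonizes adduct notation in the `adduct` column of the validation table (a companion file, `refine_code_17012023_train.json`, is listed for the training table). For reference, here is a minimal pandas sketch, not part of the published pipeline, that applies the same four replacements outside OpenRefine; the TSV file name in the usage comment is only an example taken from the scripts in this repository:

```python
import pandas as pd

# Same from -> to pairs as in refine_code_17012023_valid.json
ADDUCT_EDITS = {
    "[Cat]+": "[M]+",
    "[M+H-2H2O]+": "[M-2H2O+H]+",
    "[M+H-H2O]+": "[M-H2O+H]+",
    "[M+H-NH3]+": "[M-NH3+H]+",
}

def harmonize_adducts(df: pd.DataFrame, column: str = "adduct") -> pd.DataFrame:
    """Apply the OpenRefine mass edits to one column (exact value matches only)."""
    df[column] = df[column].replace(ADDUCT_EDITS)
    return df

# Example usage (file name as used elsewhere in this repo):
# df = pd.read_csv("all_HRMS_valid_24012023_cddd_refine_s.tsv", sep="\t")
# df = harmonize_adducts(df)
```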
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 |
2 | ![logo](https://github.com/volvox292/mass2smiles/assets/63146629/7e5b37dc-534b-4780-b310-45f197283709)
3 |
4 | Mass2SMILES is an open-source, Python-based deep learning approach for structure and functional group prediction from mass spectrometry (MS/MS) data. Spectral data can be provided as MGF files (GNPS-style), and model inference is most efficiently performed via the provided Docker container.
5 |
6 |
7 | Supplementary data with the container and model are available at the DOI below (you must have a valid licence for NIST): [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7883491.svg)](https://doi.org/10.5281/zenodo.7883491)
8 |
9 | A recent update adds Dockerfiles for building two separate containers; adjust them to your needs. The Mass2SMILES model container uses the GPU, whereas cddd does not seem to work with newer CUDA drivers and is therefore
10 | built with TensorFlow CPU; it can be sped up by increasing the number of cores, e.g. InferenceModel(cpu_threads=128). You need to point to your input and output directories; the Mass2SMILES model is now built into the container. With this setup, inference speed is greatly improved. [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.14778327.svg)](https://doi.org/10.5281/zenodo.14778327)
11 |
12 | The pre-print is available at: https://doi.org/10.1101/2023.07.06.547963
13 |
14 | ```bash {bash, echo=T, eval=F}
15 | # the container is available as a tarball in the supplementary data or via: docker pull delser292/mass2smiles:final
16 | # unzip docker.zip; the mass2smiles folder contains the model files and scripts to execute everything, and it is important to specify the path to this folder when starting predictions.
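# note (added): the MGF file must sit inside the folder you mount, since the prediction script joins it with the /app path (see mass2smiles_transformer.py further down in this repo).
# the run writes feature_ids_dataframe.tsv, result_predict.npy, result_predict1.npy and the final predicted_results.tsv back into that same folder.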
17 | 18 | # The predictions can be started through this command: 19 | 20 | docker run -v c:/your_path/to_the_folder/mass2smiles/:/app mass2smiles:transformer_v1 conda run -n tf python app/mass2smiles_transformer.py your_mgf_file.mgf /app 21 | ``` 22 | 23 | The model architecture: 24 | 25 | ![architecture](https://github.com/volvox292/mass2smiles/assets/63146629/3e4313d8-43b2-469d-bab6-c8670a00f62d) 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /composite_math_v2_nist.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from collections import defaultdict 4 | from matchms import Spikes 5 | import math 6 | #import deepchem as dc 7 | #dc.__version__ 8 | from matchms.filtering import add_losses 9 | from matchms.filtering import add_parent_mass 10 | from matchms.filtering import default_filters 11 | from matchms.filtering import normalize_intensities 12 | from matchms.filtering import select_by_intensity 13 | from matchms.filtering import reduce_to_number_of_peaks 14 | from matchms.filtering import require_minimum_number_of_peaks 15 | from matchms.filtering import select_by_mz 16 | from matchms.importing import load_from_mgf 17 | from matchms.exporting import save_as_mgf 18 | from matchms.importing import load_from_msp 19 | 20 | 21 | from matchms.filtering import repair_inchi_inchikey_smiles 22 | from matchms.filtering import derive_inchikey_from_inchi 23 | from matchms.filtering import derive_smiles_from_inchi 24 | from matchms.filtering import derive_inchi_from_smiles 25 | from matchms.filtering import harmonize_undefined_inchi 26 | from matchms.filtering import harmonize_undefined_inchikey 27 | from matchms.filtering import harmonize_undefined_smiles 28 | 29 | import pickle 30 | path_data = "/home/delser/" 31 | outfile = os.path.join(path_data, 'nist_cache_pos.pickle') 32 | with open(outfile, 'rb') as file: 33 | spectrums = pickle.load(file) 34 | 35 | def spectrum_processing(s): 36 | """This is how one would typically design a desired pre- and post- 37 | processing pipeline.""" 38 | s = normalize_intensities(s) 39 | s = select_by_intensity(s, intensity_from=0.01) 40 | s = reduce_to_number_of_peaks(s, n_required=4, n_max=500) 41 | return s 42 | 43 | spectrums = [spectrum_processing(s) for s in spectrums] 44 | spectrums = [s for s in spectrums if s is not None] 45 | 46 | 47 | 48 | 49 | 50 | def compare_update(first_mz_intensity_dict,second_mz_intensity_dict): 51 | modified_dict=first_mz_intensity_dict.copy() 52 | for key in first_mz_intensity_dict: 53 | for key_2 in second_mz_intensity_dict: 54 | if math.isclose(key,key_2,abs_tol=0.005) == False: 55 | mzs_modified_dict=list(modified_dict.keys()) 56 | matches=[math.isclose(i,key_2,abs_tol=0.005) for i in mzs_modified_dict] 57 | if True not in matches: 58 | modified_dict[key_2]=second_mz_intensity_dict[key_2] 59 | else: 60 | if first_mz_intensity_dict[key]= end_pts[i] and oupt <= end_pts[i+1]: 59 | return i 60 | return -1 # fatal error 61 | 62 | funct=['#num_of_sugars',"#Number of aliphatic carboxylic acids", 63 | "#Number of aliphatic hydroxyl groups", 64 | "#Number of aliphatic hydroxyl groups excluding tert-OH", 65 | "#Number of N functional groups attached to aromatics", 66 | "#Number of Aromatic carboxylic acides", 67 | "#Number of aromatic nitrogens", 68 | "#Number of aromatic amines", 69 | "#Number of aromatic hydroxyl groups", 70 | "#Number of carboxylic acids", 71 | "#Number of carboxylic acids", 72 | "#Number 
of carbonyl O", 73 | "#Number of carbonyl O, excluding COOH", 74 | "#Number of thiocarbonyl", 75 | "#Number of Imines", 76 | "#Number of Tertiary amines", 77 | "#Number of Secondary amines", 78 | "#Number of Primary amines", 79 | "#Number of hydroxylamine groups", 80 | "#Number of tert-alicyclic amines (no heteroatoms, not quinine-like bridged N)", 81 | "#Number of H-pyrrole nitrogens", 82 | "#Number of thiol groups", 83 | "#Number of aldehydes", 84 | "#Number of alkyl carbamates (subject to hydrolysis)", 85 | "#Number of alkyl halides", 86 | "#Number of allylic oxidation sites excluding steroid dienone", 87 | "#Number of amides", 88 | "#Number of anilines", 89 | "#Number of aryl methyl sites for hydroxylation", 90 | "#Number of azo groups", 91 | "#Number of benzene rings", 92 | "#Bicyclic", 93 | "#Number of dihydropyridines", 94 | "#Number of epoxide rings", 95 | "#Number of esters", 96 | "#Number of ether oxygens (including phenoxy)", 97 | "#Number of furan rings", 98 | "#Number of guanidine groups", 99 | "#Number of halogens", 100 | "#Number of imidazole rings", 101 | "#Number of isothiocyanates", 102 | "#Number of ketones", 103 | "#Number of ketones excluding diaryl, a,b-unsat. dienones, heteroatom on Calpha", 104 | "#Number of beta lactams", 105 | "#Number of cyclic esters (lactones)", 106 | "#Number of methoxy groups -OCH3", 107 | "#Number of nitriles", 108 | "#Number of nitro groups", 109 | "#Number of oxazole rings", 110 | "#Number of para-hydroxylation sites", 111 | "#Number of phenols", 112 | "#Number of phosphoric acid groups", 113 | "#Number of phosphoric ester groups", 114 | "#Number of piperdine rings", 115 | "#Number of primary amides", 116 | "#Number of pyridine rings", 117 | "#Number of quaternary nitrogens", 118 | "#Number of thioether", 119 | "#Number of thiazole rings", 120 | "#Number of unbranched alkanes of at least 4 members (excludes halogenated alkanes)", 121 | "#adduct_enc","#C","#H", "#O","#N", "#S", "#I", "#Br","#Cl", "#F","#P"] 122 | 123 | all_arr=[] 124 | for sample in x1: 125 | cache=[] 126 | for i in sample: 127 | cache.append(float_oupt_to_class(i,65)) 128 | xn=np.array(cache) 129 | all_arr.append(xn) 130 | 131 | result=np.stack(all_arr) 132 | #result.shape 133 | 134 | df = pd.DataFrame(result, columns = funct) 135 | 136 | df_final = df_final.join(df, how="outer") 137 | 138 | 139 | 140 | 141 | fname10=os.path.join(sys.argv[1],"predicted_results.tsv") 142 | 143 | df_final.to_csv(fname10,sep='\t') 144 | 145 | #df_final.to_csv("/Users/delser/mass2smiles/predicted_results.tsv",sep='\t') -------------------------------------------------------------------------------- /tcn_seq_train_used.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.models import Sequential 2 | from tensorflow.keras.layers import Dense, Dropout, Flatten,MultiHeadAttention 3 | from tensorflow import keras 4 | import tensorflow as tf 5 | import numpy as np 6 | from tcn import TCN 7 | import wandb 8 | from wandb.keras import WandbCallback 9 | import pickle 10 | import os 11 | 12 | wandb.init(project="mass2smiles-tcn_seq") 13 | 14 | 15 | 16 | batch_size=16 17 | n_epochs=50 18 | 19 | 20 | ytr=np.load('/home/delser/train/tcn/cddd_all_HRMS_train_24012023_cddd_refine.npy') 21 | #ytr=np.expand_dims(ytr,-1) 22 | ytr1=np.load('/home/delser/train/tcn/y1_all_HRMS_train_24012023_cddd_mf.npy') 23 | 24 | yval=np.load('/home/delser/train/tcn/cddd_all_HRMS_valid_24012023_cddd_refine.npy') 25 | 
yval1=np.load('/home/delser/train/tcn/y1_all_HRMS_valid_24012023_cddd_mf.npy') 26 | #yval=np.expand_dims(yval,-1) 27 | 28 | xtr=np.load('/home/delser/train/tcn/tcn_train_seq_sin256_2401.npy',mmap_mode='r') 29 | xval =np.load( '/home/delser/train/tcn/tcn_valid_seq_sin256_2401.npy',mmap_mode='r') 30 | 31 | 32 | 33 | class BaseAttention(tf.keras.layers.Layer): 34 | def __init__(self, **kwargs): 35 | super().__init__() 36 | self.mha = tf.keras.layers.MultiHeadAttention(**kwargs) 37 | self.layernorm = tf.keras.layers.LayerNormalization() 38 | self.add = tf.keras.layers.Add() 39 | 40 | class FeedForward(tf.keras.layers.Layer): 41 | def __init__(self, d_model, dff, dropout_rate=0.1): 42 | super().__init__() 43 | self.seq = tf.keras.Sequential([ 44 | tf.keras.layers.Dense(dff, activation='relu'), 45 | tf.keras.layers.Dense(d_model), 46 | tf.keras.layers.Dropout(dropout_rate) 47 | ]) 48 | self.add = tf.keras.layers.Add() 49 | self.layer_norm = tf.keras.layers.LayerNormalization() 50 | 51 | def call(self, x): 52 | x = self.add([x, self.seq(x)]) 53 | x = self.layer_norm(x) 54 | return x 55 | 56 | class GlobalSelfAttention(BaseAttention): 57 | def call(self, x): 58 | attn_output = self.mha( 59 | query=x, 60 | value=x, 61 | key=x) 62 | x = self.add([x, attn_output]) 63 | x = self.layernorm(x) 64 | return x 65 | 66 | class EncoderLayer(tf.keras.layers.Layer): 67 | def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1): 68 | super().__init__() 69 | 70 | self.self_attention = GlobalSelfAttention( 71 | num_heads=num_heads, 72 | key_dim=d_model, 73 | dropout=dropout_rate) 74 | 75 | self.ffn = FeedForward(d_model, dff) 76 | 77 | def call(self, x): 78 | x = self.self_attention(x) 79 | x = self.ffn(x) 80 | return x 81 | 82 | 83 | class DataGenerator(keras.utils.Sequence): 84 | def __init__(self, x_data, y_data,y1_data, batch_size): 85 | self.x, self.y, self.y1 = x_data, y_data, y1_data, 86 | self.batch_size = batch_size 87 | self.num_batches = np.ceil(len(x_data) / batch_size) 88 | self.batch_idx = np.array_split(range(len(x_data)), self.num_batches) 89 | 90 | def __len__(self): 91 | return len(self.batch_idx) 92 | 93 | def __getitem__(self, idx): 94 | batch_x = self.x[self.batch_idx[idx]] 95 | batch_y = self.y[self.batch_idx[idx]] 96 | batch_y1 = self.y1[self.batch_idx[idx]] 97 | return batch_x, [batch_y,batch_y1] 98 | 99 | train_generator = DataGenerator(xtr, ytr,ytr1, batch_size = 16) 100 | 101 | 102 | def call_existing_code(units, heads, dropout,dense_dropout,lr,filters,num_layers): 103 | tcn=TCN( 104 | nb_filters=filters, 105 | kernel_size=8, 106 | dilations=[2 ** i for i in range(6)], 107 | use_skip_connections=True, 108 | use_layer_norm=True, 109 | kernel_initializer='glorot_uniform', 110 | go_backwards=True,) 111 | print(f'TCN.receptive_field: {tcn.receptive_field}.') 112 | input0=tf.keras.Input(shape=(501,257)) 113 | input1=tf.keras.layers.Masking(mask_value=10,input_shape=(501,257))(input0) 114 | att = Sequential([ 115 | EncoderLayer(d_model=257, num_heads=heads, dff=units,dropout_rate=dropout) 116 | for _ in range(num_layers)])(input1) 117 | hd_tcn=tcn(att) 118 | output_b=Sequential([ 119 | Dropout(rate=dense_dropout), 120 | Dense(128, activation='tanh'), 121 | Dropout(rate=dense_dropout), 122 | Dense(71, activation='sigmoid')],name="funct_groups")(hd_tcn) 123 | output_a = Sequential([ 124 | Dropout(rate=dense_dropout), 125 | Dense(512, activation='relu'), 126 | Dropout(rate=dense_dropout), 127 | Dense(512, activation='linear') 128 | ],name="smiles")(hd_tcn) 129 | model= 
tf.keras.Model(inputs=input0, outputs=[output_a,output_b]) 130 | model.compile(loss={"smiles":'mean_absolute_error',"funct_groups":'mse'}, optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),metrics={"smiles":'mean_squared_error',"funct_groups":'mean_absolute_error'}) 131 | 132 | 133 | return model 134 | 135 | def build_model(): 136 | num_layers = 5 137 | units = 2048 138 | heads = 16 139 | dropout = 0.1 140 | dense_dropout = 0.1 141 | filters=256 142 | #activation = hp.Choice("activation", ["relu", "tanh"]) 143 | #dropout = hp.Boolean("dropout") 144 | #lr = hp.Float("lr", min_value=1e-6, max_value=1e-4, sampling="log") 145 | # call existing model-building code with the hyperparameter values. 146 | #model = call_existing_code(units=units, heads=heads, dropout=dropout,dense_dropout=dense_dropout,lr=lr,filters=filters) 147 | model = call_existing_code(units=units, heads=heads, dropout=dropout,dense_dropout=dense_dropout,lr=1e-4,filters=filters, num_layers =num_layers) 148 | return model 149 | 150 | 151 | # Compile and train. 152 | 153 | model= build_model() 154 | model.summary() 155 | model.fit(train_generator,validation_data=(xval,[yval,yval1]), epochs=n_epochs,shuffle=False,batch_size=None,validation_batch_size=16,callbacks=[WandbCallback(log_batch_frequency=1)]) 156 | model.save_weights('/home/delser/train/tcn/model') 157 | del model 158 | model= build_model() 159 | model.load_weights('/home/delser/train/tcn/model') 160 | result= model.predict(xval) 161 | #np.save("/home/delser/train/tcn/val_predict.npy", result) 162 | np.save("/home/delser/train/tcn/val_predict.npy", result[0]) 163 | np.save("/home/delser/train/tcn/val_predict1.npy", result[1]) 164 | 165 | print('done!') -------------------------------------------------------------------------------- /tcn_seq_train_hp.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.models import Sequential 2 | from tensorflow.keras.layers import Dense, Dropout, Flatten,MultiHeadAttention 3 | from tensorflow import keras 4 | import tensorflow as tf 5 | import numpy as np 6 | from tcn import TCN 7 | #import wandb 8 | #from wandb.keras import WandbCallback 9 | import pickle 10 | import os 11 | import keras_tuner 12 | 13 | #wandb.init(project="mass2smiles-tcn_seq") 14 | 15 | # Run before every test for reproducibility 16 | 17 | def seed_all(): 18 | 19 | np.random.seed(123) 20 | 21 | tf.random.set_seed(123) 22 | 23 | seed_all() 24 | 25 | 26 | batch_size=16 27 | #n_epochs=100 28 | 29 | 30 | ytr=np.load('/home/delser/train/tcn/cddd_all_HRMS_train_24012023_cddd_refine.npy') 31 | #ytr=np.expand_dims(ytr,-1) 32 | #ytr1=np.load('/home/delser/train/tcn/y1_all_HRMS_train_24012023_cddd.npy') 33 | 34 | yval=np.load('/home/delser/train/tcn/cddd_all_HRMS_valid_24012023_cddd_refine.npy') 35 | #yval1=np.load('/home/delser/train/tcn/y1_all_HRMS_valid_24012023_cddd.npy') 36 | #yval=np.expand_dims(yval,-1) 37 | 38 | xtr=np.load('/home/delser/train/tcn/tcn_train_seq_sin256_2401.npy',mmap_mode='r') 39 | xval =np.load( '/home/delser/train/tcn/tcn_valid_seq_sin256_2401.npy',mmap_mode='r') 40 | 41 | 42 | 43 | class BaseAttention(tf.keras.layers.Layer): 44 | def __init__(self, **kwargs): 45 | super().__init__() 46 | self.mha = tf.keras.layers.MultiHeadAttention(**kwargs) 47 | self.layernorm = tf.keras.layers.LayerNormalization() 48 | self.add = tf.keras.layers.Add() 49 | 50 | class FeedForward(tf.keras.layers.Layer): 51 | def __init__(self, d_model, dff, dropout_rate=0.1): 52 | super().__init__() 53 | self.seq = 
tf.keras.Sequential([ 54 | tf.keras.layers.Dense(dff, activation='relu'), 55 | tf.keras.layers.Dense(d_model), 56 | tf.keras.layers.Dropout(dropout_rate) 57 | ]) 58 | self.add = tf.keras.layers.Add() 59 | self.layer_norm = tf.keras.layers.LayerNormalization() 60 | 61 | def call(self, x): 62 | x = self.add([x, self.seq(x)]) 63 | x = self.layer_norm(x) 64 | return x 65 | 66 | class GlobalSelfAttention(BaseAttention): 67 | def call(self, x): 68 | attn_output = self.mha( 69 | query=x, 70 | value=x, 71 | key=x) 72 | x = self.add([x, attn_output]) 73 | x = self.layernorm(x) 74 | return x 75 | 76 | class EncoderLayer(tf.keras.layers.Layer): 77 | def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1): 78 | super().__init__() 79 | 80 | self.self_attention = GlobalSelfAttention( 81 | num_heads=num_heads, 82 | key_dim=d_model, 83 | dropout=dropout_rate) 84 | 85 | self.ffn = FeedForward(d_model, dff) 86 | 87 | def call(self, x): 88 | x = self.self_attention(x) 89 | x = self.ffn(x) 90 | return x 91 | 92 | class DataGenerator(keras.utils.Sequence): 93 | def __init__(self, x_data, y_data, batch_size): 94 | self.x, self.y = x_data, y_data 95 | self.batch_size = batch_size 96 | self.num_batches = np.ceil(len(x_data) / batch_size) 97 | self.batch_idx = np.array_split(range(len(x_data)), self.num_batches) 98 | 99 | def __len__(self): 100 | return len(self.batch_idx) 101 | 102 | def __getitem__(self, idx): 103 | batch_x = self.x[self.batch_idx[idx]] 104 | batch_y = self.y[self.batch_idx[idx]] 105 | return batch_x, batch_y 106 | 107 | train_generator = DataGenerator(xtr, ytr, batch_size = 16) 108 | 109 | 110 | 111 | def call_existing_code(units, heads, dropout,dense_dropout,lr,filters,num_layers): 112 | tcn=TCN( 113 | nb_filters=filters, 114 | kernel_size=8, 115 | dilations=[2 ** i for i in range(6)], 116 | use_skip_connections=True, 117 | use_layer_norm=True, 118 | kernel_initializer='glorot_uniform', 119 | go_backwards=True,) 120 | print(f'TCN.receptive_field: {tcn.receptive_field}.') 121 | input0=tf.keras.Input(shape=(501,257)) 122 | input1=tf.keras.layers.Masking(mask_value=10,input_shape=(501,257))(input0) 123 | att = Sequential([ 124 | EncoderLayer(d_model=257, num_heads=heads, dff=units,dropout_rate=dropout) 125 | for _ in range(num_layers)])(input1) 126 | hd_tcn=tcn(att) 127 | output_a = Sequential([ 128 | Dropout(rate=dense_dropout), 129 | Dense(512, activation='relu'), 130 | Dropout(rate=dense_dropout), 131 | Dense(512, activation='linear') 132 | ],name="smiles")(hd_tcn) 133 | model= tf.keras.Model(inputs=input0, outputs=output_a) 134 | model.compile(loss={"smiles":'mean_absolute_error'}, optimizer=tf.keras.optimizers.Adam(learning_rate=lr),metrics={"smiles":'mean_squared_error'}) 135 | 136 | return model 137 | 138 | def build_model(hp): 139 | num_layers = hp.Int("num_layers", min_value=2, max_value=6, step=2) 140 | units = hp.Int("units", min_value=512, max_value=2048, step=512) 141 | heads = hp.Int("heads", min_value=8, max_value=32, step=8) 142 | dropout = hp.Float("dropout", min_value=0.1, max_value=0.5, step=0.1) 143 | dense_dropout = hp.Float("dense_dropout", min_value=0.1, max_value=0.5, step=0.1) 144 | filters=hp.Int("filters", min_value=128, max_value=512, step=128) 145 | #activation = hp.Choice("activation", ["relu", "tanh"]) 146 | #dropout = hp.Boolean("dropout") 147 | #lr = hp.Float("lr", min_value=1e-6, max_value=1e-4, sampling="log") 148 | # call existing model-building code with the hyperparameter values. 
149 | #model = call_existing_code(units=units, heads=heads, dropout=dropout,dense_dropout=dense_dropout,lr=lr,filters=filters) 150 | model = call_existing_code(units=units, heads=heads, dropout=dropout,dense_dropout=dense_dropout,lr=1e-4,filters=filters, num_layers =num_layers) 151 | return model 152 | 153 | tuner = keras_tuner.RandomSearch( 154 | hypermodel=build_model, 155 | objective=keras_tuner.Objective("val_mean_squared_error", direction="min"), 156 | max_trials=99, 157 | executions_per_trial=1, 158 | overwrite=True, 159 | directory="my_dir", 160 | project_name="helloworld", 161 | ) 162 | 163 | tuner.search_space_summary() 164 | tuner.search(train_generator, epochs=4, validation_data=(xval,yval),batch_size=None,validation_batch_size=16) 165 | #model.compile(loss={"smiles":'mean_absolute_error',"funct_groups":'mse'}, optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),metrics={"smiles":'mean_squared_error',"funct_groups":'mean_absolute_error'}) 166 | #model.summary() 167 | #model.fit(train_generator,validation_data=(xval,[yval,yval1]), epochs=n_epochs,shuffle=False,batch_size=None,validation_batch_size=16,callbacks=[WandbCallback(log_batch_frequency=1)]) 168 | #result= model.predict(xval) 169 | #np.save("/home/delser/train/tcn/val_predict.npy", result[0]) 170 | #np.save("/home/delser/train/tcn/val_predict1.npy", result[1]) 171 | 172 | #model.save('/home/delser/train/tcn') 173 | 174 | print('done!') -------------------------------------------------------------------------------- /mass2smiles_transformer.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | import json 4 | import sys 5 | import os 6 | import subprocess 7 | import time 8 | start_time = time.time() 9 | import numpy as np 10 | import pandas as pd 11 | import pickle 12 | from matchms import set_matchms_logger_level 13 | import pandas as pd 14 | set_matchms_logger_level("ERROR") 15 | from matchms.filtering import add_losses 16 | from matchms.filtering import add_parent_mass 17 | from matchms.filtering import default_filters 18 | from matchms.filtering import normalize_intensities 19 | from matchms.filtering import select_by_intensity 20 | from matchms.filtering import reduce_to_number_of_peaks 21 | from matchms.filtering import require_minimum_number_of_peaks 22 | from matchms.filtering import select_by_mz 23 | from matchms.importing import load_from_mgf 24 | from matchms.exporting import save_as_mgf 25 | from matchms.importing import load_from_msp 26 | from matchms.filtering import repair_inchi_inchikey_smiles 27 | from matchms.filtering import derive_inchikey_from_inchi 28 | from matchms.filtering import derive_smiles_from_inchi 29 | from matchms.filtering import derive_inchi_from_smiles 30 | from matchms.filtering import harmonize_undefined_inchi 31 | from matchms.filtering import harmonize_undefined_inchikey 32 | from matchms.filtering import harmonize_undefined_smiles 33 | from tensorflow.keras.models import Sequential 34 | from tensorflow.keras.layers import Dense, Dropout 35 | import tensorflow as tf 36 | from tcn import TCN 37 | from tensorflow import keras 38 | from keras.initializers import glorot_uniform 39 | ##################### parse mfg and convert to df ######################## 40 | print('parsing specs now') 41 | def spectrum_processing(s): 42 | """This is how one would typically design a desired pre- and post- 43 | processing pipeline.""" 44 | s = default_filters(s) 45 | s = add_parent_mass(s) 46 | s = normalize_intensities(s) 47 | s = 
select_by_intensity(s, intensity_from=0.01) 48 | s = reduce_to_number_of_peaks(s, n_required=5, n_max=250) 49 | s = select_by_mz(s, mz_from=15, mz_to=2000) 50 | s = add_losses(s, loss_mz_from=15.0, loss_mz_to=350.0) 51 | s = require_minimum_number_of_peaks(s, n_required=5) 52 | return s 53 | 54 | 55 | 56 | def metadata_processing(spectrum): 57 | spectrum = default_filters(spectrum) 58 | spectrum = repair_inchi_inchikey_smiles(spectrum) 59 | spectrum = derive_inchi_from_smiles(spectrum) 60 | spectrum = derive_smiles_from_inchi(spectrum) 61 | spectrum = derive_inchikey_from_inchi(spectrum) 62 | spectrum = harmonize_undefined_smiles(spectrum) 63 | spectrum = harmonize_undefined_inchi(spectrum) 64 | spectrum = harmonize_undefined_inchikey(spectrum) 65 | return spectrum 66 | # Load data from MGF file and apply filters 67 | 68 | 69 | #path_data = # enter path to downloaded mgf file 70 | file_mgf = os.path.join(sys.argv[2], 71 | sys.argv[1]) 72 | spectrums = list(load_from_mgf(file_mgf)) 73 | 74 | spectrums = [metadata_processing(s) for s in spectrums] 75 | spectrums = [spectrum_processing(s) for s in spectrums] 76 | #spectrums = [spectrum_processing(s) for s in load_from_mgf("/Users/delser/Desktop/PhD/Phytochemistry/NP-Databases/CFM-4_DB/TOTAL_COMPOUNDS_DB.energies_merged_name.mgf")] 77 | #spectrums = [spectrum_processing(s) for s in load_from_mgf("/Users/delser/Desktop/PhD/Phytochemistry/FBMN/alltissues/altissues15072021-py.mgf")] 78 | # Omit spectrums that didn't qualify for analysis 79 | spectrums = [s for s in spectrums if s is not None] 80 | 81 | precs = [] 82 | IDs = [] 83 | mzs=[] 84 | ints=[] 85 | loss_mzs=[] 86 | loss_ints=[] 87 | 88 | 89 | for spec in spectrums: 90 | IDs.append(spec.get("feature_id")) 91 | precs.append(spec.get("precursor_mz")) 92 | mzs.append(list(spec.peaks.mz)) 93 | ints.append(list(spec.peaks.intensities)) 94 | loss_mzs.append(list(spec.losses.mz)) 95 | loss_ints.append(list(spec.losses.intensities)) 96 | 97 | metadata = pd.DataFrame(list(zip(IDs, precs,mzs,ints,loss_mzs,loss_ints)), columns=["feature_id", "precursor_mz","mzs","intensities","loss_mzs","loss_intensities" ]) 98 | fname2=os.path.join(sys.argv[2],'feature_ids_dataframe.tsv') 99 | metadata.to_csv(fname2,sep='\t') 100 | print("done!") 101 | ##################### encode specs ######################## 102 | print('encoding specs now') 103 | def positional_encoding(max_position, d_model, min_freq=1e-6): 104 | position = np.arange(max_position) 105 | freqs = min_freq**(2*(np.arange(d_model)//2)/d_model) 106 | pos_enc = position.reshape(-1,1)*freqs.reshape(1,-1) 107 | pos_enc[:, ::2] = np.cos(pos_enc[:, ::2]) 108 | pos_enc[:, 1::2] = np.sin(pos_enc[:, 1::2]) 109 | return pos_enc 110 | 111 | def trun_n_d(n,d): 112 | return ( n if not n.find('.')+1 else n[:n.find('.')+d+1] ) 113 | 114 | P=positional_encoding(200000,256, min_freq=1e2) 115 | 116 | def prepro_specs_train(df): 117 | valid=[] 118 | precs=df['precursor_mz'].to_list() 119 | mzs=df['mzs'].to_list() 120 | ints=df['intensities'].to_list() 121 | loss_mzs=df['loss_mzs'].to_list() 122 | loss_ints=df['loss_intensities'].to_list() 123 | for one_pre,one_mzs,one_ints,one_loss,one_loss_ints in zip(precs,mzs,ints,loss_mzs,loss_ints): 124 | mz_list=[round(float(trun_n_d(str(one_pre),2))*100)] # add precursor mz 125 | intes_list=[2.0] # add precursor int 126 | res = dict(zip(one_mzs+one_loss, one_ints+one_loss_ints)) # order by mzs 127 | res=dict(sorted(res.items())) 128 | for m,i in zip(list(res.keys()), list(res.values())): # change this from mgf from matchms 129 
| mz=round(float(trun_n_d(str(m),2))*100) 130 | mz_list.append(mz) 131 | intens=round(i,4) 132 | intes_list.append(intens) 133 | int_mzs=[intes_list,mz_list] 134 | valid.append(int_mzs) # put intesities at first 135 | return tf.ragged.constant(valid) 136 | 137 | train=prepro_specs_train(metadata) 138 | 139 | dimn=256 140 | def encoding(rag_tensor,P,dimn): 141 | to_pad=[] 142 | for sample in rag_tensor: 143 | all_dim=[sample[0].numpy().tolist()] 144 | pos_enc=[P[int(i)-1] for i in sample[1].numpy().tolist()] 145 | for dim in range(dimn): 146 | dim_n=[i[dim] for i in pos_enc] 147 | all_dim.append(dim_n) 148 | to_pad.append(all_dim) 149 | to_pad=[tf.keras.preprocessing.sequence.pad_sequences(i,maxlen=501,dtype='float32',padding='post',truncating='post',value=10) for i in to_pad] 150 | to_pad=np.stack((to_pad)) 151 | to_pad=np.swapaxes(to_pad, 1, -1) 152 | return to_pad 153 | 154 | xtrain=encoding(train,P,dimn) 155 | print("done!") 156 | #xval=np.load('/home/delser/train/tcn/casmi_specs.npy') 157 | class BaseAttention(tf.keras.layers.Layer): 158 | def __init__(self, **kwargs): 159 | super().__init__() 160 | self.mha = tf.keras.layers.MultiHeadAttention(**kwargs) 161 | self.layernorm = tf.keras.layers.LayerNormalization() 162 | self.add = tf.keras.layers.Add() 163 | 164 | class FeedForward(tf.keras.layers.Layer): 165 | def __init__(self, d_model, dff, dropout_rate=0.1): 166 | super().__init__() 167 | self.seq = tf.keras.Sequential([ 168 | tf.keras.layers.Dense(dff, activation='relu'), 169 | tf.keras.layers.Dense(d_model), 170 | tf.keras.layers.Dropout(dropout_rate) 171 | ]) 172 | self.add = tf.keras.layers.Add() 173 | self.layer_norm = tf.keras.layers.LayerNormalization() 174 | 175 | def call(self, x): 176 | x = self.add([x, self.seq(x)]) 177 | x = self.layer_norm(x) 178 | return x 179 | 180 | class GlobalSelfAttention(BaseAttention): 181 | def call(self, x): 182 | attn_output = self.mha( 183 | query=x, 184 | value=x, 185 | key=x) 186 | x = self.add([x, attn_output]) 187 | x = self.layernorm(x) 188 | return x 189 | 190 | class EncoderLayer(tf.keras.layers.Layer): 191 | def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1): 192 | super().__init__() 193 | 194 | self.self_attention = GlobalSelfAttention( 195 | num_heads=num_heads, 196 | key_dim=d_model, 197 | dropout=dropout_rate) 198 | 199 | self.ffn = FeedForward(d_model, dff) 200 | 201 | def call(self, x): 202 | x = self.self_attention(x) 203 | x = self.ffn(x) 204 | return x 205 | 206 | 207 | class DataGenerator(keras.utils.Sequence): 208 | def __init__(self, x_data, y_data,y1_data, batch_size): 209 | self.x, self.y, self.y1 = x_data, y_data, y1_data, 210 | self.batch_size = batch_size 211 | self.num_batches = np.ceil(len(x_data) / batch_size) 212 | self.batch_idx = np.array_split(range(len(x_data)), self.num_batches) 213 | 214 | def __len__(self): 215 | return len(self.batch_idx) 216 | 217 | def __getitem__(self, idx): 218 | batch_x = self.x[self.batch_idx[idx]] 219 | batch_y = self.y[self.batch_idx[idx]] 220 | batch_y1 = self.y1[self.batch_idx[idx]] 221 | return batch_x, [batch_y,batch_y1] 222 | 223 | 224 | 225 | 226 | def call_existing_code(units, heads, dropout,dense_dropout,lr,filters,num_layers): 227 | tcn=TCN( 228 | nb_filters=filters, 229 | kernel_size=8, 230 | dilations=[2 ** i for i in range(6)], 231 | use_skip_connections=True, 232 | use_layer_norm=True, 233 | kernel_initializer='glorot_uniform', 234 | go_backwards=True,) 235 | print(f'TCN.receptive_field: {tcn.receptive_field}.') 236 | 
input0=tf.keras.Input(shape=(501,257)) 237 | input1=tf.keras.layers.Masking(mask_value=10,input_shape=(501,257))(input0) 238 | att = Sequential([ 239 | EncoderLayer(d_model=257, num_heads=heads, dff=units,dropout_rate=dropout) 240 | for _ in range(num_layers)])(input1) 241 | hd_tcn=tcn(att) 242 | output_b=Sequential([ 243 | Dropout(rate=dense_dropout), 244 | Dense(128, activation='tanh'), 245 | Dropout(rate=dense_dropout), 246 | Dense(71, activation='sigmoid')],name="funct_groups")(hd_tcn) 247 | output_a = Sequential([ 248 | Dropout(rate=dense_dropout), 249 | Dense(512, activation='relu'), 250 | Dropout(rate=dense_dropout), 251 | Dense(512, activation='linear') 252 | ],name="smiles")(hd_tcn) 253 | model= tf.keras.Model(inputs=input0, outputs=[output_a,output_b]) 254 | model.compile(loss={"smiles":'mean_absolute_error',"funct_groups":'mse'}, optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),metrics={"smiles":'mean_squared_error',"funct_groups":'mean_absolute_error'}) 255 | 256 | 257 | return model 258 | 259 | def build_model(): 260 | num_layers = 5 261 | units = 2048 262 | heads = 16 263 | dropout = 0.1 264 | dense_dropout = 0.1 265 | filters=256 266 | #activation = hp.Choice("activation", ["relu", "tanh"]) 267 | #dropout = hp.Boolean("dropout") 268 | #lr = hp.Float("lr", min_value=1e-6, max_value=1e-4, sampling="log") 269 | # call existing model-building code with the hyperparameter values. 270 | #model = call_existing_code(units=units, heads=heads, dropout=dropout,dense_dropout=dense_dropout,lr=lr,filters=filters) 271 | model = call_existing_code(units=units, heads=heads, dropout=dropout,dense_dropout=dense_dropout,lr=1e-4,filters=filters, num_layers =num_layers) 272 | return model 273 | 274 | 275 | 276 | ##################### predict and decode ######################## 277 | model=build_model() 278 | model.load_weights(os.path.normpath(sys.argv[2]+"/misunderstood-fire-207/model")) 279 | #model = keras.models.load_model(os.path.normpath(sys.argv[2]+"/upbeat-puddle-198"), custom_objects={'TCN': TCN, 'GlorotUniform': glorot_uniform()}) 280 | #model.summary() 281 | result= model.predict(xtrain) 282 | np.save(os.path.join(sys.argv[2],"result_predict.npy"), result[0]) 283 | np.save(os.path.join(sys.argv[2],"result_predict1.npy"), result[1]) 284 | 285 | 286 | print('predict with transformer done!') 287 | 288 | ###### cddd decode predictions ##### 289 | print("decode embeddings now!") 290 | x=subprocess.check_output(['conda', 'run','-n', 'cddd', 'python', 'app/cddd_decode.py',sys.argv[2]]) 291 | print(x.decode('ascii')) 292 | 293 | print("done!") 294 | print("Everything was successfully predicted!") 295 | print("Everything was successfully predicted in --- %s minutes --- to the file predicted_results.tsv" % ((time.time() - start_time)/60)) 296 | -------------------------------------------------------------------------------- /molecularformula.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "53aeece7-a596-411f-9e1f-3f3e5ba0d354", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from tqdm import tqdm\n", 11 | "import tensorflow as tf\n", 12 | "import json\n", 13 | "import tensorflow_text as text\n", 14 | "import os\n", 15 | "import numpy as np\n", 16 | "import pandas as pd\n", 17 | "import pickle" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "id": "e8baf966-5ead-4951-a254-af7b4b301d5c", 24 | "metadata": {}, 25 | 
"outputs": [], 26 | "source": [ 27 | "from rdkit import DataStructs\n", 28 | "from rdkit import Chem\n", 29 | "from rdkit.Chem.rdMolDescriptors import CalcMolFormula\n", 30 | "#m_true=Chem.MolFromSmiles(df_final[\"decoded_test\"][1])\n", 31 | "#m_predict=Chem.MolFromSmiles(df_final[\"predicted\"][1])\n", 32 | "#ms=[m_true,m_predict]\n", 33 | "#fps = [Chem.RDKFingerprint(x) for x in ms]\n", 34 | "#DataStructs.FingerprintSimilarity(fps[0],fps[1])\n", 35 | "\n", 36 | "def get_mf(smiles1):\n", 37 | " try:\n", 38 | " m_true=Chem.MolFromSmiles(smiles1)\n", 39 | " mf=CalcMolFormula(m_true)\n", 40 | " return mf\n", 41 | " except:\n", 42 | " return \"no_prediction_or_error_with_parsing_by_rdkit\"\n", 43 | " " 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "id": "4c5b299b-56ac-42b3-bd6c-ec3c01b46a39", 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "text/html": [ 55 | "
\n", 56 | "\n", 69 | "\n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " 
\n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | "
Unnamed: 0spectrum_idprecursor_mzmzsintensitiesloss_mzsloss_intensitiessmiles_preprocessednum_of_sugarsNumber of aliphatic carboxylic acids...Number of phenolsNumber of phosphoric acid groupsNumber of phosphoric ester groupsNumber of piperdine ringsNumber of primary amidesNumber of pyridine ringsNumber of quaternary nitrogensNumber of thioetherNumber of thiazole ringsNumber of unbranched alkanes of at least 4 members (excludes halogenated alkanes)
003719.2538[53.0387, 55.0179, 55.0545, 57.0338, 59.0492, ...[0.017272727272727273, 0.01818181818181818, 0....[314.15709999999996][0.01818181818181818]CC1C(C(C(C(O1)OC2=C(OC3=C(C(=CC(=C3C2=O)O)O)CC...2.00...2000000000
115499.2298[67.0543, 69.0698, 81.0699, 83.0492, 83.0853, ...[0.010357142857142856, 0.04642857142857143, 0....[60.02260000000001, 118.02660000000003, 160.03...[0.060714285714285714, 0.39285714285714285, 0....CC1CC2(C(C1O)C=C(C(CC3C(C3(C)C)C=C(C2=O)C)OC(=...0.01...0000000000
2261102.5777[81.0334, 83.0491, 85.0224, 85.0284, 85.0333, ...[0.14210526315789473, 0.2236842105263158, 0.01...[][]CC1C(C(C(C(O1)OC2C(OC(C(C2O)O)OCC3C(C(C(C(O3)O...4.00...0000000000
3310472.2082[72.2736, 145.0759, 148.0868, 149.0707, 172.06...[0.010697674418604652, 0.03488372093023256, 0....[135.07919999999996, 148.0632, 165.08959999999...[0.053488372093023255, 0.10465116279069768, 0....CN1C2=C(C=C(C=C2)C(=O)N(CCC(=O)O)C3=CC=CC=N3)N...0.01...0000010000
4411657.3116[55.0542, 55.9818, 57.0334, 60.2456, 69.0335, ...[0.018260869565217393, 0.013043478260869565, 0...[158.094, 210.1282, 228.1352, 246.1485, 280.16...[0.02217391304347826, 0.06086956521739131, 0.1...CCCC(=O)OCC(C(C(CN1C2=C(C=C(C(=C2)C)C)N=C3C1=N...1.00...0000000000
..................................................................
237237490268.1541[86.0599, 109.065, 109.1013, 121.101, 123.1168...[0.10689655172413794, 0.04482758620689655, 0.0...[18.01030000000003, 36.02120000000002, 46.0054...[0.21724137931034482, 0.06206896551724138, 1.0...CC1CCC2C1C(=O)OC(C2C)NC(=O)C3C(O3)C0.00...0000000000
238238491411.3254[55.0544, 57.0701, 67.0544, 69.0699, 81.0699, ...[0.0215625, 0.053125, 0.015, 0.34375, 0.046875...[18.01060000000001, 88.08850000000001, 142.135...[0.153125, 0.02125, 0.01875, 0.02625, 0.028437...CC(C)C(C)C=CC(C)C1CCC2C1(CCC3=C2C(=O)C=C4C3(CC...0.00...0000000000
239239492430.2432[50.4192, 52.9216, 74.1355, 81.0699, 86.4846, ...[0.05555555555555555, 0.058333333333333334, 0....[98.63150000000002, 179.0794, 197.0896, 214.55...[0.06666666666666667, 0.2777777777777778, 1.0,...CC1CCC(C2(C1=CC(CC2)C(=C)C(=O)O)C)OC3C(C(C(C(O...1.01...0000000000
240240495578.2076[54.5814, 57.0338, 61.0286, 69.0335, 81.0331, ...[0.01, 0.010769230769230769, 0.038461538461538...[160.76359999999994][0.011923076923076923]COC1=C2C(=CC(=C1OC3C(C(C(C(O3)CO)O)O)OC4C(C(C(...2.01...0000000000
241241500243.1014[71.9563, 104.546, 105.0698, 107.0491, 107.056...[0.015833333333333335, 0.018333333333333333, 0...[24.813600000000008, 42.197700000000026, 94.04...[0.025833333333333333, 0.0175, 0.065, 1.0, 0.6...C1C(COC2=C1C=CC(=C2)O)C3=CC=C(C=C3)O0.00...2000000000
\n", 363 | "

242 rows × 68 columns

\n", 364 | "
" 365 | ], 366 | "text/plain": [ 367 | " Unnamed: 0 spectrum_id precursor_mz \\\n", 368 | "0 0 3 719.2538 \n", 369 | "1 1 5 499.2298 \n", 370 | "2 2 6 1102.5777 \n", 371 | "3 3 10 472.2082 \n", 372 | "4 4 11 657.3116 \n", 373 | ".. ... ... ... \n", 374 | "237 237 490 268.1541 \n", 375 | "238 238 491 411.3254 \n", 376 | "239 239 492 430.2432 \n", 377 | "240 240 495 578.2076 \n", 378 | "241 241 500 243.1014 \n", 379 | "\n", 380 | " mzs \\\n", 381 | "0 [53.0387, 55.0179, 55.0545, 57.0338, 59.0492, ... \n", 382 | "1 [67.0543, 69.0698, 81.0699, 83.0492, 83.0853, ... \n", 383 | "2 [81.0334, 83.0491, 85.0224, 85.0284, 85.0333, ... \n", 384 | "3 [72.2736, 145.0759, 148.0868, 149.0707, 172.06... \n", 385 | "4 [55.0542, 55.9818, 57.0334, 60.2456, 69.0335, ... \n", 386 | ".. ... \n", 387 | "237 [86.0599, 109.065, 109.1013, 121.101, 123.1168... \n", 388 | "238 [55.0544, 57.0701, 67.0544, 69.0699, 81.0699, ... \n", 389 | "239 [50.4192, 52.9216, 74.1355, 81.0699, 86.4846, ... \n", 390 | "240 [54.5814, 57.0338, 61.0286, 69.0335, 81.0331, ... \n", 391 | "241 [71.9563, 104.546, 105.0698, 107.0491, 107.056... \n", 392 | "\n", 393 | " intensities \\\n", 394 | "0 [0.017272727272727273, 0.01818181818181818, 0.... \n", 395 | "1 [0.010357142857142856, 0.04642857142857143, 0.... \n", 396 | "2 [0.14210526315789473, 0.2236842105263158, 0.01... \n", 397 | "3 [0.010697674418604652, 0.03488372093023256, 0.... \n", 398 | "4 [0.018260869565217393, 0.013043478260869565, 0... \n", 399 | ".. ... \n", 400 | "237 [0.10689655172413794, 0.04482758620689655, 0.0... \n", 401 | "238 [0.0215625, 0.053125, 0.015, 0.34375, 0.046875... \n", 402 | "239 [0.05555555555555555, 0.058333333333333334, 0.... \n", 403 | "240 [0.01, 0.010769230769230769, 0.038461538461538... \n", 404 | "241 [0.015833333333333335, 0.018333333333333333, 0... \n", 405 | "\n", 406 | " loss_mzs \\\n", 407 | "0 [314.15709999999996] \n", 408 | "1 [60.02260000000001, 118.02660000000003, 160.03... \n", 409 | "2 [] \n", 410 | "3 [135.07919999999996, 148.0632, 165.08959999999... \n", 411 | "4 [158.094, 210.1282, 228.1352, 246.1485, 280.16... \n", 412 | ".. ... \n", 413 | "237 [18.01030000000003, 36.02120000000002, 46.0054... \n", 414 | "238 [18.01060000000001, 88.08850000000001, 142.135... \n", 415 | "239 [98.63150000000002, 179.0794, 197.0896, 214.55... \n", 416 | "240 [160.76359999999994] \n", 417 | "241 [24.813600000000008, 42.197700000000026, 94.04... \n", 418 | "\n", 419 | " loss_intensities \\\n", 420 | "0 [0.01818181818181818] \n", 421 | "1 [0.060714285714285714, 0.39285714285714285, 0.... \n", 422 | "2 [] \n", 423 | "3 [0.053488372093023255, 0.10465116279069768, 0.... \n", 424 | "4 [0.02217391304347826, 0.06086956521739131, 0.1... \n", 425 | ".. ... \n", 426 | "237 [0.21724137931034482, 0.06206896551724138, 1.0... \n", 427 | "238 [0.153125, 0.02125, 0.01875, 0.02625, 0.028437... \n", 428 | "239 [0.06666666666666667, 0.2777777777777778, 1.0,... \n", 429 | "240 [0.011923076923076923] \n", 430 | "241 [0.025833333333333333, 0.0175, 0.065, 1.0, 0.6... \n", 431 | "\n", 432 | " smiles_preprocessed num_of_sugars \\\n", 433 | "0 CC1C(C(C(C(O1)OC2=C(OC3=C(C(=CC(=C3C2=O)O)O)CC... 2.0 \n", 434 | "1 CC1CC2(C(C1O)C=C(C(CC3C(C3(C)C)C=C(C2=O)C)OC(=... 0.0 \n", 435 | "2 CC1C(C(C(C(O1)OC2C(OC(C(C2O)O)OCC3C(C(C(C(O3)O... 4.0 \n", 436 | "3 CN1C2=C(C=C(C=C2)C(=O)N(CCC(=O)O)C3=CC=CC=N3)N... 0.0 \n", 437 | "4 CCCC(=O)OCC(C(C(CN1C2=C(C=C(C(=C2)C)C)N=C3C1=N... 1.0 \n", 438 | ".. ... ... 
\n", 439 | "237 CC1CCC2C1C(=O)OC(C2C)NC(=O)C3C(O3)C 0.0 \n", 440 | "238 CC(C)C(C)C=CC(C)C1CCC2C1(CCC3=C2C(=O)C=C4C3(CC... 0.0 \n", 441 | "239 CC1CCC(C2(C1=CC(CC2)C(=C)C(=O)O)C)OC3C(C(C(C(O... 1.0 \n", 442 | "240 COC1=C2C(=CC(=C1OC3C(C(C(C(O3)CO)O)O)OC4C(C(C(... 2.0 \n", 443 | "241 C1C(COC2=C1C=CC(=C2)O)C3=CC=C(C=C3)O 0.0 \n", 444 | "\n", 445 | " Number of aliphatic carboxylic acids ... Number of phenols \\\n", 446 | "0 0 ... 2 \n", 447 | "1 1 ... 0 \n", 448 | "2 0 ... 0 \n", 449 | "3 1 ... 0 \n", 450 | "4 0 ... 0 \n", 451 | ".. ... ... ... \n", 452 | "237 0 ... 0 \n", 453 | "238 0 ... 0 \n", 454 | "239 1 ... 0 \n", 455 | "240 1 ... 0 \n", 456 | "241 0 ... 2 \n", 457 | "\n", 458 | " Number of phosphoric acid groups Number of phosphoric ester groups \\\n", 459 | "0 0 0 \n", 460 | "1 0 0 \n", 461 | "2 0 0 \n", 462 | "3 0 0 \n", 463 | "4 0 0 \n", 464 | ".. ... ... \n", 465 | "237 0 0 \n", 466 | "238 0 0 \n", 467 | "239 0 0 \n", 468 | "240 0 0 \n", 469 | "241 0 0 \n", 470 | "\n", 471 | " Number of piperdine rings Number of primary amides \\\n", 472 | "0 0 0 \n", 473 | "1 0 0 \n", 474 | "2 0 0 \n", 475 | "3 0 0 \n", 476 | "4 0 0 \n", 477 | ".. ... ... \n", 478 | "237 0 0 \n", 479 | "238 0 0 \n", 480 | "239 0 0 \n", 481 | "240 0 0 \n", 482 | "241 0 0 \n", 483 | "\n", 484 | " Number of pyridine rings Number of quaternary nitrogens \\\n", 485 | "0 0 0 \n", 486 | "1 0 0 \n", 487 | "2 0 0 \n", 488 | "3 1 0 \n", 489 | "4 0 0 \n", 490 | ".. ... ... \n", 491 | "237 0 0 \n", 492 | "238 0 0 \n", 493 | "239 0 0 \n", 494 | "240 0 0 \n", 495 | "241 0 0 \n", 496 | "\n", 497 | " Number of thioether Number of thiazole rings \\\n", 498 | "0 0 0 \n", 499 | "1 0 0 \n", 500 | "2 0 0 \n", 501 | "3 0 0 \n", 502 | "4 0 0 \n", 503 | ".. ... ... \n", 504 | "237 0 0 \n", 505 | "238 0 0 \n", 506 | "239 0 0 \n", 507 | "240 0 0 \n", 508 | "241 0 0 \n", 509 | "\n", 510 | " Number of unbranched alkanes of at least 4 members (excludes halogenated alkanes) \n", 511 | "0 0 \n", 512 | "1 0 \n", 513 | "2 0 \n", 514 | "3 0 \n", 515 | "4 0 \n", 516 | ".. ... 
\n", 517 | "237 0 \n", 518 | "238 0 \n", 519 | "239 0 \n", 520 | "240 0 \n", 521 | "241 0 \n", 522 | "\n", 523 | "[242 rows x 68 columns]" 524 | ] 525 | }, 526 | "execution_count": 3, 527 | "metadata": {}, 528 | "output_type": "execute_result" 529 | } 530 | ], 531 | "source": [ 532 | "root='C:/Users/delser/'\n", 533 | "#name='all-HRMS-loss-header-train-refine_prepro.tsv'\n", 534 | "#name='all_HRMS_train_24012023_cddd_refine_s.tsv'\n", 535 | "#name='all_HRMS_valid_24012023_cddd_refine_s.tsv'\n", 536 | "name='casmi_func_groups_2201.tsv'\n", 537 | "\n", 538 | "os.path.join(root, name)\n", 539 | "df_valid = pd.read_csv(os.path.join(root, name), sep=\"\\t\") \n", 540 | "\n", 541 | "df_valid" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "id": "0a17727e-845e-4778-9626-d283eed2e287", 547 | "metadata": {}, 548 | "source": [ 549 | "adducts_v=df_valid['adduct'].to_list()" 550 | ] 551 | }, 552 | { 553 | "cell_type": "markdown", 554 | "id": "6092dbac-8cfd-445b-96f0-dbad382d593d", 555 | "metadata": {}, 556 | "source": [ 557 | "len(set(adducts_v+adducts))" 558 | ] 559 | }, 560 | { 561 | "cell_type": "markdown", 562 | "id": "7404a6f3-6049-4a64-b839-9deaf36761ca", 563 | "metadata": {}, 564 | "source": [ 565 | "list(set(adducts_v+adducts))" 566 | ] 567 | }, 568 | { 569 | "cell_type": "markdown", 570 | "id": "b27a4cd7-9b87-4113-b512-87df6b0c8f98", 571 | "metadata": {}, 572 | "source": [ 573 | "sorted_adducts=['[M+H-C2H4O2]+',#-60\n", 574 | " '[M-3H2O+H]+',#-54\n", 575 | " '[M-2H2O+H]+',#-36\n", 576 | " '[M-H2O+H]+',#-18\n", 577 | " '[M-NH3+H]+',#-17\n", 578 | " '[M]+',\n", 579 | " '[M+H]+',\n", 580 | " '[M+H+2i]+',\n", 581 | " '[M+NH3]+',#+17\n", 582 | " '[M+NH4]+',#+18\n", 583 | " '[M+Na]+',#+23\n", 584 | " '[M+H+CH3OH]',#+33\n", 585 | " '[M+K]+',#+39\n", 586 | " '[2M+H]+',\n", 587 | " '[2M+H+2i]+',\n", 588 | "'[2M+NH4]+',\n", 589 | "'[M-H+2Na]+',\n", 590 | " '[2M+Na]+',\n", 591 | "'[2M+K]+',]\n", 592 | "\n", 593 | "\n", 594 | "\n", 595 | "\n" 596 | ] 597 | }, 598 | { 599 | "cell_type": "markdown", 600 | "id": "d8a12ea1-c198-4f68-a908-82edd45776d3", 601 | "metadata": {}, 602 | "source": [ 603 | "sorted_adducts.index('[M-NH3+H]+')" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": 4, 609 | "id": "8cd72c14-9c64-4784-811f-744f88e54498", 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "def get_adduct_num(adduct):\n", 614 | " sorted_adducts=['[M+H-C2H4O2]+',#-60\n", 615 | " '[M-3H2O+H]+',#-54\n", 616 | " '[M-2H2O+H]+',#-36\n", 617 | " '[M-H2O+H]+',#-18\n", 618 | " '[M-NH3+H]+',#-17\n", 619 | " '[M]+',\n", 620 | " '[M+H]+',\n", 621 | " '[M+H+2i]+',\n", 622 | " '[M+NH3]+',#+17\n", 623 | " '[M+NH4]+',#+18\n", 624 | " '[M+Na]+',#+23\n", 625 | " '[M+H+CH3OH]',#+33\n", 626 | " '[M+K]+',#+39\n", 627 | " '[2M+H]+',\n", 628 | " '[2M+H+2i]+',\n", 629 | "'[2M+NH4]+',\n", 630 | "'[M-H+2Na]+',\n", 631 | " '[2M+Na]+',\n", 632 | "'[2M+K]+',]\n", 633 | " return sorted_adducts.index(adduct)\n" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": 5, 639 | "id": "44d72a58-b378-4e7c-bff9-9e2a4ea1478f", 640 | "metadata": {}, 641 | "outputs": [ 642 | { 643 | "data": { 644 | "text/plain": [ 645 | "4" 646 | ] 647 | }, 648 | "execution_count": 5, 649 | "metadata": {}, 650 | "output_type": "execute_result" 651 | } 652 | ], 653 | "source": [ 654 | "get_adduct_num('[M-NH3+H]+')" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 6, 660 | "id": "898370cd-ab79-4442-921b-e1c9490faf50", 661 | "metadata": { 662 | "collapsed": true, 
663 | "jupyter": { 664 | "outputs_hidden": true 665 | }, 666 | "tags": [] 667 | }, 668 | "outputs": [ 669 | { 670 | "ename": "AttributeError", 671 | "evalue": "'Series' object has no attribute 'adduct'", 672 | "output_type": "error", 673 | "traceback": [ 674 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 675 | "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", 676 | "Cell \u001b[1;32mIn[6], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m df_valid[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124madduct_enc\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mdf_valid\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mget_adduct_num\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43madduct\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m df_valid\n", 677 | "File \u001b[1;32m~\\Anaconda3\\envs\\tf_new\\lib\\site-packages\\pandas\\core\\frame.py:9565\u001b[0m, in \u001b[0;36mDataFrame.apply\u001b[1;34m(self, func, axis, raw, result_type, args, **kwargs)\u001b[0m\n\u001b[0;32m 9554\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapply\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m frame_apply\n\u001b[0;32m 9556\u001b[0m op \u001b[38;5;241m=\u001b[39m frame_apply(\n\u001b[0;32m 9557\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 9558\u001b[0m func\u001b[38;5;241m=\u001b[39mfunc,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 9563\u001b[0m kwargs\u001b[38;5;241m=\u001b[39mkwargs,\n\u001b[0;32m 9564\u001b[0m )\n\u001b[1;32m-> 9565\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mop\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39m__finalize__(\u001b[38;5;28mself\u001b[39m, method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mapply\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", 678 | "File \u001b[1;32m~\\Anaconda3\\envs\\tf_new\\lib\\site-packages\\pandas\\core\\apply.py:746\u001b[0m, in \u001b[0;36mFrameApply.apply\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 743\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mraw:\n\u001b[0;32m 744\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_raw()\n\u001b[1;32m--> 746\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply_standard\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", 679 | "File \u001b[1;32m~\\Anaconda3\\envs\\tf_new\\lib\\site-packages\\pandas\\core\\apply.py:873\u001b[0m, in \u001b[0;36mFrameApply.apply_standard\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 872\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mapply_standard\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m--> 873\u001b[0m results, res_index \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply_series_generator\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 875\u001b[0m \u001b[38;5;66;03m# wrap results\u001b[39;00m\n\u001b[0;32m 876\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mwrap_results(results, res_index)\n", 680 | "File \u001b[1;32m~\\Anaconda3\\envs\\tf_new\\lib\\site-packages\\pandas\\core\\apply.py:889\u001b[0m, in \u001b[0;36mFrameApply.apply_series_generator\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 886\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m option_context(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmode.chained_assignment\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[0;32m 887\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(series_gen):\n\u001b[0;32m 888\u001b[0m \u001b[38;5;66;03m# ignore SettingWithCopy here in case the user mutates\u001b[39;00m\n\u001b[1;32m--> 889\u001b[0m results[i] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mv\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 890\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(results[i], ABCSeries):\n\u001b[0;32m 891\u001b[0m \u001b[38;5;66;03m# If we have a view on v, we need to make a copy because\u001b[39;00m\n\u001b[0;32m 892\u001b[0m \u001b[38;5;66;03m# series_generator will swap out the underlying data\u001b[39;00m\n\u001b[0;32m 893\u001b[0m results[i] \u001b[38;5;241m=\u001b[39m results[i]\u001b[38;5;241m.\u001b[39mcopy(deep\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n", 681 | "Cell \u001b[1;32mIn[6], line 1\u001b[0m, in \u001b[0;36m\u001b[1;34m(x)\u001b[0m\n\u001b[1;32m----> 1\u001b[0m df_valid[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124madduct_enc\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m df_valid\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: get_adduct_num(\u001b[43mx\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43madduct\u001b[49m), axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[0;32m 2\u001b[0m df_valid\n", 682 | "File \u001b[1;32m~\\Anaconda3\\envs\\tf_new\\lib\\site-packages\\pandas\\core\\generic.py:5902\u001b[0m, in \u001b[0;36mNDFrame.__getattr__\u001b[1;34m(self, name)\u001b[0m\n\u001b[0;32m 5895\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[0;32m 5896\u001b[0m name \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_internal_names_set\n\u001b[0;32m 5897\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m name \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_metadata\n\u001b[0;32m 5898\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m name \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_accessors\n\u001b[0;32m 5899\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_info_axis\u001b[38;5;241m.\u001b[39m_can_hold_identifiers_and_holds_name(name)\n\u001b[0;32m 5900\u001b[0m ):\n\u001b[0;32m 5901\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m[name]\n\u001b[1;32m-> 5902\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mobject\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__getattribute__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n", 683 | "\u001b[1;31mAttributeError\u001b[0m: 'Series' object has no attribute 'adduct'" 684 | ] 685 | } 686 | ], 687 | "source": [ 688 | "df_valid['adduct_enc'] = df_valid.apply(lambda x: get_adduct_num(x.adduct), axis=1)\n", 689 | "df_valid" 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": null, 695 | "id": "840ef319-6b66-4127-bff3-bf1fdbf4b42d", 696 | "metadata": {}, 697 | "outputs": [], 698 | "source": [ 699 | "df_valid['mf'] = df_valid.apply(lambda x: get_mf(x.smiles_preprocessed), axis=1)" 700 | ] 701 | }, 702 | { 703 | "cell_type": "markdown", 704 | "id": "bad5061d-55c6-4939-a449-8b03db6565f7", 705 | "metadata": {}, 706 | "source": [ 707 | "formulas=df_valid['mf'].to_list()+df_train['mf'].to_list()" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": null, 713 | "id": "1341520a-a50d-4734-a514-591780f42cd8", 714 | "metadata": {}, 715 | "outputs": [], 716 | "source": [ 717 | "from molmass import Formula\n", 718 | "\n", 719 | "def number_of_atoms(form): \n", 720 | " f= Formula(form.replace(\"-\",\"\").replace(\"+\",\"\"))\n", 721 | " MF=(f._elements)\n", 722 | " try:\n", 723 | " C=MF[\"C\"][0]\n", 724 | " except:\n", 725 | " C=0\n", 726 | " try:\n", 727 | " H=MF[\"H\"][0]\n", 728 | " except:\n", 729 | " H=0\n", 730 | " try:\n", 731 | " O=MF[\"O\"][0]\n", 732 | " except:\n", 733 | " O=0\n", 734 | " try:\n", 735 | " N=MF[\"N\"][0]\n", 736 | " except:\n", 737 | " N=0\n", 738 | " try:\n", 739 | " S=MF[\"S\"][0]\n", 740 | " except:\n", 741 | " S=0\n", 742 | " try:\n", 743 | " I=MF[\"I\"][0]\n", 744 | " except:\n", 745 | " I=0\n", 746 | " try:\n", 747 | " Br=MF[\"Br\"][0]\n", 748 | " except:\n", 749 | " Br=0\n", 750 | " try:\n", 751 | " Cl=MF[\"Cl\"][0]\n", 752 | " except:\n", 753 | " Cl=0\n", 754 | " try:\n", 755 | " F=MF[\"F\"][0]\n", 756 | " except:\n", 757 | " F=0\n", 758 | " try:\n", 759 | " P=MF[\"P\"][0]\n", 760 | " except:\n", 761 | " P=0\n", 762 | " \n", 763 | " return [C,H, O, N, S, I, Br,Cl, F,P]\n", 764 | " \n", 765 | " " 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": null, 771 | "id": "5471c5f4-279e-491f-abcc-cf273ae1171a", 772 | "metadata": {}, 773 | "outputs": [], 774 | "source": [ 775 | "df_valid['elements'] = df_valid.apply(lambda x:number_of_atoms(x.mf), axis=1)\n", 776 | "df_valid" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": null, 782 | "id": "d46734f6-4172-4859-8e56-b8adbbc7b1c0", 783 | "metadata": {}, 784 | "outputs": [], 785 | "source": [ 786 | "ele=[\"C\",\"H\", \"O\",\" N\", \"S\", \"I\", \"Br\",\"Cl\", \"F\",\"P\"]\n", 787 | "df_elements= pd.DataFrame(df_valid['elements'].to_list(), columns=ele)\n", 788 | "df_elements" 789 | ] 790 | }, 791 | { 792 | "cell_type": "code", 793 | "execution_count": null, 794 | "id": "09cdb893-c1ca-40b2-a9d4-2a21752ce2d8", 795 | "metadata": {}, 796 | "outputs": [], 797 | "source": [ 798 | "df_elements.to_numpy().max()" 799 | ] 800 | }, 801 | { 802 | "cell_type": "code", 803 | "execution_count": null, 804 | "id": "fafb0ac4-acd9-4be0-89d7-30778d0d753d", 805 | "metadata": {}, 806 | "outputs": [], 807 | "source": [ 808 | "df_valid=df_valid.join(df_elements,how=\"outer\")\n", 809 | "df_valid" 810 | ] 811 | }, 812 | { 813 | 
"cell_type": "code", 814 | "execution_count": null, 815 | "id": "0658692f-8df1-41c7-bcd9-2c39982f65fb", 816 | "metadata": {}, 817 | "outputs": [], 818 | "source": [ 819 | "df_valid=df_valid.drop([\"elements\",\"mf\"], axis=1)\n", 820 | "df_valid" 821 | ] 822 | }, 823 | { 824 | "cell_type": "code", 825 | "execution_count": null, 826 | "id": "dd96e6b0-1259-41a7-abc2-b8acdb9e76d5", 827 | "metadata": {}, 828 | "outputs": [], 829 | "source": [ 830 | "group_array=df_valid.iloc[:, 10:]\n", 831 | "group_array" 832 | ] 833 | }, 834 | { 835 | "cell_type": "code", 836 | "execution_count": null, 837 | "id": "dd11efb4-b712-4aa8-a7ab-44233704b3f9", 838 | "metadata": {}, 839 | "outputs": [], 840 | "source": [ 841 | "group_array.to_numpy().max()" 842 | ] 843 | }, 844 | { 845 | "cell_type": "code", 846 | "execution_count": null, 847 | "id": "c2bb0831-02a7-4b1b-acdc-ce94b7086c5a", 848 | "metadata": {}, 849 | "outputs": [], 850 | "source": [ 851 | "group_array.to_numpy().shape" 852 | ] 853 | }, 854 | { 855 | "cell_type": "code", 856 | "execution_count": null, 857 | "id": "d4a9922c-c1e4-41de-8ee1-fff110a5cb2c", 858 | "metadata": {}, 859 | "outputs": [], 860 | "source": [ 861 | "def make_float_targets(k):\n", 862 | " targets = np.zeros(k, dtype=np.float32)\n", 863 | " start = 1.0 / (2 * k) # like 0.125\n", 864 | " delta = 1.0 / k # like 0.250\n", 865 | " for i in range(k):\n", 866 | " targets[i] = start + (i * delta) \n", 867 | " return targets" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": null, 873 | "id": "02f0a3e7-154b-4ef6-a2f3-e2667a1cc13b", 874 | "metadata": {}, 875 | "outputs": [], 876 | "source": [ 877 | "train_int=group_array.to_numpy()\n", 878 | "ordinal=make_float_targets(65)\n", 879 | "all_arr=[]\n", 880 | "for sample in train_int:\n", 881 | " cache=[]\n", 882 | " for i in sample:\n", 883 | " cache.append(ordinal[int(i)])\n", 884 | " x=np.array(cache)\n", 885 | " all_arr.append(x)\n", 886 | " \n", 887 | "train_y=np.stack(all_arr) \n", 888 | "train_y.shape " 889 | ] 890 | }, 891 | { 892 | "cell_type": "code", 893 | "execution_count": null, 894 | "id": "b5dfb35c-7230-4d0f-9cd7-19b8435f43c1", 895 | "metadata": {}, 896 | "outputs": [], 897 | "source": [ 898 | "#np.save('y1_all_HRMS_valid_24012023_cddd_mf.npy',train_y)" 899 | ] 900 | }, 901 | { 902 | "cell_type": "code", 903 | "execution_count": null, 904 | "id": "73610487-9ad9-4bdc-a6c6-18f7f72298df", 905 | "metadata": {}, 906 | "outputs": [], 907 | "source": [ 908 | "df_valid.to_csv('casmi_func_groups_2201.tsv_mf.tsv',sep='\\t')" 909 | ] 910 | } 911 | ], 912 | "metadata": { 913 | "kernelspec": { 914 | "display_name": "Python 3", 915 | "language": "python", 916 | "name": "python3" 917 | }, 918 | "language_info": { 919 | "codemirror_mode": { 920 | "name": "ipython", 921 | "version": 3 922 | }, 923 | "file_extension": ".py", 924 | "mimetype": "text/x-python", 925 | "name": "python", 926 | "nbconvert_exporter": "python", 927 | "pygments_lexer": "ipython3", 928 | "version": "3.8.13" 929 | } 930 | }, 931 | "nbformat": 4, 932 | "nbformat_minor": 5 933 | } 934 | -------------------------------------------------------------------------------- /preprocessing_onlin-v3_mgf.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "0c60b9c7-bba8-4e7f-91d8-7935ef85da08", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "\n", 11 | "from tqdm import tqdm\n", 12 | "import tensorflow as tf\n", 13 | "import 
json\n", 14 | "import tensorflow_text as text\n", 15 | "import os\n", 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import pickle\n", 19 | "from matchms import set_matchms_logger_level\n", 20 | "import pandas as pd\n", 21 | "set_matchms_logger_level(\"ERROR\")\n", 22 | "from matchms.filtering import add_losses\n", 23 | "from matchms.filtering import add_parent_mass\n", 24 | "from matchms.filtering import default_filters\n", 25 | "from matchms.filtering import normalize_intensities\n", 26 | "from matchms.filtering import select_by_intensity\n", 27 | "from matchms.filtering import reduce_to_number_of_peaks\n", 28 | "from matchms.filtering import require_minimum_number_of_peaks\n", 29 | "from matchms.filtering import select_by_mz\n", 30 | "from matchms.importing import load_from_mgf\n", 31 | "from matchms.exporting import save_as_mgf\n", 32 | "from matchms.importing import load_from_msp\n", 33 | "\n", 34 | "\n", 35 | "from matchms.filtering import repair_inchi_inchikey_smiles\n", 36 | "from matchms.filtering import derive_inchikey_from_inchi\n", 37 | "from matchms.filtering import derive_smiles_from_inchi\n", 38 | "from matchms.filtering import derive_inchi_from_smiles\n", 39 | "from matchms.filtering import harmonize_undefined_inchi\n", 40 | "from matchms.filtering import harmonize_undefined_inchikey\n", 41 | "from matchms.filtering import harmonize_undefined_smiles" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 5, 47 | "id": "30ed8f9b-215b-4fac-8d09-64317fb965e3", 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "Python 3.9.15\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "!python -V" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 2, 65 | "id": "1a88907a-9f26-4877-9356-1fe77bfed534", 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "CPU times: total: 7.12 s\n", 73 | "Wall time: 7.25 s\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "%%time\n", 79 | "\n", 80 | "#from spec2vec.model_building import train_new_word2vec_model\n", 81 | "\n", 82 | "def spectrum_processing(s):\n", 83 | " \"\"\"This is how one would typically design a desired pre- and post-\n", 84 | " processing pipeline.\"\"\"\n", 85 | " s = default_filters(s)\n", 86 | " s = add_parent_mass(s)\n", 87 | " s = normalize_intensities(s)\n", 88 | " s = select_by_intensity(s, intensity_from=0.01)\n", 89 | " s = reduce_to_number_of_peaks(s, n_required=5, n_max=250)\n", 90 | " s = select_by_mz(s, mz_from=15, mz_to=2000)\n", 91 | " s = add_losses(s, loss_mz_from=15.0, loss_mz_to=350.0)\n", 92 | " s = require_minimum_number_of_peaks(s, n_required=5)\n", 93 | " return s\n", 94 | "\n", 95 | "\n", 96 | "\n", 97 | "def metadata_processing(spectrum):\n", 98 | " spectrum = default_filters(spectrum)\n", 99 | " spectrum = repair_inchi_inchikey_smiles(spectrum)\n", 100 | " spectrum = derive_inchi_from_smiles(spectrum)\n", 101 | " spectrum = derive_smiles_from_inchi(spectrum)\n", 102 | " spectrum = derive_inchikey_from_inchi(spectrum)\n", 103 | " spectrum = harmonize_undefined_smiles(spectrum)\n", 104 | " spectrum = harmonize_undefined_inchi(spectrum)\n", 105 | " spectrum = harmonize_undefined_inchikey(spectrum)\n", 106 | " return spectrum\n", 107 | "# Load data from MGF file and apply filters\n", 108 | "\n", 109 | "import os\n", 110 | "from matchms.importing import load_from_mgf\n", 111 | "path_data = \"C:/Users/delser/mass2smiles\" # enter 
path to downloaded mgf file\n", 112 | "file_mgf = os.path.join(path_data, \n", 113 | " \"casmi_candidates_pos_casmi_id.mgf\")\n", 114 | "spectrums = list(load_from_mgf(file_mgf))\n", 115 | "\n", 116 | "spectrums = [metadata_processing(s) for s in spectrums]\n", 117 | "spectrums = [spectrum_processing(s) for s in spectrums]\n", 118 | "#spectrums = [spectrum_processing(s) for s in load_from_mgf(\"/Users/delser/Desktop/PhD/Phytochemistry/NP-Databases/CFM-4_DB/TOTAL_COMPOUNDS_DB.energies_merged_name.mgf\")]\n", 119 | "#spectrums = [spectrum_processing(s) for s in load_from_mgf(\"/Users/delser/Desktop/PhD/Phytochemistry/FBMN/alltissues/altissues15072021-py.mgf\")]\n", 120 | "# Omit spectrums that didn't qualify for analysis\n", 121 | "spectrums = [s for s in spectrums if s is not None]\n", 122 | "\n", 123 | "\n", 124 | "# Create spectrum documents\n", 125 | "#reference_documents = [SpectrumDocument(s, n_decimals=2) for s in spectrums]" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 3, 131 | "id": "89dca0b7-9f27-441c-95b4-3f074766ead8", 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "precs = []\n", 136 | "IDs = []\n", 137 | "mzs=[]\n", 138 | "ints=[]\n", 139 | "loss_mzs=[]\n", 140 | "loss_ints=[]\n", 141 | "\n", 142 | "\n", 143 | "for spec in spectrums: \n", 144 | " IDs.append(spec.get(\"feature_id\"))\n", 145 | " precs.append(spec.get(\"precursor_mz\"))\n", 146 | " mzs.append(list(spec.peaks.mz))\n", 147 | " ints.append(list(spec.peaks.intensities))\n", 148 | " loss_mzs.append(list(spec.losses.mz))\n", 149 | " loss_ints.append(list(spec.losses.intensities))\n", 150 | " " 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 27, 156 | "id": "4321c672-2b43-41fc-930d-3665db0c2608", 157 | "metadata": { 158 | "tags": [] 159 | }, 160 | "outputs": [ 161 | { 162 | "data": { 163 | "text/plain": [ 164 | "{56.604: 0.020625,\n", 165 | " 57.0337: 0.03625,\n", 166 | " 61.0286: 0.10625,\n", 167 | " 69.0336: 0.51875,\n", 168 | " 71.0492: 0.2,\n", 169 | " 74.5668: 0.018125,\n", 170 | " 81.0336: 0.075,\n", 171 | " 83.0491: 0.3,\n", 172 | " 85.0283: 0.425,\n", 173 | " 87.044: 0.1625,\n", 174 | " 95.0492: 0.024375,\n", 175 | " 97.0286: 0.125,\n", 176 | " 99.0442: 0.034375,\n", 177 | " 109.0281: 0.0325,\n", 178 | " 111.0441: 0.1625,\n", 179 | " 115.0384: 0.02625,\n", 180 | " 127.0392: 0.026875,\n", 181 | " 129.0546: 0.1125,\n", 182 | " 232.1556: 0.020625,\n", 183 | " 299.0545: 1.0,\n", 184 | " 299.0909: 0.036875,\n", 185 | " 311.0572: 0.01875,\n", 186 | " 355.1155: 0.15,\n", 187 | " 391.0793: 0.0225,\n", 188 | " 407.7533: 0.02,\n", 189 | " 498.4384: 0.02125,\n", 190 | " 206.79970000000003: 0.02125,\n", 191 | " 297.4848: 0.02,\n", 192 | " 314.15880000000004: 0.0225}" 193 | ] 194 | }, 195 | "execution_count": 27, 196 | "metadata": {}, 197 | "output_type": "execute_result" 198 | } 199 | ], 200 | "source": [ 201 | "res = dict(zip(mzs[3]+loss_mzs[3], ints[3]+loss_ints[3]))\n", 202 | "res" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 28, 208 | "id": "5cf3fe57-c57d-4913-8e1b-d1da1696486b", 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "text/plain": [ 214 | "{56.604: 0.020625,\n", 215 | " 57.0337: 0.03625,\n", 216 | " 61.0286: 0.10625,\n", 217 | " 69.0336: 0.51875,\n", 218 | " 71.0492: 0.2,\n", 219 | " 74.5668: 0.018125,\n", 220 | " 81.0336: 0.075,\n", 221 | " 83.0491: 0.3,\n", 222 | " 85.0283: 0.425,\n", 223 | " 87.044: 0.1625,\n", 224 | " 95.0492: 0.024375,\n", 225 | " 97.0286: 0.125,\n", 226 
| " 99.0442: 0.034375,\n", 227 | " 109.0281: 0.0325,\n", 228 | " 111.0441: 0.1625,\n", 229 | " 115.0384: 0.02625,\n", 230 | " 127.0392: 0.026875,\n", 231 | " 129.0546: 0.1125,\n", 232 | " 206.79970000000003: 0.02125,\n", 233 | " 232.1556: 0.020625,\n", 234 | " 297.4848: 0.02,\n", 235 | " 299.0545: 1.0,\n", 236 | " 299.0909: 0.036875,\n", 237 | " 311.0572: 0.01875,\n", 238 | " 314.15880000000004: 0.0225,\n", 239 | " 355.1155: 0.15,\n", 240 | " 391.0793: 0.0225,\n", 241 | " 407.7533: 0.02,\n", 242 | " 498.4384: 0.02125}" 243 | ] 244 | }, 245 | "execution_count": 28, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "res=dict(sorted(res.items()))\n", 252 | "res" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 32, 258 | "id": "777cd1bc-3b0e-4c29-a327-2c1fd755e258", 259 | "metadata": {}, 260 | "outputs": [ 261 | { 262 | "data": { 263 | "text/plain": [ 264 | "list" 265 | ] 266 | }, 267 | "execution_count": 32, 268 | "metadata": {}, 269 | "output_type": "execute_result" 270 | } 271 | ], 272 | "source": [ 273 | "type(list(res.keys()))" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 30, 279 | "id": "7fe78582-fb81-4480-aea3-850216adf45c", 280 | "metadata": {}, 281 | "outputs": [ 282 | { 283 | "data": { 284 | "text/plain": [ 285 | "dict_values([0.020625, 0.03625, 0.10625, 0.51875, 0.2, 0.018125, 0.075, 0.3, 0.425, 0.1625, 0.024375, 0.125, 0.034375, 0.0325, 0.1625, 0.02625, 0.026875, 0.1125, 0.02125, 0.020625, 0.02, 1.0, 0.036875, 0.01875, 0.0225, 0.15, 0.0225, 0.02, 0.02125])" 286 | ] 287 | }, 288 | "execution_count": 30, 289 | "metadata": {}, 290 | "output_type": "execute_result" 291 | } 292 | ], 293 | "source": [ 294 | "res.values()" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 4, 300 | "id": "2d4707a7-6f95-4fd3-89bc-23ac60ab019a", 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/html": [ 306 | "
\n", 307 | "\n", 320 | "\n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | "
feature_idprecursor_mzmzsintensitiesloss_mzsloss_intensities
0398235.1691[53.0389, 53.0509, 55.0181, 55.0545, 55.5835, ...[0.07166666666666667, 0.020833333333333332, 0....[19.8476, 19.87299999999999, 56.06339999999997...[0.075, 0.07083333333333333, 0.031666666666666...
1398235.1691[53.0389, 53.0509, 55.0181, 55.0545, 55.5835, ...[0.07166666666666667, 0.020833333333333332, 0....[19.8476, 19.87299999999999, 56.06339999999997...[0.075, 0.07083333333333333, 0.031666666666666...
2159485.2164[66.3952, 68.416, 81.3048, 100.9429, 121.0081,...[0.05, 0.04565217391304348, 0.0521739130434782...[18.01230000000004, 96.17940000000004, 102.068...[1.0, 0.058695652173913045, 0.0630434782608695...
3169705.2381[56.604, 57.0337, 61.0286, 69.0336, 71.0492, 7...[0.020625, 0.03625, 0.10625, 0.51875, 0.2, 0.0...[206.79970000000003, 297.4848, 314.15880000000...[0.02125, 0.02, 0.0225]
4423441.2264[62.099, 62.3016, 65.6795, 71.5638, 95.0857, 1...[0.07096774193548387, 0.08709677419354839, 0.0...[70.07659999999998, 88.08980000000003, 130.173...[1.0, 0.14516129032258066, 0.12903225806451613...
.....................
231439336.0647[55.173, 55.6729, 57.7501, 63.2992, 64.3001, 7...[0.010476190476190476, 0.010952380952380953, 0...[17.859800000000007, 72.62130000000002, 119.40...[0.04428571428571428, 0.010952380952380953, 0....
232322532.3113[53.6431, 61.0821, 67.0543, 69.0334, 81.0699, ...[0.0125, 0.012, 0.016, 0.08, 0.23, 0.285, 0.07...[197.09179999999998, 207.04439999999994, 215.1...[0.0245, 0.015, 0.205, 0.135, 0.28, 0.0205, 0....
233300478.3159[63.7647, 71.5594, 93.5641, 108.7938, 119.8244...[0.10476190476190476, 0.1, 0.11428571428571428...[16.459400000000016, 35.03750000000002, 53.050...[0.10952380952380952, 0.13333333333333333, 0.1...
234254329.2320[53.0389, 55.0182, 55.0546, 67.008, 67.0544, 6...[0.0456140350877193, 0.05087719298245614, 0.28...[89.70510000000002, 112.67550000000003, 145.15...[0.042105263157894736, 0.2631578947368421, 0.0...
235237278.1172[50.8153, 74.535, 78.0275, 93.2556, 97.6804, 1...[0.01818181818181818, 0.01818181818181818, 0.0...[67.65480000000002, 83.67750000000004, 86.4763...[0.02, 0.019090909090909092, 0.018181818181818...
\n", 434 | "

236 rows × 6 columns

\n", 435 | "
" 436 | ], 437 | "text/plain": [ 438 | " feature_id precursor_mz \\\n", 439 | "0 398 235.1691 \n", 440 | "1 398 235.1691 \n", 441 | "2 159 485.2164 \n", 442 | "3 169 705.2381 \n", 443 | "4 423 441.2264 \n", 444 | ".. ... ... \n", 445 | "231 439 336.0647 \n", 446 | "232 322 532.3113 \n", 447 | "233 300 478.3159 \n", 448 | "234 254 329.2320 \n", 449 | "235 237 278.1172 \n", 450 | "\n", 451 | " mzs \\\n", 452 | "0 [53.0389, 53.0509, 55.0181, 55.0545, 55.5835, ... \n", 453 | "1 [53.0389, 53.0509, 55.0181, 55.0545, 55.5835, ... \n", 454 | "2 [66.3952, 68.416, 81.3048, 100.9429, 121.0081,... \n", 455 | "3 [56.604, 57.0337, 61.0286, 69.0336, 71.0492, 7... \n", 456 | "4 [62.099, 62.3016, 65.6795, 71.5638, 95.0857, 1... \n", 457 | ".. ... \n", 458 | "231 [55.173, 55.6729, 57.7501, 63.2992, 64.3001, 7... \n", 459 | "232 [53.6431, 61.0821, 67.0543, 69.0334, 81.0699, ... \n", 460 | "233 [63.7647, 71.5594, 93.5641, 108.7938, 119.8244... \n", 461 | "234 [53.0389, 55.0182, 55.0546, 67.008, 67.0544, 6... \n", 462 | "235 [50.8153, 74.535, 78.0275, 93.2556, 97.6804, 1... \n", 463 | "\n", 464 | " intensities \\\n", 465 | "0 [0.07166666666666667, 0.020833333333333332, 0.... \n", 466 | "1 [0.07166666666666667, 0.020833333333333332, 0.... \n", 467 | "2 [0.05, 0.04565217391304348, 0.0521739130434782... \n", 468 | "3 [0.020625, 0.03625, 0.10625, 0.51875, 0.2, 0.0... \n", 469 | "4 [0.07096774193548387, 0.08709677419354839, 0.0... \n", 470 | ".. ... \n", 471 | "231 [0.010476190476190476, 0.010952380952380953, 0... \n", 472 | "232 [0.0125, 0.012, 0.016, 0.08, 0.23, 0.285, 0.07... \n", 473 | "233 [0.10476190476190476, 0.1, 0.11428571428571428... \n", 474 | "234 [0.0456140350877193, 0.05087719298245614, 0.28... \n", 475 | "235 [0.01818181818181818, 0.01818181818181818, 0.0... \n", 476 | "\n", 477 | " loss_mzs \\\n", 478 | "0 [19.8476, 19.87299999999999, 56.06339999999997... \n", 479 | "1 [19.8476, 19.87299999999999, 56.06339999999997... \n", 480 | "2 [18.01230000000004, 96.17940000000004, 102.068... \n", 481 | "3 [206.79970000000003, 297.4848, 314.15880000000... \n", 482 | "4 [70.07659999999998, 88.08980000000003, 130.173... \n", 483 | ".. ... \n", 484 | "231 [17.859800000000007, 72.62130000000002, 119.40... \n", 485 | "232 [197.09179999999998, 207.04439999999994, 215.1... \n", 486 | "233 [16.459400000000016, 35.03750000000002, 53.050... \n", 487 | "234 [89.70510000000002, 112.67550000000003, 145.15... \n", 488 | "235 [67.65480000000002, 83.67750000000004, 86.4763... \n", 489 | "\n", 490 | " loss_intensities \n", 491 | "0 [0.075, 0.07083333333333333, 0.031666666666666... \n", 492 | "1 [0.075, 0.07083333333333333, 0.031666666666666... \n", 493 | "2 [1.0, 0.058695652173913045, 0.0630434782608695... \n", 494 | "3 [0.02125, 0.02, 0.0225] \n", 495 | "4 [1.0, 0.14516129032258066, 0.12903225806451613... \n", 496 | ".. ... \n", 497 | "231 [0.04428571428571428, 0.010952380952380953, 0.... \n", 498 | "232 [0.0245, 0.015, 0.205, 0.135, 0.28, 0.0205, 0.... \n", 499 | "233 [0.10952380952380952, 0.13333333333333333, 0.1... \n", 500 | "234 [0.042105263157894736, 0.2631578947368421, 0.0... \n", 501 | "235 [0.02, 0.019090909090909092, 0.018181818181818... 
\n", 502 | "\n", 503 | "[236 rows x 6 columns]" 504 | ] 505 | }, 506 | "execution_count": 4, 507 | "metadata": {}, 508 | "output_type": "execute_result" 509 | } 510 | ], 511 | "source": [ 512 | "metadata = pd.DataFrame(list(zip(IDs, precs,mzs,ints,loss_mzs,loss_ints)), columns=[\"feature_id\", \"precursor_mz\",\"mzs\",\"intensities\",\"loss_mzs\",\"loss_intensities\" ])\n", 513 | "metadata" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": 5, 519 | "id": "538b75bb-ec4d-4bdd-82f6-687a89cd5276", 520 | "metadata": {}, 521 | "outputs": [ 522 | { 523 | "data": { 524 | "text/html": [ 525 | "
\n", 526 | "\n", 539 | "\n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | "
feature_idprecursor_mzmzsintensitiesloss_mzsloss_intensities
0398235.1691[53.0389, 53.0509, 55.0181, 55.0545, 55.5835, ...[0.07166666666666667, 0.020833333333333332, 0....[19.8476, 19.87299999999999, 56.06339999999997...[0.075, 0.07083333333333333, 0.031666666666666...
1398235.1691[53.0389, 53.0509, 55.0181, 55.0545, 55.5835, ...[0.07166666666666667, 0.020833333333333332, 0....[19.8476, 19.87299999999999, 56.06339999999997...[0.075, 0.07083333333333333, 0.031666666666666...
2159485.2164[66.3952, 68.416, 81.3048, 100.9429, 121.0081,...[0.05, 0.04565217391304348, 0.0521739130434782...[18.01230000000004, 96.17940000000004, 102.068...[1.0, 0.058695652173913045, 0.0630434782608695...
3169705.2381[56.604, 57.0337, 61.0286, 69.0336, 71.0492, 7...[0.020625, 0.03625, 0.10625, 0.51875, 0.2, 0.0...[206.79970000000003, 297.4848, 314.15880000000...[0.02125, 0.02, 0.0225]
4423441.2264[62.099, 62.3016, 65.6795, 71.5638, 95.0857, 1...[0.07096774193548387, 0.08709677419354839, 0.0...[70.07659999999998, 88.08980000000003, 130.173...[1.0, 0.14516129032258066, 0.12903225806451613...
.....................
231439336.0647[55.173, 55.6729, 57.7501, 63.2992, 64.3001, 7...[0.010476190476190476, 0.010952380952380953, 0...[17.859800000000007, 72.62130000000002, 119.40...[0.04428571428571428, 0.010952380952380953, 0....
232322532.3113[53.6431, 61.0821, 67.0543, 69.0334, 81.0699, ...[0.0125, 0.012, 0.016, 0.08, 0.23, 0.285, 0.07...[197.09179999999998, 207.04439999999994, 215.1...[0.0245, 0.015, 0.205, 0.135, 0.28, 0.0205, 0....
233300478.3159[63.7647, 71.5594, 93.5641, 108.7938, 119.8244...[0.10476190476190476, 0.1, 0.11428571428571428...[16.459400000000016, 35.03750000000002, 53.050...[0.10952380952380952, 0.13333333333333333, 0.1...
234254329.2320[53.0389, 55.0182, 55.0546, 67.008, 67.0544, 6...[0.0456140350877193, 0.05087719298245614, 0.28...[89.70510000000002, 112.67550000000003, 145.15...[0.042105263157894736, 0.2631578947368421, 0.0...
235237278.1172[50.8153, 74.535, 78.0275, 93.2556, 97.6804, 1...[0.01818181818181818, 0.01818181818181818, 0.0...[67.65480000000002, 83.67750000000004, 86.4763...[0.02, 0.019090909090909092, 0.018181818181818...
\n", 653 | "

236 rows × 6 columns

\n", 654 | "
" 655 | ], 656 | "text/plain": [ 657 | " feature_id precursor_mz \\\n", 658 | "0 398 235.1691 \n", 659 | "1 398 235.1691 \n", 660 | "2 159 485.2164 \n", 661 | "3 169 705.2381 \n", 662 | "4 423 441.2264 \n", 663 | ".. ... ... \n", 664 | "231 439 336.0647 \n", 665 | "232 322 532.3113 \n", 666 | "233 300 478.3159 \n", 667 | "234 254 329.2320 \n", 668 | "235 237 278.1172 \n", 669 | "\n", 670 | " mzs \\\n", 671 | "0 [53.0389, 53.0509, 55.0181, 55.0545, 55.5835, ... \n", 672 | "1 [53.0389, 53.0509, 55.0181, 55.0545, 55.5835, ... \n", 673 | "2 [66.3952, 68.416, 81.3048, 100.9429, 121.0081,... \n", 674 | "3 [56.604, 57.0337, 61.0286, 69.0336, 71.0492, 7... \n", 675 | "4 [62.099, 62.3016, 65.6795, 71.5638, 95.0857, 1... \n", 676 | ".. ... \n", 677 | "231 [55.173, 55.6729, 57.7501, 63.2992, 64.3001, 7... \n", 678 | "232 [53.6431, 61.0821, 67.0543, 69.0334, 81.0699, ... \n", 679 | "233 [63.7647, 71.5594, 93.5641, 108.7938, 119.8244... \n", 680 | "234 [53.0389, 55.0182, 55.0546, 67.008, 67.0544, 6... \n", 681 | "235 [50.8153, 74.535, 78.0275, 93.2556, 97.6804, 1... \n", 682 | "\n", 683 | " intensities \\\n", 684 | "0 [0.07166666666666667, 0.020833333333333332, 0.... \n", 685 | "1 [0.07166666666666667, 0.020833333333333332, 0.... \n", 686 | "2 [0.05, 0.04565217391304348, 0.0521739130434782... \n", 687 | "3 [0.020625, 0.03625, 0.10625, 0.51875, 0.2, 0.0... \n", 688 | "4 [0.07096774193548387, 0.08709677419354839, 0.0... \n", 689 | ".. ... \n", 690 | "231 [0.010476190476190476, 0.010952380952380953, 0... \n", 691 | "232 [0.0125, 0.012, 0.016, 0.08, 0.23, 0.285, 0.07... \n", 692 | "233 [0.10476190476190476, 0.1, 0.11428571428571428... \n", 693 | "234 [0.0456140350877193, 0.05087719298245614, 0.28... \n", 694 | "235 [0.01818181818181818, 0.01818181818181818, 0.0... \n", 695 | "\n", 696 | " loss_mzs \\\n", 697 | "0 [19.8476, 19.87299999999999, 56.06339999999997... \n", 698 | "1 [19.8476, 19.87299999999999, 56.06339999999997... \n", 699 | "2 [18.01230000000004, 96.17940000000004, 102.068... \n", 700 | "3 [206.79970000000003, 297.4848, 314.15880000000... \n", 701 | "4 [70.07659999999998, 88.08980000000003, 130.173... \n", 702 | ".. ... \n", 703 | "231 [17.859800000000007, 72.62130000000002, 119.40... \n", 704 | "232 [197.09179999999998, 207.04439999999994, 215.1... \n", 705 | "233 [16.459400000000016, 35.03750000000002, 53.050... \n", 706 | "234 [89.70510000000002, 112.67550000000003, 145.15... \n", 707 | "235 [67.65480000000002, 83.67750000000004, 86.4763... \n", 708 | "\n", 709 | " loss_intensities \n", 710 | "0 [0.075, 0.07083333333333333, 0.031666666666666... \n", 711 | "1 [0.075, 0.07083333333333333, 0.031666666666666... \n", 712 | "2 [1.0, 0.058695652173913045, 0.0630434782608695... \n", 713 | "3 [0.02125, 0.02, 0.0225] \n", 714 | "4 [1.0, 0.14516129032258066, 0.12903225806451613... \n", 715 | ".. ... \n", 716 | "231 [0.04428571428571428, 0.010952380952380953, 0.... \n", 717 | "232 [0.0245, 0.015, 0.205, 0.135, 0.28, 0.0205, 0.... \n", 718 | "233 [0.10952380952380952, 0.13333333333333333, 0.1... \n", 719 | "234 [0.042105263157894736, 0.2631578947368421, 0.0... \n", 720 | "235 [0.02, 0.019090909090909092, 0.018181818181818... 
\n", 721 | "\n", 722 | "[236 rows x 6 columns]" 723 | ] 724 | }, 725 | "execution_count": 5, 726 | "metadata": {}, 727 | "output_type": "execute_result" 728 | } 729 | ], 730 | "source": [ 731 | "df_train=metadata.dropna()\n", 732 | "df_train" 733 | ] 734 | }, 735 | { 736 | "cell_type": "markdown", 737 | "id": "17d5d4e0-789e-415e-98c0-7b357c3262bc", 738 | "metadata": {}, 739 | "source": [ 740 | "df_train=df_train.loc[df_train.loss_mzs.apply(str) != \"[]\"]\n", 741 | "df_train" 742 | ] 743 | }, 744 | { 745 | "cell_type": "markdown", 746 | "id": "b9428407-e8b2-450e-82ca-2d7a5541cdf7", 747 | "metadata": {}, 748 | "source": [ 749 | "df_wrong=metadata.loc[set(metadata.index) - set(df_train.index.values.tolist())]\n", 750 | "df_wrong" 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": 6, 756 | "id": "ac599deb-d5cc-4bec-824b-52a3ae976d8a", 757 | "metadata": {}, 758 | "outputs": [], 759 | "source": [ 760 | "df_train.to_csv('casmi_ids.tsv',sep='\\t')" 761 | ] 762 | }, 763 | { 764 | "cell_type": "code", 765 | "execution_count": 33, 766 | "id": "c9d0f5a9-2e05-4143-bf21-a62d075b273b", 767 | "metadata": {}, 768 | "outputs": [ 769 | { 770 | "data": { 771 | "text/html": [ 772 | "
\n", 773 | "\n", 786 | "\n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 
1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | "
Unnamed: 0spectrum_idprecursor_mzmzsintensitiesloss_mzsloss_intensitiessmiles_preprocessednum_of_sugarsNumber of aliphatic carboxylic acids...Number of phenolsNumber of phosphoric acid groupsNumber of phosphoric ester groupsNumber of piperdine ringsNumber of primary amidesNumber of pyridine ringsNumber of quaternary nitrogensNumber of thioetherNumber of thiazole ringsNumber of unbranched alkanes of at least 4 members (excludes halogenated alkanes)
003719.2538[53.0387, 55.0179, 55.0545, 57.0338, 59.0492, ...[0.017272727272727273, 0.01818181818181818, 0....[314.15709999999996][0.01818181818181818]CC1C(C(C(C(O1)OC2=C(OC3=C(C(=CC(=C3C2=O)O)O)CC...2.00...2000000000
115499.2298[67.0543, 69.0698, 81.0699, 83.0492, 83.0853, ...[0.010357142857142856, 0.04642857142857143, 0....[60.02260000000001, 118.02660000000003, 160.03...[0.060714285714285714, 0.39285714285714285, 0....CC1CC2(C(C1O)C=C(C(CC3C(C3(C)C)C=C(C2=O)C)OC(=...0.01...0000000000
2261102.5777[81.0334, 83.0491, 85.0224, 85.0284, 85.0333, ...[0.14210526315789473, 0.2236842105263158, 0.01...[][]CC1C(C(C(C(O1)OC2C(OC(C(C2O)O)OCC3C(C(C(C(O3)O...4.00...0000000000
3310472.2082[72.2736, 145.0759, 148.0868, 149.0707, 172.06...[0.010697674418604652, 0.03488372093023256, 0....[135.07919999999996, 148.0632, 165.08959999999...[0.053488372093023255, 0.10465116279069768, 0....CN1C2=C(C=C(C=C2)C(=O)N(CCC(=O)O)C3=CC=CC=N3)N...0.01...0000010000
4411657.3116[55.0542, 55.9818, 57.0334, 60.2456, 69.0335, ...[0.018260869565217393, 0.013043478260869565, 0...[158.094, 210.1282, 228.1352, 246.1485, 280.16...[0.02217391304347826, 0.06086956521739131, 0.1...CCCC(=O)OCC(C(C(CN1C2=C(C=C(C(=C2)C)C)N=C3C1=N...1.00...0000000000
..................................................................
237237490268.1541[86.0599, 109.065, 109.1013, 121.101, 123.1168...[0.10689655172413794, 0.04482758620689655, 0.0...[18.01030000000003, 36.02120000000002, 46.0054...[0.21724137931034482, 0.06206896551724138, 1.0...CC1CCC2C1C(=O)OC(C2C)NC(=O)C3C(O3)C0.00...0000000000
238238491411.3254[55.0544, 57.0701, 67.0544, 69.0699, 81.0699, ...[0.0215625, 0.053125, 0.015, 0.34375, 0.046875...[18.01060000000001, 88.08850000000001, 142.135...[0.153125, 0.02125, 0.01875, 0.02625, 0.028437...CC(C)C(C)C=CC(C)C1CCC2C1(CCC3=C2C(=O)C=C4C3(CC...0.00...0000000000
239239492430.2432[50.4192, 52.9216, 74.1355, 81.0699, 86.4846, ...[0.05555555555555555, 0.058333333333333334, 0....[98.63150000000002, 179.0794, 197.0896, 214.55...[0.06666666666666667, 0.2777777777777778, 1.0,...CC1CCC(C2(C1=CC(CC2)C(=C)C(=O)O)C)OC3C(C(C(C(O...1.01...0000000000
240240495578.2076[54.5814, 57.0338, 61.0286, 69.0335, 81.0331, ...[0.01, 0.010769230769230769, 0.038461538461538...[160.76359999999994][0.011923076923076923]COC1=C2C(=CC(=C1OC3C(C(C(C(O3)CO)O)O)OC4C(C(C(...2.01...0000000000
241241500243.1014[71.9563, 104.546, 105.0698, 107.0491, 107.056...[0.015833333333333335, 0.018333333333333333, 0...[24.813600000000008, 42.197700000000026, 94.04...[0.025833333333333333, 0.0175, 0.065, 1.0, 0.6...C1C(COC2=C1C=CC(=C2)O)C3=CC=C(C=C3)O0.00...2000000000
\n", 1080 | "

242 rows × 68 columns

\n", 1081 | "
" 1082 | ], 1083 | "text/plain": [ 1084 | " Unnamed: 0 spectrum_id precursor_mz \\\n", 1085 | "0 0 3 719.2538 \n", 1086 | "1 1 5 499.2298 \n", 1087 | "2 2 6 1102.5777 \n", 1088 | "3 3 10 472.2082 \n", 1089 | "4 4 11 657.3116 \n", 1090 | ".. ... ... ... \n", 1091 | "237 237 490 268.1541 \n", 1092 | "238 238 491 411.3254 \n", 1093 | "239 239 492 430.2432 \n", 1094 | "240 240 495 578.2076 \n", 1095 | "241 241 500 243.1014 \n", 1096 | "\n", 1097 | " mzs \\\n", 1098 | "0 [53.0387, 55.0179, 55.0545, 57.0338, 59.0492, ... \n", 1099 | "1 [67.0543, 69.0698, 81.0699, 83.0492, 83.0853, ... \n", 1100 | "2 [81.0334, 83.0491, 85.0224, 85.0284, 85.0333, ... \n", 1101 | "3 [72.2736, 145.0759, 148.0868, 149.0707, 172.06... \n", 1102 | "4 [55.0542, 55.9818, 57.0334, 60.2456, 69.0335, ... \n", 1103 | ".. ... \n", 1104 | "237 [86.0599, 109.065, 109.1013, 121.101, 123.1168... \n", 1105 | "238 [55.0544, 57.0701, 67.0544, 69.0699, 81.0699, ... \n", 1106 | "239 [50.4192, 52.9216, 74.1355, 81.0699, 86.4846, ... \n", 1107 | "240 [54.5814, 57.0338, 61.0286, 69.0335, 81.0331, ... \n", 1108 | "241 [71.9563, 104.546, 105.0698, 107.0491, 107.056... \n", 1109 | "\n", 1110 | " intensities \\\n", 1111 | "0 [0.017272727272727273, 0.01818181818181818, 0.... \n", 1112 | "1 [0.010357142857142856, 0.04642857142857143, 0.... \n", 1113 | "2 [0.14210526315789473, 0.2236842105263158, 0.01... \n", 1114 | "3 [0.010697674418604652, 0.03488372093023256, 0.... \n", 1115 | "4 [0.018260869565217393, 0.013043478260869565, 0... \n", 1116 | ".. ... \n", 1117 | "237 [0.10689655172413794, 0.04482758620689655, 0.0... \n", 1118 | "238 [0.0215625, 0.053125, 0.015, 0.34375, 0.046875... \n", 1119 | "239 [0.05555555555555555, 0.058333333333333334, 0.... \n", 1120 | "240 [0.01, 0.010769230769230769, 0.038461538461538... \n", 1121 | "241 [0.015833333333333335, 0.018333333333333333, 0... \n", 1122 | "\n", 1123 | " loss_mzs \\\n", 1124 | "0 [314.15709999999996] \n", 1125 | "1 [60.02260000000001, 118.02660000000003, 160.03... \n", 1126 | "2 [] \n", 1127 | "3 [135.07919999999996, 148.0632, 165.08959999999... \n", 1128 | "4 [158.094, 210.1282, 228.1352, 246.1485, 280.16... \n", 1129 | ".. ... \n", 1130 | "237 [18.01030000000003, 36.02120000000002, 46.0054... \n", 1131 | "238 [18.01060000000001, 88.08850000000001, 142.135... \n", 1132 | "239 [98.63150000000002, 179.0794, 197.0896, 214.55... \n", 1133 | "240 [160.76359999999994] \n", 1134 | "241 [24.813600000000008, 42.197700000000026, 94.04... \n", 1135 | "\n", 1136 | " loss_intensities \\\n", 1137 | "0 [0.01818181818181818] \n", 1138 | "1 [0.060714285714285714, 0.39285714285714285, 0.... \n", 1139 | "2 [] \n", 1140 | "3 [0.053488372093023255, 0.10465116279069768, 0.... \n", 1141 | "4 [0.02217391304347826, 0.06086956521739131, 0.1... \n", 1142 | ".. ... \n", 1143 | "237 [0.21724137931034482, 0.06206896551724138, 1.0... \n", 1144 | "238 [0.153125, 0.02125, 0.01875, 0.02625, 0.028437... \n", 1145 | "239 [0.06666666666666667, 0.2777777777777778, 1.0,... \n", 1146 | "240 [0.011923076923076923] \n", 1147 | "241 [0.025833333333333333, 0.0175, 0.065, 1.0, 0.6... \n", 1148 | "\n", 1149 | " smiles_preprocessed num_of_sugars \\\n", 1150 | "0 CC1C(C(C(C(O1)OC2=C(OC3=C(C(=CC(=C3C2=O)O)O)CC... 2.0 \n", 1151 | "1 CC1CC2(C(C1O)C=C(C(CC3C(C3(C)C)C=C(C2=O)C)OC(=... 0.0 \n", 1152 | "2 CC1C(C(C(C(O1)OC2C(OC(C(C2O)O)OCC3C(C(C(C(O3)O... 4.0 \n", 1153 | "3 CN1C2=C(C=C(C=C2)C(=O)N(CCC(=O)O)C3=CC=CC=N3)N... 0.0 \n", 1154 | "4 CCCC(=O)OCC(C(C(CN1C2=C(C=C(C(=C2)C)C)N=C3C1=N... 1.0 \n", 1155 | ".. ... ... 
\n", 1156 | "237 CC1CCC2C1C(=O)OC(C2C)NC(=O)C3C(O3)C 0.0 \n", 1157 | "238 CC(C)C(C)C=CC(C)C1CCC2C1(CCC3=C2C(=O)C=C4C3(CC... 0.0 \n", 1158 | "239 CC1CCC(C2(C1=CC(CC2)C(=C)C(=O)O)C)OC3C(C(C(C(O... 1.0 \n", 1159 | "240 COC1=C2C(=CC(=C1OC3C(C(C(C(O3)CO)O)O)OC4C(C(C(... 2.0 \n", 1160 | "241 C1C(COC2=C1C=CC(=C2)O)C3=CC=C(C=C3)O 0.0 \n", 1161 | "\n", 1162 | " Number of aliphatic carboxylic acids ... Number of phenols \\\n", 1163 | "0 0 ... 2 \n", 1164 | "1 1 ... 0 \n", 1165 | "2 0 ... 0 \n", 1166 | "3 1 ... 0 \n", 1167 | "4 0 ... 0 \n", 1168 | ".. ... ... ... \n", 1169 | "237 0 ... 0 \n", 1170 | "238 0 ... 0 \n", 1171 | "239 1 ... 0 \n", 1172 | "240 1 ... 0 \n", 1173 | "241 0 ... 2 \n", 1174 | "\n", 1175 | " Number of phosphoric acid groups Number of phosphoric ester groups \\\n", 1176 | "0 0 0 \n", 1177 | "1 0 0 \n", 1178 | "2 0 0 \n", 1179 | "3 0 0 \n", 1180 | "4 0 0 \n", 1181 | ".. ... ... \n", 1182 | "237 0 0 \n", 1183 | "238 0 0 \n", 1184 | "239 0 0 \n", 1185 | "240 0 0 \n", 1186 | "241 0 0 \n", 1187 | "\n", 1188 | " Number of piperdine rings Number of primary amides \\\n", 1189 | "0 0 0 \n", 1190 | "1 0 0 \n", 1191 | "2 0 0 \n", 1192 | "3 0 0 \n", 1193 | "4 0 0 \n", 1194 | ".. ... ... \n", 1195 | "237 0 0 \n", 1196 | "238 0 0 \n", 1197 | "239 0 0 \n", 1198 | "240 0 0 \n", 1199 | "241 0 0 \n", 1200 | "\n", 1201 | " Number of pyridine rings Number of quaternary nitrogens \\\n", 1202 | "0 0 0 \n", 1203 | "1 0 0 \n", 1204 | "2 0 0 \n", 1205 | "3 1 0 \n", 1206 | "4 0 0 \n", 1207 | ".. ... ... \n", 1208 | "237 0 0 \n", 1209 | "238 0 0 \n", 1210 | "239 0 0 \n", 1211 | "240 0 0 \n", 1212 | "241 0 0 \n", 1213 | "\n", 1214 | " Number of thioether Number of thiazole rings \\\n", 1215 | "0 0 0 \n", 1216 | "1 0 0 \n", 1217 | "2 0 0 \n", 1218 | "3 0 0 \n", 1219 | "4 0 0 \n", 1220 | ".. ... ... \n", 1221 | "237 0 0 \n", 1222 | "238 0 0 \n", 1223 | "239 0 0 \n", 1224 | "240 0 0 \n", 1225 | "241 0 0 \n", 1226 | "\n", 1227 | " Number of unbranched alkanes of at least 4 members (excludes halogenated alkanes) \n", 1228 | "0 0 \n", 1229 | "1 0 \n", 1230 | "2 0 \n", 1231 | "3 0 \n", 1232 | "4 0 \n", 1233 | ".. ... 
\n", 1234 | "237 0 \n", 1235 | "238 0 \n", 1236 | "239 0 \n", 1237 | "240 0 \n", 1238 | "241 0 \n", 1239 | "\n", 1240 | "[242 rows x 68 columns]" 1241 | ] 1242 | }, 1243 | "execution_count": 33, 1244 | "metadata": {}, 1245 | "output_type": "execute_result" 1246 | } 1247 | ], 1248 | "source": [ 1249 | "df_train = pd.read_csv(\"casmi_func_groups_2201.tsv\",sep=\"\\t\")\n", 1250 | "#data_df = pd.read_csv(\"/Users/delser/mass2smiles/retrain/nist/all_HRMS_validation_16122022_cddd_refine.tsv\",sep=\"\\t\")\n", 1251 | "df_train=df_train.dropna()\n", 1252 | "df_train" 1253 | ] 1254 | }, 1255 | { 1256 | "cell_type": "code", 1257 | "execution_count": 9, 1258 | "id": "0d79b609-3512-4b19-950c-be76089d8a96", 1259 | "metadata": {}, 1260 | "outputs": [], 1261 | "source": [ 1262 | "df_wrong.to_csv('loss_fail_matchms.tsv',sep='\\t')" 1263 | ] 1264 | }, 1265 | { 1266 | "cell_type": "code", 1267 | "execution_count": 34, 1268 | "id": "62e7f44e-b7fc-4154-a48c-c6f12b02ddb9", 1269 | "metadata": {}, 1270 | "outputs": [], 1271 | "source": [ 1272 | "def positional_encoding(max_position, d_model, min_freq=1e-6):\n", 1273 | " position = np.arange(max_position)\n", 1274 | " freqs = min_freq**(2*(np.arange(d_model)//2)/d_model)\n", 1275 | " pos_enc = position.reshape(-1,1)*freqs.reshape(1,-1)\n", 1276 | " pos_enc[:, ::2] = np.cos(pos_enc[:, ::2])\n", 1277 | " pos_enc[:, 1::2] = np.sin(pos_enc[:, 1::2])\n", 1278 | " return pos_enc\n", 1279 | "\n", 1280 | "def trun_n_d(n,d):\n", 1281 | " return ( n if not n.find('.')+1 else n[:n.find('.')+d+1] )" 1282 | ] 1283 | }, 1284 | { 1285 | "cell_type": "code", 1286 | "execution_count": 35, 1287 | "id": "23408430-22d9-4190-a930-54743b8a48ce", 1288 | "metadata": {}, 1289 | "outputs": [], 1290 | "source": [ 1291 | "P=positional_encoding(200000,256, min_freq=1e2)\n", 1292 | "#np.save('positions_512_1e2.npy',P)" 1293 | ] 1294 | }, 1295 | { 1296 | "cell_type": "code", 1297 | "execution_count": 11, 1298 | "id": "84cb2056-f65f-4a64-8139-ddae05cefe63", 1299 | "metadata": {}, 1300 | "outputs": [], 1301 | "source": [ 1302 | "#matchms mgf encoding\n", 1303 | "\n", 1304 | "def prepro_specs_train(df):\n", 1305 | " valid=[]\n", 1306 | " precs=df['precursor_mz'].to_list()\n", 1307 | " mzs=df['mzs'].to_list()\n", 1308 | " ints=df['intensities'].to_list()\n", 1309 | " loss_mzs=df['loss_mzs'].to_list()\n", 1310 | " loss_ints=df['loss_intensities'].to_list()\n", 1311 | " for one_pre,one_mzs,one_ints,one_loss,one_loss_ints in tqdm(zip(precs,mzs,ints,loss_mzs,loss_ints)):\n", 1312 | " mz_list=[round(float(trun_n_d(str(one_pre),2))*100)] # add precursor mz\n", 1313 | " intes_list=[2.0] # add precursor int\n", 1314 | " res = dict(zip(one_mzs+one_loss, one_ints+one_loss_ints)) # order by mzs\n", 1315 | " res=dict(sorted(res.items()))\n", 1316 | " for m,i in zip(list(res.keys()), list(res.values())): # change this from mgf from matchms\n", 1317 | " mz=round(float(trun_n_d(str(m),2))*100)\n", 1318 | " mz_list.append(mz)\n", 1319 | " intens=round(i,4)\n", 1320 | " intes_list.append(intens)\n", 1321 | " int_mzs=[intes_list,mz_list] \n", 1322 | " valid.append(int_mzs) # put intesities at first\n", 1323 | " return tf.ragged.constant(valid)\n", 1324 | " " 1325 | ] 1326 | }, 1327 | { 1328 | "cell_type": "code", 1329 | "execution_count": 36, 1330 | "id": "c1230761-5a62-431e-906b-a403f2127558", 1331 | "metadata": {}, 1332 | "outputs": [], 1333 | "source": [ 1334 | "def prepro_specs_train(df):\n", 1335 | " valid=[]\n", 1336 | " precs=df['precursor_mz'].to_list()\n", 1337 | " mzs=df['mzs'].to_list()\n", 1338 | " 
ints=df['intensities'].to_list()\n", 1339 | " loss_mzs=df['loss_mzs'].to_list()\n", 1340 | " loss_ints=df['loss_intensities'].to_list()\n", 1341 | " for one_pre,one_mzs,one_ints,one_loss,one_loss_ints in tqdm(zip(precs,mzs,ints,loss_mzs,loss_ints)):\n", 1342 | " mz_list=[round(float(trun_n_d(str(one_pre),2))*100)] # add precursor mz\n", 1343 | " intes_list=[2.0] # add precursor int\n", 1344 | " res = dict(zip(json.loads(one_mzs)+json.loads(one_loss), json.loads(one_ints)+json.loads(one_loss_ints))) # order by mzs\n", 1345 | " res=dict(sorted(res.items()))\n", 1346 | " for m,i in zip(list(res.keys()), list(res.values())): # change this from mgf from matchms\n", 1347 | " mz=round(float(trun_n_d(str(m),2))*100)\n", 1348 | " mz_list.append(mz)\n", 1349 | " intens=round(i,4)\n", 1350 | " intes_list.append(intens)\n", 1351 | " int_mzs=[intes_list,mz_list] \n", 1352 | " valid.append(int_mzs) # put intesities at first\n", 1353 | " return tf.ragged.constant(valid)" 1354 | ] 1355 | }, 1356 | { 1357 | "cell_type": "code", 1358 | "execution_count": 37, 1359 | "id": "e63172d5-e297-4760-91c1-2c4e1eae540e", 1360 | "metadata": {}, 1361 | "outputs": [ 1362 | { 1363 | "name": "stderr", 1364 | "output_type": "stream", 1365 | "text": [ 1366 | "242it [00:00, 1234.65it/s]\n" 1367 | ] 1368 | }, 1369 | { 1370 | "name": "stdout", 1371 | "output_type": "stream", 1372 | "text": [ 1373 | "CPU times: total: 516 ms\n", 1374 | "Wall time: 475 ms\n" 1375 | ] 1376 | } 1377 | ], 1378 | "source": [ 1379 | "%%time\n", 1380 | "train=prepro_specs_train(df_train)" 1381 | ] 1382 | }, 1383 | { 1384 | "cell_type": "code", 1385 | "execution_count": 17, 1386 | "id": "c7c0941a-7433-4d4c-b78e-572fe1ba72a0", 1387 | "metadata": {}, 1388 | "outputs": [ 1389 | { 1390 | "data": { 1391 | "text/plain": [ 1392 | "TensorShape([5, None, None])" 1393 | ] 1394 | }, 1395 | "execution_count": 17, 1396 | "metadata": {}, 1397 | "output_type": "execute_result" 1398 | } 1399 | ], 1400 | "source": [ 1401 | "tf.gather(train, [0, 1, 2, 3, 4]).shape" 1402 | ] 1403 | }, 1404 | { 1405 | "cell_type": "code", 1406 | "execution_count": 33, 1407 | "id": "54d9db5e-77e0-4718-8565-b9ddd4f45063", 1408 | "metadata": {}, 1409 | "outputs": [ 1410 | { 1411 | "data": { 1412 | "text/plain": [ 1413 | "TensorShape([32, None, None])" 1414 | ] 1415 | }, 1416 | "execution_count": 33, 1417 | "metadata": {}, 1418 | "output_type": "execute_result" 1419 | } 1420 | ], 1421 | "source": [ 1422 | "train[0:32].shape" 1423 | ] 1424 | }, 1425 | { 1426 | "cell_type": "code", 1427 | "execution_count": 26, 1428 | "id": "57acb8e7-faaa-42d5-a7ca-89b969e20a0f", 1429 | "metadata": {}, 1430 | "outputs": [ 1431 | { 1432 | "data": { 1433 | "text/plain": [ 1434 | "253" 1435 | ] 1436 | }, 1437 | "execution_count": 26, 1438 | "metadata": {}, 1439 | "output_type": "execute_result" 1440 | } 1441 | ], 1442 | "source": [ 1443 | "length=[i[0].shape[0] for i in train]\n", 1444 | "max(length)" 1445 | ] 1446 | }, 1447 | { 1448 | "cell_type": "code", 1449 | "execution_count": 38, 1450 | "id": "38425b96-bac0-4aac-8390-77598f6d39f4", 1451 | "metadata": {}, 1452 | "outputs": [], 1453 | "source": [ 1454 | "dimn=256\n", 1455 | "def encoding(rag_tensor,P,dimn):\n", 1456 | " to_pad=[]\n", 1457 | " for sample in rag_tensor:\n", 1458 | " all_dim=[sample[0].numpy().tolist()]\n", 1459 | " pos_enc=[P[int(i)-1] for i in sample[1].numpy().tolist()]\n", 1460 | " for dim in range(dimn):\n", 1461 | " dim_n=[i[dim] for i in pos_enc]\n", 1462 | " all_dim.append(dim_n)\n", 1463 | " to_pad.append(all_dim)\n", 1464 | " 
to_pad=[tf.keras.preprocessing.sequence.pad_sequences(i,maxlen=501,dtype='float32',padding='post',truncating='post',value=10) for i in to_pad]\n", 1465 | " to_pad=np.stack((to_pad))\n", 1466 | " to_pad=np.swapaxes(to_pad, 1, -1)\n", 1467 | " return to_pad" 1468 | ] 1469 | }, 1470 | { 1471 | "cell_type": "code", 1472 | "execution_count": 38, 1473 | "id": "6aa1bde2-2aa0-4508-bbe9-614fefa5c73b", 1474 | "metadata": {}, 1475 | "outputs": [ 1476 | { 1477 | "data": { 1478 | "text/plain": [ 1479 | "array([ 0.0717 , 0.5332156 , -0.8459794 , 0.01143887, -0.99993455,\n", 1480 | " 0.28827846, -0.95754665, 0.9954703 , 0.09507313, -0.95233387,\n", 1481 | " 0.3050577 , 0.59969896, 0.8002257 , 0.57598245, 0.8174621 ,\n", 1482 | " -0.996885 , -0.07886912, -0.17545353, 0.9844877 , -0.9999982 ,\n", 1483 | " -0.00188863, 0.1089685 , 0.9940452 , -0.98186713, -0.18957031,\n", 1484 | " -0.95267904, 0.30397803, 0.9439646 , 0.33004668, -0.80919385,\n", 1485 | " 0.58754176, -0.9424226 , -0.3344243 , -0.86887133, -0.49503803,\n", 1486 | " -0.94475156, -0.32778722, -0.9721955 , -0.2341707 , -0.7721371 ,\n", 1487 | " -0.63545597, 0.5649307 , -0.82513833, -0.4526213 , 0.89170283,\n", 1488 | " 0.68732804, 0.72634715, -0.56023455, 0.82833403, 0.93247634,\n", 1489 | " 0.36123106, -0.654053 , 0.75644875, -0.6053066 , 0.7959924 ,\n", 1490 | " 0.69717646, 0.7168996 , 0.22841789, -0.9735632 , -0.92402524,\n", 1491 | " 0.38233152, 0.37131587, 0.9285066 , 0.55711097, 0.8304381 ,\n", 1492 | " -0.9599877 , 0.28004214, 0.25998947, 0.96561146, -0.96464807,\n", 1493 | " -0.26354155, -0.84793615, -0.5300983 , -0.9946696 , 0.10311342,\n", 1494 | " -0.6722058 , 0.7403644 , -0.7557188 , 0.6548962 , -0.3556158 ,\n", 1495 | " -0.93463224, -0.9336576 , 0.3581669 , 0.2960882 , -0.9551606 ,\n", 1496 | " 0.90296847, -0.42970684, 0.9975964 , 0.06929295, 0.16034408,\n", 1497 | " 0.9870612 , 0.04580148, -0.99895054, 0.33682257, -0.94156814,\n", 1498 | " -0.90871364, -0.41742012, -0.05195929, 0.9986492 , 0.8410124 ,\n", 1499 | " 0.5410158 , -0.2589479 , 0.9658913 , 0.83173 , 0.5551804 ,\n", 1500 | " -0.07052226, -0.9975102 , -0.99015146, 0.1400004 , -0.904356 ,\n", 1501 | " -0.42677885, -0.22271346, 0.974884 , 0.82213503, -0.5692925 ,\n", 1502 | " -0.22886883, -0.9734573 , 0.99570584, 0.09257355, 0.73236376,\n", 1503 | " 0.6809136 , -0.5831401 , 0.8123716 , 0.97060597, 0.24067426,\n", 1504 | " -0.9765509 , -0.21528676, 0.3335899 , 0.94271827, -0.7904735 ,\n", 1505 | " 0.61249626, -0.99346447, -0.1141419 , 0.9764814 , -0.21560162,\n", 1506 | " 0.5799171 , 0.8146755 , -0.9991347 , -0.04159042, -0.9890586 ,\n", 1507 | " 0.1475232 , -0.989487 , -0.14462198, 0.7041925 , 0.7100091 ,\n", 1508 | " 0.19167444, -0.98145854, 0.99982166, 0.01888514, -0.46084157,\n", 1509 | " 0.8874824 , -0.33013698, 0.943933 , -0.9985991 , -0.05291281,\n", 1510 | " -0.97547275, -0.2201203 , 0.99999547, 0.00300953, -0.9642097 ,\n", 1511 | " -0.26514086, 0.4496711 , -0.8931942 , -0.98045963, -0.19672027,\n", 1512 | " -0.7123622 , 0.701812 , 0.8237492 , 0.56695443, -0.96114033,\n", 1513 | " 0.2760603 , 0.01536015, 0.99988204, -0.27245298, 0.9621691 ,\n", 1514 | " -0.94391733, 0.3301819 , -0.85027707, -0.52633536, -0.6628971 ,\n", 1515 | " -0.7487105 , -0.9762616 , -0.21659462, -0.13901651, 0.99029005,\n", 1516 | " 0.66964203, -0.742684 , -0.7097949 , 0.7044084 , 0.7883553 ,\n", 1517 | " -0.6152202 , -0.930816 , -0.36548817, -0.9543821 , -0.2985879 ,\n", 1518 | " 0.8720212 , 0.4894681 , -0.8894411 , 0.45704985, 0.7674 ,\n", 1519 | " -0.6411686 , 0.5122702 , -0.8588243 , 
-0.46367675, -0.88600445,\n", 1520 | " 0.65450615, -0.75605667, 0.88037896, -0.47427094, -0.86710006,\n", 1521 | " 0.49813405, 0.9908141 , -0.135231 , -0.26741374, 0.9635818 ,\n", 1522 | " 0.7690229 , 0.63922125, 0.03958566, 0.9992162 , -0.89686114,\n", 1523 | " -0.44231218, 0.95714754, 0.28960076, -0.3952544 , -0.9185717 ,\n", 1524 | " -0.86790514, -0.49672997, 0.8436555 , -0.53688484, 0.86669165,\n", 1525 | " -0.49884427, 0.9220753 , -0.3870106 , -0.7610874 , 0.64864933,\n", 1526 | " 0.16541438, -0.9862242 , -0.29848945, -0.95441294, 0.6250325 ,\n", 1527 | " -0.78059876, 0.9744055 , 0.22479734, 0.673075 , 0.7395742 ,\n", 1528 | " 0.6001858 , 0.7998606 , 0.35591003, 0.93452024, -0.9983589 ,\n", 1529 | " 0.05726733, -0.75771755, 0.65258265, -0.563865 , 0.82586694,\n", 1530 | " 0.9665189 , -0.2565955 ], dtype=float32)" 1531 | ] 1532 | }, 1533 | "execution_count": 38, 1534 | "metadata": {}, 1535 | "output_type": "execute_result" 1536 | } 1537 | ], 1538 | "source": [ 1539 | "xtrain[0][1]" 1540 | ] 1541 | }, 1542 | { 1543 | "cell_type": "code", 1544 | "execution_count": 39, 1545 | "id": "adbccd44-0de9-4492-8a39-ac30cd2f29fa", 1546 | "metadata": {}, 1547 | "outputs": [ 1548 | { 1549 | "name": "stdout", 1550 | "output_type": "stream", 1551 | "text": [ 1552 | "CPU times: total: 7.39 s\n", 1553 | "Wall time: 7.39 s\n" 1554 | ] 1555 | } 1556 | ], 1557 | "source": [ 1558 | "%%time\n", 1559 | "xtrain=encoding(train,P,dimn)" 1560 | ] 1561 | }, 1562 | { 1563 | "cell_type": "code", 1564 | "execution_count": 40, 1565 | "id": "56740753-1960-4e35-9cb9-08951cd8aa42", 1566 | "metadata": {}, 1567 | "outputs": [ 1568 | { 1569 | "name": "stdout", 1570 | "output_type": "stream", 1571 | "text": [ 1572 | "CPU times: total: 3.28 s\n", 1573 | "Wall time: 3.29 s\n" 1574 | ] 1575 | } 1576 | ], 1577 | "source": [ 1578 | "%%time\n", 1579 | "np.save('casmi_specs.npy',xtrain)" 1580 | ] 1581 | }, 1582 | { 1583 | "cell_type": "code", 1584 | "execution_count": null, 1585 | "id": "73610487-9ad9-4bdc-a6c6-18f7f72298df", 1586 | "metadata": {}, 1587 | "outputs": [], 1588 | "source": [] 1589 | } 1590 | ], 1591 | "metadata": { 1592 | "kernelspec": { 1593 | "display_name": "Python 3 (ipykernel)", 1594 | "language": "python", 1595 | "name": "python3" 1596 | }, 1597 | "language_info": { 1598 | "codemirror_mode": { 1599 | "name": "ipython", 1600 | "version": 3 1601 | }, 1602 | "file_extension": ".py", 1603 | "mimetype": "text/x-python", 1604 | "name": "python", 1605 | "nbconvert_exporter": "python", 1606 | "pygments_lexer": "ipython3", 1607 | "version": "3.9.15" 1608 | } 1609 | }, 1610 | "nbformat": 4, 1611 | "nbformat_minor": 5 1612 | } 1613 | --------------------------------------------------------------------------------
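The `prepro_specs_train` and `encoding` cells in `preprocessing_onlin-v3_mgf.ipynb` turn each spectrum into a pair of sequences (intensities and integer-scaled m/z values), look every m/z index up in a sinusoidal positional-encoding table, and pad the result to a fixed length before saving it as `casmi_specs.npy`. The sketch below isolates that idea in plain NumPy so it can be read without the DataFrame plumbing; the peak values, the small embedding dimension (8 instead of 256) and the short padded length (6 instead of 501) are made up for illustration, and NumPy padding stands in for the `tf.keras` `pad_sequences` call used in the notebook.

```python
import numpy as np

def positional_encoding(max_position, d_model, min_freq=1e-6):
    # Sinusoidal table with one row per integer m/z position and d_model columns
    # (same formulation as in preprocessing_onlin-v3_mgf.ipynb).
    position = np.arange(max_position)
    freqs = min_freq ** (2 * (np.arange(d_model) // 2) / d_model)
    pos_enc = position.reshape(-1, 1) * freqs.reshape(1, -1)
    pos_enc[:, ::2] = np.cos(pos_enc[:, ::2])
    pos_enc[:, 1::2] = np.sin(pos_enc[:, 1::2])
    return pos_enc

def encode_spectrum(precursor_mz, mzs, intensities, table, max_len=6):
    # Truncate each m/z to two decimals and scale by 100 to get an integer index
    # (the notebook does this via its trun_n_d helper), prepend the precursor
    # with a sentinel intensity of 2.0, and keep the peaks sorted by m/z.
    peaks = sorted(zip(mzs, intensities))
    idx = [int(precursor_mz * 100)] + [int(mz * 100) for mz, _ in peaks]
    ints = [2.0] + [round(i, 4) for _, i in peaks]
    # Look up the positional-encoding row for each m/z index and attach the
    # intensity as the first column.
    rows = np.stack([table[i - 1] for i in idx])              # (n_peaks + 1, d_model)
    feat = np.concatenate([np.array(ints)[:, None], rows], axis=1)
    # Pad (or truncate) to a fixed sequence length; 10 is the padding value the
    # notebook passes to pad_sequences.
    out = np.full((max_len, feat.shape[1]), 10.0, dtype=np.float32)
    n = min(max_len, feat.shape[0])
    out[:n] = feat[:n]
    return out

# Hypothetical three-peak spectrum, purely illustrative.
P = positional_encoding(200000, 8, min_freq=1e2)
x = encode_spectrum(235.1691, [53.0389, 55.0181, 57.0338], [0.07, 0.02, 0.18], P)
print(x.shape)  # (6, 9): one intensity column plus 8 positional-encoding dimensions
```

The first column of each padded row is the intensity (2.0 flags the precursor), and the remaining columns are the sinusoidal embedding of the integer m/z, matching the layout produced by the notebook's `encoding` function before the arrays are stacked and saved.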