├── .gitattributes ├── .gitignore ├── .vscode └── settings.json ├── README.md ├── compare_performance.py ├── coord_tf.py ├── example.tsv ├── imgs ├── hcd1.png ├── hcd2.png └── model.png ├── predfull.py └── train_model.py /.gitattributes: -------------------------------------------------------------------------------- 1 | example_prediction.mgf filter=lfs diff=lfs merge=lfs -text 2 | *.mgf filter=lfs diff=lfs merge=lfs -text 3 | *.h5 filter=lfs diff=lfs merge=lfs -text 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | 4 | *.mgf 5 | *.h5 6 | *.hdf5 7 | 8 | # User-specific files 9 | *.suo 10 | *.user 11 | *.userosscache 12 | *.sln.docstates 13 | 14 | # User-specific files (MonoDevelop/Xamarin Studio) 15 | *.userprefs 16 | 17 | # Build results 18 | [Dd]ebug/ 19 | [Dd]ebugPublic/ 20 | [Rr]elease/ 21 | [Rr]eleases/ 22 | x64/ 23 | x86/ 24 | bld/ 25 | [Bb]in/ 26 | [Oo]bj/ 27 | [Ll]og/ 28 | 29 | # Visual Studio 2015 cache/options directory 30 | .vs/ 31 | # Uncomment if you have tasks that create the project's static files in wwwroot 32 | #wwwroot/ 33 | 34 | # MSTest test Results 35 | [Tt]est[Rr]esult*/ 36 | [Bb]uild[Ll]og.* 37 | 38 | # NUNIT 39 | *.VisualState.xml 40 | TestResult.xml 41 | 42 | # Build Results of an ATL Project 43 | [Dd]ebugPS/ 44 | [Rr]eleasePS/ 45 | dlldata.c 46 | 47 | # DNX 48 | project.lock.json 49 | project.fragment.lock.json 50 | artifacts/ 51 | 52 | *_i.c 53 | *_p.c 54 | *_i.h 55 | *.ilk 56 | *.meta 57 | *.obj 58 | *.pch 59 | *.pdb 60 | *.pgc 61 | *.pgd 62 | *.rsp 63 | *.sbr 64 | *.tlb 65 | *.tli 66 | *.tlh 67 | *.tmp 68 | *.tmp_proj 69 | *.log 70 | *.vspscc 71 | *.vssscc 72 | .builds 73 | *.pidb 74 | *.svclog 75 | *.scc 76 | 77 | # Chutzpah Test files 78 | _Chutzpah* 79 | 80 | # Visual C++ cache files 81 | ipch/ 82 | *.aps 83 | *.ncb 84 | *.opendb 85 | *.opensdf 86 | *.sdf 87 | *.cachefile 88 | *.VC.db 89 | *.VC.VC.opendb 90 | 91 | # Visual Studio profiler 92 | *.psess 93 | *.vsp 94 | *.vspx 95 | *.sap 96 | 97 | # TFS 2012 Local Workspace 98 | $tf/ 99 | 100 | # Guidance Automation Toolkit 101 | *.gpState 102 | 103 | # ReSharper is a .NET coding add-in 104 | _ReSharper*/ 105 | *.[Rr]e[Ss]harper 106 | *.DotSettings.user 107 | 108 | # JustCode is a .NET coding add-in 109 | .JustCode 110 | 111 | # TeamCity is a build add-in 112 | _TeamCity* 113 | 114 | # DotCover is a Code Coverage Tool 115 | *.dotCover 116 | 117 | # NCrunch 118 | _NCrunch_* 119 | .*crunch*.local.xml 120 | nCrunchTemp_* 121 | 122 | # MightyMoose 123 | *.mm.* 124 | AutoTest.Net/ 125 | 126 | # Web workbench (sass) 127 | .sass-cache/ 128 | 129 | # Installshield output folder 130 | [Ee]xpress/ 131 | 132 | # Click-Once directory 133 | publish/ 134 | 135 | # Publish Web Output 136 | *.[Pp]ublish.xml 137 | *.azurePubxml 138 | # TODO: Comment the next line if you want to checkin your web deploy settings 139 | # but database connection strings (with potential passwords) will be unencrypted 140 | #*.pubxml 141 | *.publishproj 142 | 143 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 144 | # checkin your Azure Web App publish settings, but sensitive information contained 145 | # in these scripts will be unencrypted 146 | PublishScripts/ 147 | 148 | # NuGet Packages 149 | *.nupkg 150 | # The packages folder can be ignored because of Package Restore 151 | **/packages/* 152 | # except build/, which is used as an MSBuild target. 153 | !**/packages/build/ 154 | # Uncomment if necessary however generally it will be regenerated when needed 155 | #!**/packages/repositories.config 156 | # NuGet v3's project.json files produces more ignoreable files 157 | *.nuget.props 158 | *.nuget.targets 159 | 160 | # Microsoft Azure Build Output 161 | csx/ 162 | *.build.csdef 163 | 164 | # Microsoft Azure Emulator 165 | ecf/ 166 | rcf/ 167 | 168 | # Windows Store app package directories and files 169 | AppPackages/ 170 | BundleArtifacts/ 171 | Package.StoreAssociation.xml 172 | _pkginfo.txt 173 | 174 | # Visual Studio cache files 175 | # files ending in .cache can be ignored 176 | *.[Cc]ache 177 | # but keep track of directories ending in .cache 178 | !*.[Cc]ache/ 179 | 180 | # Others 181 | ClientBin/ 182 | ~$* 183 | *~ 184 | *.dbmdl 185 | *.dbproj.schemaview 186 | *.jfm 187 | *.pfx 188 | *.publishsettings 189 | node_modules/ 190 | orleans.codegen.cs 191 | 192 | # Since there are multiple workflows, uncomment next line to ignore bower_components 193 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 194 | #bower_components/ 195 | 196 | # RIA/Silverlight projects 197 | Generated_Code/ 198 | 199 | # Backup & report files from converting an old project file 200 | # to a newer Visual Studio version. Backup files are not needed, 201 | # because we have git ;-) 202 | _UpgradeReport_Files/ 203 | Backup*/ 204 | UpgradeLog*.XML 205 | UpgradeLog*.htm 206 | 207 | # SQL Server files 208 | *.mdf 209 | *.ldf 210 | 211 | # Business Intelligence projects 212 | *.rdl.data 213 | *.bim.layout 214 | *.bim_*.settings 215 | 216 | # Microsoft Fakes 217 | FakesAssemblies/ 218 | 219 | # GhostDoc plugin setting file 220 | *.GhostDoc.xml 221 | 222 | # Node.js Tools for Visual Studio 223 | .ntvs_analysis.dat 224 | 225 | # Visual Studio 6 build log 226 | *.plg 227 | 228 | # Visual Studio 6 workspace options file 229 | *.opt 230 | 231 | # Visual Studio LightSwitch build output 232 | **/*.HTMLClient/GeneratedArtifacts 233 | **/*.DesktopClient/GeneratedArtifacts 234 | **/*.DesktopClient/ModelManifest.xml 235 | **/*.Server/GeneratedArtifacts 236 | **/*.Server/ModelManifest.xml 237 | _Pvt_Extensions 238 | 239 | # Paket dependency manager 240 | .paket/paket.exe 241 | paket-files/ 242 | 243 | # FAKE - F# Make 244 | .fake/ 245 | 246 | # JetBrains Rider 247 | .idea/ 248 | *.sln.iml 249 | 250 | # CodeRush 251 | .cr/ 252 | 253 | # Python Tools for Visual Studio (PTVS) 254 | __pycache__/ 255 | *.pyc 256 | .ipynb_checkpoints 257 | 258 | *.list 259 | *.jar 260 | *.out 261 | *.newcluster 262 | *.backup 263 | *.exe 264 | /CodeGraphData 265 | readme 266 | *.cluster 267 | *.ipynb 268 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "jupyter.jupyterServerType": "local", 3 | "[python]": { 4 | "editor.defaultFormatter": "ms-python.black-formatter" 5 | }, 6 | "python.formatting.provider": "none" 7 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PredFull 2 | 3 | __Visit [http://predfull.com/](http://predfull.com/) to try online prediction__ 4 | 5 | > This work was published on Analytical Chemistry: [`Full-Spectrum Prediction of Peptides Tandem Mass Spectra using Deep Neural Network`](https://pubs.acs.org/doi/10.1021/acs.analchem.9b04867) 6 | > 7 | > Kaiyuan Liu, Sujun Li, Lei Wang, Yuzhen Ye, Haixu Tang 8 | 9 | The first model for predicting complete tandem mass spectra from peptides sequences, using a deep CNN neural network trained on over 2 million experimental spectra. 10 | 11 | Free for academic uses. 12 | 13 | ## Update History 14 | 15 | * 2022.05.19: Support input peptide of any length 16 | * 2021.05.18: Support predicting peptides with oxidized methionine. 17 | * 2021.01.01: Update example results. 18 | * 2020.08.22: Fixed performance issues. 19 | * 2020.05.25: Support predicting non-tryptic peptides. 20 | * 2019.09.01: First version. 21 | 22 | ## Method 23 | 24 | Based on the structure of the residual convolutional networks. Current precision (bin size): 0.1 Th. 25 | 26 | ![model](imgs/model.png) 27 | 28 | ## How to use 29 | 30 | __Expect clone this project, you should download `pm.h5` from [google drive](https://drive.google.com/drive/folders/1Ca3HdV-w8TZPRa9KhPBbjrTtGSmtEIsn?usp=sharing) and place it into this folder.__ 31 | 32 | ### Important Notes 33 | 34 | * The only modification (PTM) supported is **oxidation on Methionine**, otherwise only UNMODIFIED peptides are allowed. To indicate an oxidized methionine, use the format "M(O)". 35 | * This model assumes a __FIXED__ carbamidomethyl on C 36 | * The length of input peptides are __NOT__ limited, however, would expect poor performance with peptides longer than 30 37 | * The prediction will NOT output peaks with M/z > 2000 38 | * Predicted peaks that are weaker than STRONGEST_PEAK / 1000 are regarded as noises thus will be omitted from the final output. 39 | 40 | ### Required Packages 41 | 42 | Recommend to install dependency via [Anaconda](https://www.anaconda.com/distribution/) 43 | 44 | * Python >= 3.7 45 | * Tensorflow >= 2.3.0 46 | * Pandas >= 0.20 47 | * pyteomics 48 | * lxml 49 | 50 | __The Tensorflow has to be 2.30 or newer! A compatibility bug in Tensorflow made version before 2.3.0 can't load the model correctly. We'll release a new model once the Tensorflow team solve this.__ 51 | 52 | ### Input format 53 | 54 | The required input format is TSV, with the following columns: 55 | 56 | Peptide | Charge | Type | NCE 57 | ------- | ------ | ---- | --- 58 | AAAAAAAAAVSR | 2 | HCD | 25 59 | AAGAAESEEDFLR | 2 | HCD | 25 60 | AAPAPTASSTININTSTSK | 2 | HCD | 25 61 | AAPAPM(O)NTSTSK | 2 | HCD | 25 62 | 63 | Apparently, 'Peptide' and 'Charge' columns mean what it says. The 'Type' must be HCD or ETD (in uppercase). NCE means normalized collision energy, set to 25 as default. Note that in the above examples the last peptide has an oxidized methionine, and it's the only modification supported now. Check `example.tsv` for examples. 64 | 65 | ### Usage 66 | 67 | Simply run: 68 | 69 | `python predfull.py --input example.tsv --model pm.h5 --output example_prediction.mgf` 70 | 71 | The output file is in MGF format 72 | 73 | * --input: the input file 74 | * --output: the output path 75 | * --model: the pretrained model 76 | 77 | ## Prediction Examples 78 | 79 | __Note that intensities are shown by square rooted values__ 80 | 81 | ![example 1](imgs/hcd2.png) 82 | 83 | ![example 2](imgs/hcd1.png) 84 | 85 | ## Performance Evaluation 86 | 87 | We provide sample data on [google drive](https://drive.google.com/drive/folders/1Ca3HdV-w8TZPRa9KhPBbjrTtGSmtEIsn?usp=sharing) and codes for you to evaluate the prediction performance. The `hcd_testingset.mgf` file on google drive contains ground truth spectra (randomly sampled from [NIST Human Synthetic Peptide Spectral Library](https://chemdata.nist.gov/dokuwiki/doku.php?id=peptidew:lib:kustersynselected20170530)) that corresponding to items in `example.tsv`, while the `example_prediction.mgf` file contains pre-run predictions. 88 | 89 | To evaluate the similarity, first download groud truth reference file `hcd_testingset.mgf` from [google drive](https://drive.google.com/drive/folders/1Ca3HdV-w8TZPRa9KhPBbjrTtGSmtEIsn?usp=sharing), then run: 90 | 91 | `python compare_performance.py --real hcd_testingset.mgf --pred example_prediction.mgf` 92 | 93 | * --real: the ground truth file 94 | * --pred: the prediction file 95 | 96 | You should get around ~0.789 average similarities using these two pre-given MGF files. 97 | 98 | __Make sure that items in `example.tsv` and `hcd_testingset.mgf` are of the same order! Don't permute items or add/delete items unless you will align them by yourself.__ 99 | 100 | ## How to build & train the model 101 | 102 | For those who are interested in reproducing this model, here we provide `train_model.py` of example codes to build and train the model. 103 | -------------------------------------------------------------------------------- /compare_performance.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import math 3 | import numpy as np 4 | from pyteomics import mgf, mass 5 | 6 | 7 | def norm(x): return np.linalg.norm(x) 8 | 9 | 10 | def cosine(u, v): return np.dot(u, v) / max(norm(u) * norm(v), 1e-16) 11 | 12 | 13 | DIMENSION = 20000 14 | BIN_SIZE = 0.1 15 | 16 | 17 | def spectrum2vector(mz_list, itensity_list, mass, bin_size, charge): 18 | itensity_list = itensity_list / np.max(itensity_list) 19 | 20 | vector = np.zeros(DIMENSION, dtype='float32') 21 | 22 | mz_list = np.asarray(mz_list) 23 | 24 | indexes = mz_list / bin_size 25 | indexes = np.around(indexes).astype('int32') 26 | 27 | for i, index in enumerate(indexes): 28 | if index >= DIMENSION: continue 29 | vector[index] += itensity_list[i] 30 | 31 | # normalize 32 | vector = np.sqrt(vector) 33 | 34 | # remove precursors, including isotropic precursor peaks 35 | for delta in (0, 1, 2): 36 | precursor_mz = mass + delta / charge 37 | if precursor_mz > 0 and precursor_mz < 2000: 38 | vector[round(precursor_mz / bin_size)] = 0 39 | 40 | return vector 41 | 42 | 43 | # ratio constants for NCE 44 | cr = {1: 1, 2: 0.9, 3: 0.85, 4: 0.8, 5: 0.75, 6: 0.75, 7: 0.75, 8: 0.75} 45 | 46 | 47 | def parse_spectra(sps): 48 | db = [] 49 | 50 | for sp in sps: 51 | param = sp['params'] 52 | 53 | c = int(str(param['charge'][0])[0]) 54 | 55 | if 'seq' in param: 56 | pep = param['seq'] 57 | else: 58 | pep = param['title'] 59 | 60 | if 'pepmass' in param: 61 | mass = param['pepmass'][0] 62 | else: 63 | mass = float(param['parent']) 64 | 65 | if 'hcd' in param: 66 | try: 67 | hcd = param['hcd'] 68 | if hcd[-1] == '%': 69 | hcd = float(hcd) 70 | elif hcd[-2:] == 'eV': 71 | hcd = float(hcd[:-2]) 72 | hcd = hcd * 500 * cr[c] / mass 73 | else: 74 | raise Exception("Invalid type!") 75 | except: 76 | hcd = 0 77 | else: 78 | hcd = 0 79 | 80 | mz = sp['m/z array'] 81 | it = sp['intensity array'] 82 | 83 | db.append({'pep': pep, 'charge': c, 84 | 'mass': mass, 'mz': mz, 'it': it, 'nce': hcd}) 85 | 86 | return db 87 | 88 | 89 | def readmgf(fn): 90 | file = open(fn, "r") 91 | data = mgf.read(file, convert_arrays=1, read_charges=False, 92 | dtype='float32', use_index=False) 93 | 94 | codes = parse_spectra(data) 95 | return codes 96 | 97 | 98 | parser = argparse.ArgumentParser() 99 | parser.add_argument('--real', type=str, 100 | help='Real MGF file path', default='hcd_testingset.mgf') 101 | parser.add_argument('--pred', type=str, 102 | help='predicted MGF file path', default='example_prediction.mgf') 103 | 104 | args = parser.parse_args() 105 | 106 | print('Reading', args.real) 107 | real_vectors = [spectrum2vector(sp['mz'], sp['it'], sp['mass'], BIN_SIZE, 108 | sp['charge']) for sp in readmgf(args.real)] 109 | 110 | print('Reading', args.pred) 111 | pred_vectors = [spectrum2vector(sp['mz'], sp['it'], sp['mass'], BIN_SIZE, 112 | sp['charge']) for sp in readmgf(args.pred)] 113 | 114 | similarites = [cosine(sp1, sp2) 115 | for sp1, sp2 in zip(real_vectors, pred_vectors)] 116 | 117 | print('Average Cosine similarites:', np.mean(similarites)) 118 | -------------------------------------------------------------------------------- /coord_tf.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.layers import Layer, InputSpec 2 | from tensorflow.keras import backend as K 3 | from tensorflow.keras.utils import get_custom_objects 4 | 5 | 6 | class _CoordinateChannel(Layer): 7 | """ Adds Coordinate Channels to the input tensor. 8 | 9 | # Arguments 10 | rank: An integer, the rank of the input data-uniform, 11 | e.g. "2" for 2D convolution. 12 | use_radius: Boolean flag to determine whether the 13 | radius coordinate should be added for 2D rank 14 | inputs or not. 15 | data_format: A string, 16 | one of `"channels_last"` or `"channels_first"`. 17 | The ordering of the dimensions in the inputs. 18 | `"channels_last"` corresponds to inputs with shape 19 | `(batch, ..., channels)` while `"channels_first"` corresponds to 20 | inputs with shape `(batch, channels, ...)`. 21 | It defaults to the `image_data_format` value found in your 22 | Keras config file at `~/.keras/keras.json`. 23 | If you never set it, then it will be "channels_last". 24 | 25 | # Input shape 26 | ND tensor with shape: 27 | `(samples, channels, *)` 28 | if `data_format` is `"channels_first"` 29 | or ND tensor with shape: 30 | `(samples, *, channels)` 31 | if `data_format` is `"channels_last"`. 32 | 33 | # Output shape 34 | ND tensor with shape: 35 | `(samples, channels + 2, *)` 36 | if `data_format` is `"channels_first"` 37 | or 5D tensor with shape: 38 | `(samples, *, channels + 2)` 39 | if `data_format` is `"channels_last"`. 40 | 41 | # References: 42 | - [An Intriguing Failing of Convolutional Neural Networks and the CoordConv Solution](https://arxiv.org/abs/1807.03247) 43 | """ 44 | 45 | def __init__(self, rank, 46 | use_radius=False, 47 | data_format=None, 48 | **kwargs): 49 | super(_CoordinateChannel, self).__init__(**kwargs) 50 | 51 | if data_format not in [None, 'channels_first', 'channels_last']: 52 | raise ValueError('`data_format` must be either "channels_last", "channels_first" ' 53 | 'or None.') 54 | 55 | self.rank = rank 56 | self.use_radius = use_radius 57 | self.data_format = K.image_data_format() if data_format is None else data_format 58 | self.axis = 1 if K.image_data_format() == 'channels_first' else -1 59 | 60 | self.input_spec = InputSpec(min_ndim=2) 61 | self.supports_masking = True 62 | 63 | def build(self, input_shape): 64 | assert len(input_shape) >= 2 65 | input_dim = input_shape[self.axis] 66 | 67 | self.input_spec = InputSpec(min_ndim=self.rank + 2, 68 | axes={self.axis: input_dim}) 69 | self.built = True 70 | 71 | def call(self, inputs, training=None, mask=None): 72 | input_shape = K.shape(inputs) 73 | 74 | if self.rank == 1: 75 | input_shape = [input_shape[i] for i in range(3)] 76 | batch_shape, dim, channels = input_shape 77 | 78 | xx_range = K.tile(K.expand_dims(K.arange(0, dim), axis=0), 79 | K.stack([batch_shape, 1])) 80 | xx_range = K.expand_dims(xx_range, axis=-1) 81 | 82 | xx_channels = K.cast(xx_range, K.dtype(inputs)) 83 | xx_channels = xx_channels / K.cast(dim - 1, K.dtype(inputs)) 84 | xx_channels = (xx_channels * 2) - 1. 85 | 86 | outputs = K.concatenate([inputs, xx_channels], axis=-1) 87 | 88 | if self.rank == 2: 89 | if self.data_format == 'channels_first': 90 | inputs = K.permute_dimensions(inputs, [0, 2, 3, 1]) 91 | input_shape = K.shape(inputs) 92 | 93 | input_shape = [input_shape[i] for i in range(4)] 94 | batch_shape, dim1, dim2, channels = input_shape 95 | 96 | xx_ones = K.ones(K.stack([batch_shape, dim2]), dtype='int32') 97 | xx_ones = K.expand_dims(xx_ones, axis=-1) 98 | 99 | xx_range = K.tile(K.expand_dims(K.arange(0, dim1), axis=0), 100 | K.stack([batch_shape, 1])) 101 | xx_range = K.expand_dims(xx_range, axis=1) 102 | xx_channels = K.batch_dot(xx_ones, xx_range, axes=[2, 1]) 103 | xx_channels = K.expand_dims(xx_channels, axis=-1) 104 | xx_channels = K.permute_dimensions(xx_channels, [0, 2, 1, 3]) 105 | 106 | yy_ones = K.ones(K.stack([batch_shape, dim1]), dtype='int32') 107 | yy_ones = K.expand_dims(yy_ones, axis=1) 108 | 109 | yy_range = K.tile(K.expand_dims(K.arange(0, dim2), axis=0), 110 | K.stack([batch_shape, 1])) 111 | yy_range = K.expand_dims(yy_range, axis=-1) 112 | 113 | yy_channels = K.batch_dot(yy_range, yy_ones, axes=[2, 1]) 114 | yy_channels = K.expand_dims(yy_channels, axis=-1) 115 | yy_channels = K.permute_dimensions(yy_channels, [0, 2, 1, 3]) 116 | 117 | xx_channels = K.cast(xx_channels, K.floatx()) 118 | xx_channels = xx_channels / K.cast(dim1 - 1, K.floatx()) 119 | xx_channels = (xx_channels * 2) - 1. 120 | 121 | yy_channels = K.cast(yy_channels, K.floatx()) 122 | yy_channels = yy_channels / K.cast(dim2 - 1, K.floatx()) 123 | yy_channels = (yy_channels * 2) - 1. 124 | 125 | outputs = K.concatenate([inputs, xx_channels, yy_channels], axis=-1) 126 | 127 | if self.use_radius: 128 | rr = K.sqrt(K.square(xx_channels - 0.5) + 129 | K.square(yy_channels - 0.5)) 130 | outputs = K.concatenate([outputs, rr], axis=-1) 131 | 132 | if self.data_format == 'channels_first': 133 | outputs = K.permute_dimensions(outputs, [0, 3, 1, 2]) 134 | 135 | if self.rank == 3: 136 | if self.data_format == 'channels_first': 137 | inputs = K.permute_dimensions(inputs, [0, 2, 3, 4, 1]) 138 | input_shape = K.shape(inputs) 139 | 140 | input_shape = [input_shape[i] for i in range(5)] 141 | batch_shape, dim1, dim2, dim3, channels = input_shape 142 | 143 | xx_ones = K.ones(K.stack([batch_shape, dim3]), dtype='int32') 144 | xx_ones = K.expand_dims(xx_ones, axis=-1) 145 | 146 | xx_range = K.tile(K.expand_dims(K.arange(0, dim2), axis=0), 147 | K.stack([batch_shape, 1])) 148 | xx_range = K.expand_dims(xx_range, axis=1) 149 | 150 | xx_channels = K.batch_dot(xx_ones, xx_range, axes=[2, 1]) 151 | xx_channels = K.expand_dims(xx_channels, axis=-1) 152 | xx_channels = K.permute_dimensions(xx_channels, [0, 2, 1, 3]) 153 | 154 | xx_channels = K.expand_dims(xx_channels, axis=1) 155 | xx_channels = K.tile(xx_channels, 156 | [1, dim1, 1, 1, 1]) 157 | 158 | yy_ones = K.ones(K.stack([batch_shape, dim2]), dtype='int32') 159 | yy_ones = K.expand_dims(yy_ones, axis=1) 160 | 161 | yy_range = K.tile(K.expand_dims(K.arange(0, dim3), axis=0), 162 | K.stack([batch_shape, 1])) 163 | yy_range = K.expand_dims(yy_range, axis=-1) 164 | 165 | yy_channels = K.batch_dot(yy_range, yy_ones, axes=[2, 1]) 166 | yy_channels = K.expand_dims(yy_channels, axis=-1) 167 | yy_channels = K.permute_dimensions(yy_channels, [0, 2, 1, 3]) 168 | 169 | yy_channels = K.expand_dims(yy_channels, axis=1) 170 | yy_channels = K.tile(yy_channels, 171 | [1, dim1, 1, 1, 1]) 172 | 173 | zz_range = K.tile(K.expand_dims(K.arange(0, dim1), axis=0), 174 | K.stack([batch_shape, 1])) 175 | zz_range = K.expand_dims(zz_range, axis=-1) 176 | zz_range = K.expand_dims(zz_range, axis=-1) 177 | 178 | zz_channels = K.tile(zz_range, 179 | [1, 1, dim2, dim3]) 180 | zz_channels = K.expand_dims(zz_channels, axis=-1) 181 | 182 | xx_channels = K.cast(xx_channels, K.floatx()) 183 | xx_channels = xx_channels / K.cast(dim2 - 1, K.floatx()) 184 | xx_channels = xx_channels * 2 - 1. 185 | 186 | yy_channels = K.cast(yy_channels, K.floatx()) 187 | yy_channels = yy_channels / K.cast(dim3 - 1, K.floatx()) 188 | yy_channels = yy_channels * 2 - 1. 189 | 190 | zz_channels = K.cast(zz_channels, K.floatx()) 191 | zz_channels = zz_channels / K.cast(dim1 - 1, K.floatx()) 192 | zz_channels = zz_channels * 2 - 1. 193 | 194 | outputs = K.concatenate([inputs, zz_channels, xx_channels, yy_channels], 195 | axis=-1) 196 | 197 | if self.data_format == 'channels_first': 198 | outputs = K.permute_dimensions(outputs, [0, 4, 1, 2, 3]) 199 | 200 | return outputs 201 | 202 | def compute_output_shape(self, input_shape): 203 | assert input_shape and len(input_shape) >= 2 204 | assert input_shape[self.axis] 205 | 206 | if self.use_radius and self.rank == 2: 207 | channel_count = 3 208 | else: 209 | channel_count = self.rank 210 | 211 | output_shape = list(input_shape) 212 | output_shape[self.axis] = input_shape[self.axis] + channel_count 213 | return tuple(output_shape) 214 | 215 | def get_config(self): 216 | config = { 217 | 'rank': self.rank, 218 | 'use_radius': self.use_radius, 219 | 'data_format': self.data_format 220 | } 221 | base_config = super(_CoordinateChannel, self).get_config() 222 | return dict(list(base_config.items()) + list(config.items())) 223 | 224 | 225 | class CoordinateChannel1D(_CoordinateChannel): 226 | """ Adds Coordinate Channels to the input tensor of rank 1. 227 | 228 | # Arguments 229 | data_format: A string, 230 | one of `"channels_last"` or `"channels_first"`. 231 | The ordering of the dimensions in the inputs. 232 | `"channels_last"` corresponds to inputs with shape 233 | `(batch, ..., channels)` while `"channels_first"` corresponds to 234 | inputs with shape `(batch, channels, ...)`. 235 | It defaults to the `image_data_format` value found in your 236 | Keras config file at `~/.keras/keras.json`. 237 | If you never set it, then it will be "channels_last". 238 | 239 | # Input shape 240 | 3D tensor with shape: `(batch_size, steps, input_dim)` 241 | 242 | # Output shape 243 | 3D tensor with shape: `(batch_size, steps, input_dim + 2)` 244 | 245 | # References: 246 | - [An Intriguing Failing of Convolutional Neural Networks and the CoordConv Solution](https://arxiv.org/abs/1807.03247) 247 | """ 248 | 249 | def __init__(self, data_format=None, **kwargs): 250 | super(CoordinateChannel1D, self).__init__( 251 | rank=1, 252 | use_radius=False, 253 | data_format=data_format, 254 | **kwargs 255 | ) 256 | 257 | def get_config(self): 258 | config = super(CoordinateChannel1D, self).get_config() 259 | config.pop('rank') 260 | config.pop('use_radius') 261 | return config 262 | 263 | 264 | class CoordinateChannel2D(_CoordinateChannel): 265 | """ Adds Coordinate Channels to the input tensor. 266 | 267 | # Arguments 268 | use_radius: Boolean flag to determine whether the 269 | radius coordinate should be added for 2D rank 270 | inputs or not. 271 | data_format: A string, 272 | one of `"channels_last"` or `"channels_first"`. 273 | The ordering of the dimensions in the inputs. 274 | `"channels_last"` corresponds to inputs with shape 275 | `(batch, ..., channels)` while `"channels_first"` corresponds to 276 | inputs with shape `(batch, channels, ...)`. 277 | It defaults to the `image_data_format` value found in your 278 | Keras config file at `~/.keras/keras.json`. 279 | If you never set it, then it will be "channels_last". 280 | 281 | # Input shape 282 | 4D tensor with shape: 283 | `(samples, channels, rows, cols)` 284 | if `data_format` is `"channels_first"` 285 | or 4D tensor with shape: 286 | `(samples, rows, cols, channels)` 287 | if `data_format` is `"channels_last"`. 288 | 289 | # Output shape 290 | 4D tensor with shape: 291 | `(samples, channels + 2/3, rows, cols)` 292 | if `data_format` is `"channels_first"` 293 | or 4D tensor with shape: 294 | `(samples, rows, cols, channels + 2/3)` 295 | if `data_format` is `"channels_last"`. 296 | 297 | If `use_radius` is set, then will have 3 additional filers, 298 | else only 2 additional filters will be added. 299 | 300 | # References: 301 | - [An Intriguing Failing of Convolutional Neural Networks and the CoordConv Solution](https://arxiv.org/abs/1807.03247) 302 | """ 303 | 304 | def __init__(self, use_radius=False, 305 | data_format=None, 306 | **kwargs): 307 | super(CoordinateChannel2D, self).__init__( 308 | rank=2, 309 | use_radius=use_radius, 310 | data_format=data_format, 311 | **kwargs 312 | ) 313 | 314 | def get_config(self): 315 | config = super(CoordinateChannel2D, self).get_config() 316 | config.pop('rank') 317 | return config 318 | 319 | 320 | class CoordinateChannel3D(_CoordinateChannel): 321 | """ Adds Coordinate Channels to the input tensor. 322 | 323 | # Arguments 324 | rank: An integer, the rank of the input data-uniform, 325 | e.g. "2" for 2D convolution. 326 | use_radius: Boolean flag to determine whether the 327 | radius coordinate should be added for 2D rank 328 | inputs or not. 329 | data_format: A string, 330 | one of `"channels_last"` or `"channels_first"`. 331 | The ordering of the dimensions in the inputs. 332 | `"channels_last"` corresponds to inputs with shape 333 | `(batch, ..., channels)` while `"channels_first"` corresponds to 334 | inputs with shape `(batch, channels, ...)`. 335 | It defaults to the `image_data_format` value found in your 336 | Keras config file at `~/.keras/keras.json`. 337 | If you never set it, then it will be "channels_last". 338 | 339 | # Input shape 340 | 5D tensor with shape: 341 | `(samples, channels, conv_dim1, conv_dim2, conv_dim3)` 342 | if `data_format` is `"channels_first"` 343 | or 5D tensor with shape: 344 | `(samples, conv_dim1, conv_dim2, conv_dim3, channels)` 345 | if `data_format` is `"channels_last"`. 346 | 347 | # Output shape 348 | 5D tensor with shape: 349 | `(samples, channels + 2, conv_dim1, conv_dim2, conv_dim3)` 350 | if `data_format` is `"channels_first"` 351 | or 5D tensor with shape: 352 | `(samples, conv_dim1, conv_dim2, conv_dim3, channels + 2)` 353 | if `data_format` is `"channels_last"`. 354 | 355 | # References: 356 | - [An Intriguing Failing of Convolutional Neural Networks and the CoordConv Solution](https://arxiv.org/abs/1807.03247) 357 | """ 358 | 359 | def __init__(self, data_format=None, 360 | **kwargs): 361 | super(CoordinateChannel3D, self).__init__( 362 | rank=3, 363 | use_radius=False, 364 | data_format=data_format, 365 | **kwargs 366 | ) 367 | 368 | def get_config(self): 369 | config = super(CoordinateChannel3D, self).get_config() 370 | config.pop('rank') 371 | config.pop('use_radius') 372 | return config 373 | 374 | 375 | get_custom_objects().update({'CoordinateChannel1D': CoordinateChannel1D, 376 | 'CoordinateChannel2D': CoordinateChannel2D, 377 | 'CoordinateChannel3D': CoordinateChannel3D}) 378 | -------------------------------------------------------------------------------- /imgs/hcd1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lkytal/PredFull/fc3641e0bc511e2947be7e33021ed871140db030/imgs/hcd1.png -------------------------------------------------------------------------------- /imgs/hcd2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lkytal/PredFull/fc3641e0bc511e2947be7e33021ed871140db030/imgs/hcd2.png -------------------------------------------------------------------------------- /imgs/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lkytal/PredFull/fc3641e0bc511e2947be7e33021ed871140db030/imgs/model.png -------------------------------------------------------------------------------- /predfull.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import math 6 | from pyteomics import mgf, mass 7 | import argparse 8 | 9 | import tensorflow.keras as k 10 | from tensorflow.keras import backend as K 11 | from tensorflow.keras.layers import Layer, InputSpec 12 | from tensorflow.keras.layers import ( 13 | Conv1D, 14 | MaxPooling1D, 15 | Dense, 16 | Add, 17 | Flatten, 18 | Activation, 19 | BatchNormalization, 20 | LayerNormalization, 21 | ) 22 | from tensorflow.keras import Model, Input 23 | 24 | from coord_tf import CoordinateChannel2D, CoordinateChannel1D 25 | 26 | # Hyper Parameters 27 | precision = 0.1 28 | MZ_START = 0 29 | SPEC_DIMENSION = 20000 30 | 31 | PRECURSOR_SCALE = 20000.0 32 | LENGTH_SCALE = 1000 33 | 34 | MAX_CHARGE = 30 35 | 36 | mono = { 37 | "G": 57.021464, 38 | "A": 71.037114, 39 | "S": 87.032029, 40 | "P": 97.052764, 41 | "V": 99.068414, 42 | "T": 101.04768, 43 | "C": 160.03019, 44 | "L": 113.08406, 45 | "I": 113.08406, 46 | "D": 115.02694, 47 | "Q": 128.05858, 48 | "K": 128.09496, 49 | "E": 129.04259, 50 | "M": 131.04048, 51 | "m": 147.0354, 52 | "H": 137.05891, 53 | "F": 147.06441, 54 | "R": 156.10111, 55 | "Y": 163.06333, 56 | "N": 114.04293, 57 | "W": 186.07931, 58 | "O": 147.03538, 59 | } 60 | 61 | ave_mass = { 62 | "A": 71.0788, 63 | "R": 156.1875, 64 | "N": 114.1038, 65 | "D": 115.0886, 66 | "C": 160.1598, 67 | "E": 129.1155, 68 | "Q": 128.1307, 69 | "G": 57.0519, 70 | "H": 137.1411, 71 | "I": 113.1594, 72 | "L": 113.1594, 73 | "K": 128.1741, 74 | "M": 131.1926, 75 | "F": 147.1766, 76 | "P": 97.1167, 77 | "S": 87.0782, 78 | "T": 101.1051, 79 | "W": 186.2132, 80 | "Y": 163.1760, 81 | "V": 99.1326, 82 | } 83 | 84 | Alist = list("ACDEFGHIKLMNPQRSTVWYZ") 85 | encoding_dimension = len(Alist) + 3 86 | 87 | charMap = {"*": 0, "]": len(Alist) + 1, "[": len(Alist) + 2} 88 | for i, a in enumerate(Alist): 89 | charMap[a] = i + 1 90 | 91 | 92 | # help functions 93 | def mz2pos(mz, pre=precision): 94 | return int(round((mz - MZ_START) / pre)) 95 | 96 | 97 | def pos2mz(pos, pre=precision): 98 | return pos * pre + MZ_START 99 | 100 | 101 | def asnp(x): 102 | return np.asarray(x) 103 | 104 | 105 | def asnp32(x): 106 | return np.asarray(x, dtype="float32") 107 | 108 | 109 | def f2(x): 110 | return "{0:.2f}".format(x) 111 | 112 | 113 | def f4(x): 114 | return "{0:.4f}".format(x) 115 | 116 | 117 | # compute percursor mass 118 | def fastmass(pep, ion_type, charge, mod=None, cam=True): 119 | base = mass.fast_mass(pep, ion_type=ion_type, charge=charge) 120 | 121 | if cam: 122 | base += 57.021 * pep.count("C") / charge 123 | 124 | if not mod is None: 125 | base += 15.995 * np.sum(mod == 1) / charge 126 | 127 | base += -np.sum(mod[mod < 0]) 128 | return base 129 | 130 | 131 | # help function to parse modifications 132 | 133 | 134 | def getmod(pep): 135 | mod = np.zeros(len(pep)) 136 | 137 | if pep.isalpha(): 138 | return pep, mod, 0 139 | 140 | seq = [] 141 | nmod = 0 142 | 143 | i = -1 144 | while len(pep) > 0: 145 | if pep[0] == "(": 146 | if pep[:3] == "(O)": 147 | mod[i] = 1 148 | pep = pep[3:] 149 | elif pep[:4] == "(ox)": 150 | mod[i] = 1 151 | pep = pep[4:] 152 | elif pep[2] == ")" and pep[1] in "ASDFGHJKLZXCVBNMQWERTYUIOP": 153 | mod[i] = -2 154 | pep = pep[3:] 155 | else: 156 | raise "unknown mod: " + pep 157 | 158 | elif pep[0] == "+" or pep[0] == "-": 159 | sign = 1 if pep[0] == "+" else -1 160 | 161 | for j in range(1, len(pep)): 162 | if pep[j] not in ".1234567890": 163 | if i == -1: # N-term mod 164 | nmod += sign * float(pep[1:j]) 165 | else: 166 | mod[i] += sign * float(pep[1:j]) 167 | pep = pep[j:] 168 | break 169 | 170 | if j == len(pep) - 1 and pep[-1] in ".1234567890": # till end 171 | mod[i] += sign * float(pep[1:]) 172 | break 173 | else: 174 | seq += pep[0] 175 | pep = pep[1:] 176 | i = len(seq) - 1 # more realible 177 | 178 | return "".join(seq), mod[: len(seq)], nmod 179 | 180 | 181 | META_SHAPE = (3, 30) # (charge, ftype, other(mass, nce)) 182 | INPUT_DIMENSION = encoding_dimension + 2 + 3 183 | INPUT_SHAPE = [-1, INPUT_DIMENSION] 184 | 185 | 186 | # embed input item into a matrix 187 | def embed( 188 | spectrum, shape=INPUT_SHAPE, mass_scale=200, embedding=None, ignore=False, pep=None 189 | ): 190 | if embedding is None: 191 | embedding = np.zeros(shape, dtype="float32") 192 | 193 | if pep is None: 194 | pep = spectrum["pep"] 195 | pep = pep.replace("L", "I") 196 | 197 | embedding[len(pep)][encoding_dimension - 1] = 1 # ending pos 198 | for i, aa in enumerate(pep): 199 | embedding[i][charMap[aa]] = 1 # 1 - 20 200 | embedding[i][encoding_dimension] = mono[aa] / mass_scale 201 | 202 | embedding[: len(pep), encoding_dimension + 1] = ( 203 | np.arange(len(pep)) / LENGTH_SCALE 204 | ) # position info 205 | embedding[len(pep) + 1, 0] = 1 # padding info 206 | 207 | if "mod" in spectrum: 208 | for i, modi in enumerate(spectrum["mod"]): 209 | embedding[i][encoding_dimension + 2 + int(modi)] = 1 210 | 211 | return embedding 212 | 213 | 214 | # preprocess function for inputs 215 | def preprocessor(batch): 216 | batch_size = len(batch) 217 | embedding = np.zeros((batch_size, *INPUT_SHAPE), dtype="float32") 218 | meta = np.zeros((batch_size, *META_SHAPE), dtype="float32") 219 | 220 | for i, sp in enumerate(batch): 221 | pep = sp["pep"] 222 | 223 | embed(sp, embedding=embedding[i]) 224 | meta[i][0][sp["charge"] - 1] = 1 # charge 225 | meta[i][1][sp["type"]] = 1 # ftype 226 | meta[i][2][0] = fastmass(pep, ion_type="M", charge=1) / PRECURSOR_SCALE 227 | 228 | if not "nce" in sp or sp["nce"] == 0: 229 | meta[i][2][-1] = 0.25 230 | else: 231 | meta[i][2][-1] = sp["nce"] / 100.0 232 | 233 | return (embedding, meta) 234 | 235 | 236 | # generator for inputs 237 | class input_generator(k.utils.Sequence): 238 | def __init__(self, spectra, processor, batch_size, shuffle=1): 239 | self.spectra = spectra 240 | self.processor = processor 241 | self.batch_size = batch_size 242 | self.shuffle = shuffle 243 | 244 | def on_epoch_begin(self, epoch): 245 | if epoch > 0 and self.shuffle: 246 | np.random.shuffle(self.spectra) 247 | 248 | def __len__(self): 249 | return math.ceil(len(self.spectra) / self.batch_size) 250 | 251 | def __getitem__(self, idx): 252 | start_idx = idx * self.batch_size 253 | end_idx = min(start_idx + self.batch_size, len(self.spectra)) 254 | 255 | return (self.processor(self.spectra[start_idx:end_idx]),) 256 | 257 | 258 | # functions that transfer predictions into mgf format 259 | def sparse(x, y, th=0.0002): 260 | x = np.asarray(x, dtype="float32") 261 | y = np.asarray(y, dtype="float32") 262 | 263 | y /= np.max(y) 264 | 265 | return x[y > th], y[y > th] 266 | 267 | 268 | def tomgf(sp, y): 269 | head = ( 270 | "BEGIN IONS\n" 271 | f"TITLE={sp['title']}\n" 272 | f"PEPTIDE={sp['title']}\n" 273 | f"CHARGE={sp['charge']}+\n" 274 | f"PEPMASS={sp['mass']}\n" 275 | ) 276 | 277 | y[min(math.ceil(sp["mass"] * sp["charge"] / precision), len(y)) :] = 0 278 | 279 | imz = ( 280 | np.arange(0, SPEC_DIMENSION, dtype="int32") * precision + MZ_START 281 | ) # more acurate 282 | mzs, its = sparse(imz, y) 283 | 284 | # mzs *= 1.00052 285 | 286 | peaks = [f"{f2(mz)} {f4(it * 1000)}" for mz, it in zip(mzs, its)] 287 | 288 | return head + "\n".join(peaks) + "\nEND IONS" 289 | 290 | 291 | parser = argparse.ArgumentParser() 292 | parser.add_argument("--input", type=str, help="input file path", default="example.tsv") 293 | parser.add_argument("--batch_size", type=str, help="batch size per loop", default=256) 294 | parser.add_argument( 295 | "--output", type=str, help="output file path", default="example_prediction.mgf" 296 | ) 297 | parser.add_argument("--model", type=str, help="model file path", default="pm.h5") 298 | 299 | args = parser.parse_args() 300 | 301 | K.clear_session() 302 | 303 | pm = k.models.load_model(args.model, compile=0) 304 | pm.compile(optimizer=k.optimizers.Adam(lr=0.0003), loss="cosine") 305 | 306 | # fragmentation types 307 | types = {"un": 0, "cid": 1, "etd": 2, "hcd": 3, "ethcd": 4, "etcid": 5} 308 | 309 | # read inputs 310 | inputs = [] 311 | for item in pd.read_csv(args.input, sep="\t").itertuples(): 312 | if item.Charge < 1 or item.Charge > MAX_CHARGE: 313 | print("input", item.Peptide, "exceed max charge of", MAX_CHARGE, ", ignored") 314 | continue 315 | 316 | pep, mod, nterm_mod = getmod(item.Peptide) 317 | 318 | if nterm_mod != 0: 319 | print("input", item.Peptide, "has N-term modification, ignored") 320 | continue 321 | 322 | if np.any(mod != 0) and set(mod) != set([0, 1]): 323 | print("Only Oxidation modification is supported, ignored", item.Peptide) 324 | continue 325 | 326 | inputs.append( 327 | { 328 | "pep": pep, 329 | "mod": mod, 330 | "charge": item.Charge, 331 | "title": item.Peptide, 332 | "nce": item.NCE, 333 | "type": types[item.Type.lower()], 334 | "mass": fastmass(pep, "M", item.Charge, mod=mod), 335 | } 336 | ) 337 | 338 | INPUT_SHAPE[0] = max( 339 | INPUT_SHAPE[0], len(pep) + 2 340 | ) # update xshape to match max input peptide 341 | 342 | batch_per_loop = 64 343 | loop_size = args.batch_size * batch_per_loop 344 | 345 | f = open(args.output, "w+") 346 | 347 | while len(inputs) > 0: 348 | if len(inputs) >= loop_size: 349 | sliced_spectra = inputs[:loop_size] 350 | inputs = inputs[loop_size:] 351 | else: 352 | sliced_spectra = inputs 353 | inputs = [] 354 | 355 | y = pm.predict( 356 | input_generator(sliced_spectra, preprocessor, batch_size=args.batch_size), 357 | verbose=1, 358 | ) 359 | y = np.square(y) 360 | 361 | f.writelines("%s\n\n" % tomgf(sp, yi) for sp, yi in zip(sliced_spectra, y)) 362 | 363 | f.close() 364 | print("Prediction finished") 365 | -------------------------------------------------------------------------------- /train_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import numpy as np 4 | import math 5 | from pyteomics import mgf, mass 6 | import argparse 7 | 8 | import tensorflow as tf 9 | import tensorflow.keras as k 10 | from tensorflow.keras import backend as K 11 | from tensorflow.keras.layers import ( 12 | Conv1D, 13 | MaxPooling1D, 14 | Dense, 15 | Add, 16 | Flatten, 17 | Activation, 18 | BatchNormalization, 19 | ) 20 | from tensorflow.keras import Model, Input 21 | 22 | from coord_tf import CoordinateChannel2D, CoordinateChannel1D 23 | 24 | # hyper parameter and constants 25 | 26 | SPECTRA_DIMENSION = 20000 27 | BIN_SIZE = 0.1 28 | MAX_PEPTIDE_LENGTH = 30 29 | MAX_MZ = 2000 30 | LENGTH_SCALE = 1000 31 | PRECURSOR_SCALE = 20000.0 32 | 33 | 34 | mono = { 35 | "G": 57.021464, 36 | "A": 71.037114, 37 | "S": 87.032029, 38 | "P": 97.052764, 39 | "V": 99.068414, 40 | "T": 101.04768, 41 | "C": 160.03019, 42 | "L": 113.08406, 43 | "I": 113.08406, 44 | "D": 115.02694, 45 | "Q": 128.05858, 46 | "K": 128.09496, 47 | "E": 129.04259, 48 | "M": 131.04048, 49 | "m": 147.0354, 50 | "H": 137.05891, 51 | "F": 147.06441, 52 | "R": 156.10111, 53 | "Y": 163.06333, 54 | "N": 114.04293, 55 | "W": 186.07931, 56 | "O": 147.03538, 57 | } 58 | 59 | ave_mass = { 60 | "A": 71.0788, 61 | "R": 156.1875, 62 | "N": 114.1038, 63 | "D": 115.0886, 64 | "C": 160.1598, 65 | "E": 129.1155, 66 | "Q": 128.1307, 67 | "G": 57.0519, 68 | "H": 137.1411, 69 | "I": 113.1594, 70 | "L": 113.1594, 71 | "K": 128.1741, 72 | "M": 131.1926, 73 | "F": 147.1766, 74 | "P": 97.1167, 75 | "S": 87.0782, 76 | "T": 101.1051, 77 | "W": 186.2132, 78 | "Y": 163.1760, 79 | "V": 99.1326, 80 | } 81 | 82 | Alist = list("ACDEFGHIKLMNPQRSTVWYZ") 83 | ENCODE_DIMENSION = len(Alist) + 3 84 | 85 | charMap = {"@": 0, "[": 21} 86 | for i, a in enumerate(Alist): 87 | charMap[a] = i + 1 88 | 89 | # help functions 90 | 91 | 92 | def asnp(x): 93 | return np.asarray(x) 94 | 95 | 96 | def asnp32(x): 97 | return np.asarray(x, dtype="float32") 98 | 99 | 100 | # compute percursor mass 101 | def fastmass(pep, ion_type, charge, mod=None, cam=True): 102 | base = mass.fast_mass(pep, ion_type=ion_type, charge=charge) 103 | 104 | if cam: 105 | base += 57.021 * pep.count("C") / charge 106 | 107 | if not mod is None: 108 | base += 15.995 * np.sum(mod == 1) / charge 109 | 110 | base += -np.sum(mod[mod < 0]) 111 | return base 112 | 113 | 114 | INPUT_LENGTH = MAX_PEPTIDE_LENGTH + 2 115 | INPUT_DIMENSION = ENCODE_DIMENSION + 2 + 3 116 | META_SHAPE = (3, 30) 117 | 118 | 119 | # embed input item into a matrix 120 | def embed(spectrum, embedding, mass_scale=200): 121 | pep = spectrum["pep"] 122 | pep = pep.replace("L", "I") 123 | 124 | embedding[len(pep)][ENCODE_DIMENSION - 1] = 1 # ending pos 125 | for i, aa in enumerate(pep): 126 | embedding[i][charMap[aa]] = 1 # 1 - 20 127 | embedding[i][ENCODE_DIMENSION] = mono[aa] / mass_scale 128 | 129 | embedding[: len(pep), ENCODE_DIMENSION + 1] = ( 130 | np.arange(len(pep)) / LENGTH_SCALE 131 | ) # position info 132 | 133 | embedding[len(pep) + 1, 0] = 1 # padding info 134 | 135 | return embedding 136 | 137 | 138 | def preprocessor(batch): 139 | batch_size = len(batch) 140 | embedding = np.zeros((batch_size, INPUT_LENGTH, INPUT_DIMENSION), dtype="float32") 141 | meta = np.zeros((batch_size, *META_SHAPE), dtype="float32") 142 | 143 | for i, sp in enumerate(batch): 144 | pep = sp["pep"] 145 | 146 | if len(pep) > MAX_PEPTIDE_LENGTH: 147 | raise "input too long" 148 | 149 | embed(sp, embedding=embedding[i]) 150 | meta[i][0][sp["charge"] - 1] = 1 # charge 151 | meta[i][1][sp["type"]] = 1 # ftype 152 | meta[i][2][0] = fastmass(pep, ion_type="M", charge=1) / PRECURSOR_SCALE 153 | 154 | if not "nce" in sp or sp["nce"] == 0: 155 | meta[i][2][-1] = 0.25 156 | else: 157 | meta[i][2][-1] = sp["nce"] / 100.0 158 | 159 | return (embedding, meta) 160 | 161 | 162 | # read inputs 163 | def parse_spectra(sps, spec_type=3): 164 | # ratio constants for NCE 165 | cr = {1: 1, 2: 0.9, 3: 0.85, 4: 0.8, 5: 0.75, 6: 0.75, 7: 0.75, 8: 0.75} 166 | 167 | db = [] 168 | 169 | for sp in sps: 170 | param = sp["params"] 171 | 172 | c = int(str(param["charge"][0])[0]) 173 | 174 | if "seq" in param: 175 | pep = param["seq"] 176 | else: 177 | pep = param["title"] 178 | 179 | if "pepmass" in param: 180 | mass = param["pepmass"][0] 181 | else: 182 | mass = float(param["parent"]) 183 | 184 | if "hcd" in param: 185 | try: 186 | hcd = param["hcd"] 187 | if hcd[-1] == "%": 188 | hcd = float(hcd) 189 | elif hcd[-2:] == "eV": 190 | hcd = float(hcd[:-2]) 191 | hcd = hcd * 500 * cr[c] / mass 192 | else: 193 | raise Exception("Invalid type!") 194 | except: 195 | hcd = 0 196 | else: 197 | hcd = 0 198 | 199 | mz = sp["m/z array"] 200 | it = sp["intensity array"] 201 | 202 | db.append( 203 | { 204 | "pep": pep, 205 | "charge": c, 206 | "mass": mass, 207 | "mz": mz, 208 | "it": it, 209 | "nce": hcd, 210 | "type": spec_type, 211 | } 212 | ) 213 | 214 | return db 215 | 216 | 217 | spec_types = {"unknown": 0, "cid": 1, "etd": 2, "hcd": 3, "ethcd": 4, "etcid": 5} 218 | 219 | 220 | def readmgf(fn, type="hcd"): 221 | file = open(fn, "r") 222 | data = mgf.read( 223 | file, convert_arrays=1, read_charges=False, dtype="float32", use_index=False 224 | ) 225 | 226 | codes = parse_spectra(data, spec_type=spec_types[type]) 227 | file.close() 228 | return codes 229 | 230 | 231 | def spectrum2vector(mz_list, itensity_list, mass, bin_size, charge): 232 | itensity_list = itensity_list / np.max(itensity_list) 233 | 234 | vector = np.zeros(SPECTRA_DIMENSION, dtype="float32") 235 | 236 | mz_list = np.asarray(mz_list) 237 | 238 | indexes = mz_list / bin_size 239 | indexes = np.around(indexes).astype("int32") 240 | 241 | for i, index in enumerate(indexes): 242 | vector[index] += itensity_list[i] 243 | 244 | # normalize 245 | vector = np.sqrt(vector) 246 | 247 | # remove precursors, including isotropic peaks 248 | for delta in (0, 1, 2): 249 | precursor_mz = mass + delta / charge 250 | if precursor_mz > 0 and precursor_mz < 2000: 251 | vector[round(precursor_mz / bin_size)] = 0 252 | 253 | return vector 254 | 255 | 256 | # building the model 257 | 258 | 259 | def res_block(x, layers, kernel=(3,), act="relu", se=0, **kws): 260 | normalizer = BatchNormalization 261 | 262 | ConvLayer = k.layers.Conv1D 263 | MaxPoolingLayer = k.layers.MaxPooling1D 264 | AvePoolingLayer = k.layers.AveragePooling1D 265 | GlobalPoolingLayer = k.layers.GlobalAveragePooling1D 266 | GlobalMaxLayer = k.layers.GlobalMaxPooling1D 267 | assert K.ndim(x) == 3 268 | 269 | raw_x = x # backup input 270 | 271 | x = ConvLayer(layers, kernel_size=kernel, padding="same", **kws)(x) 272 | x = normalizer(gamma_initializer="zeros")(x) 273 | 274 | if se == 1: 275 | x2 = GlobalPoolingLayer()(x) 276 | x2 = Dense(max(4, layers // 16), activation="relu")(x2) 277 | x2 = Dense(layers, activation="sigmoid")(x2) 278 | x2 = k.layers.Reshape((1, -1))(x2) 279 | 280 | x = k.layers.Multiply()([x, x2]) 281 | 282 | if K.int_shape(x)[-1] != layers: 283 | raw_x = ConvLayer(layers, kernel_size=1, padding="same")(raw_x) 284 | raw_x = normalizer()(raw_x) 285 | 286 | x = Add()([raw_x, x]) 287 | 288 | return Activation(act)(x) # final activation 289 | 290 | 291 | def build(act="relu"): 292 | inp = Input(shape=(INPUT_LENGTH, INPUT_DIMENSION), name="enbedding_input") 293 | meta_inp = Input(shape=(*META_SHAPE,), name="meta_input") 294 | 295 | info = k.layers.Dense(8, activation="relu")(k.layers.Flatten()(meta_inp)) 296 | info = k.layers.Reshape((1, -1))(info) 297 | info = tf.repeat(info, K.shape(inp)[1], axis=1) 298 | x = k.layers.Concatenate(axis=-1)([inp, info]) 299 | 300 | x = CoordinateChannel1D()(x) # add positional information 301 | 302 | def conv_normal(x, channel, kernel, padding="same"): 303 | x = Conv1D(channel, kernel_size=kernel, padding=padding)(x) 304 | x = BatchNormalization(gamma_initializer="zeros")(x) 305 | return x 306 | 307 | features = k.layers.Concatenate(axis=-1)( 308 | [conv_normal(x, 64, i) for i in range(2, 10)] 309 | ) 310 | 311 | x = Conv1D(512, kernel_size=1, padding="same")(x) 312 | x = BatchNormalization()(x) 313 | 314 | x = Add()([x, features]) 315 | x = Activation(act)(x) 316 | 317 | for i in range(8): 318 | x = res_block(x, 512, 3, act=act, se=1) 319 | 320 | for i in range(3): 321 | x = res_block(x, 512, 1, se=0, act=act) 322 | 323 | x = k.layers.Conv1D(SPECTRA_DIMENSION, kernel_size=1, padding="valid")(x) 324 | x = Activation("sigmoid")(x) 325 | x = k.layers.GlobalAveragePooling1D(name="spectrum")(x) 326 | 327 | model = k.models.Model(inputs=[inp, meta_inp], outputs=x, name="predfull_model") 328 | return model 329 | 330 | 331 | parser = argparse.ArgumentParser() 332 | parser.add_argument( 333 | "--mgf", type=str, help="output file path", default="hcd_testingset.mgf" 334 | ) 335 | parser.add_argument( 336 | "--out", type=str, help="filename to save the trained model", default="trained.h5" 337 | ) 338 | 339 | args = parser.parse_args() 340 | 341 | K.clear_session() 342 | 343 | pm = build() 344 | pm.compile(optimizer=k.optimizers.Adam(lr=0.0003), loss="cosine_similarity") 345 | print(pm.summary()) 346 | 347 | 348 | print("Reading mgf...", args.mgf) 349 | spectra = readmgf(args.mgf, type="hcd") 350 | 351 | y = [ 352 | spectrum2vector(sp["mz"], sp["it"], sp["mass"], BIN_SIZE, sp["charge"]) 353 | for sp in spectra 354 | ] 355 | 356 | x = preprocessor(spectra) 357 | 358 | pm.fit(x=x, y=np.asarray(y, dtype="float32"), epochs=50, verbose=1) 359 | pm.save(args.out) 360 | --------------------------------------------------------------------------------