├── .gitattributes
├── .gitignore
├── .vscode
    └── settings.json
├── README.md
├── compare_performance.py
├── coord_tf.py
├── example.tsv
├── imgs
    ├── hcd1.png
    ├── hcd2.png
    └── model.png
├── predfull.py
└── train_model.py


/.gitattributes:
--------------------------------------------------------------------------------
1 | example_prediction.mgf filter=lfs diff=lfs merge=lfs -text
2 | *.mgf filter=lfs diff=lfs merge=lfs -text
3 | *.h5 filter=lfs diff=lfs merge=lfs -text
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | ## Ignore Visual Studio temporary files, build results, and
  2 | ## files generated by popular Visual Studio add-ons.
  3 | 
  4 | *.mgf
  5 | *.h5
  6 | *.hdf5
  7 | 
  8 | # User-specific files
  9 | *.suo
 10 | *.user
 11 | *.userosscache
 12 | *.sln.docstates
 13 | 
 14 | # User-specific files (MonoDevelop/Xamarin Studio)
 15 | *.userprefs
 16 | 
 17 | # Build results
 18 | [Dd]ebug/
 19 | [Dd]ebugPublic/
 20 | [Rr]elease/
 21 | [Rr]eleases/
 22 | x64/
 23 | x86/
 24 | bld/
 25 | [Bb]in/
 26 | [Oo]bj/
 27 | [Ll]og/
 28 | 
 29 | # Visual Studio 2015 cache/options directory
 30 | .vs/
 31 | # Uncomment if you have tasks that create the project's static files in wwwroot
 32 | #wwwroot/
 33 | 
 34 | # MSTest test Results
 35 | [Tt]est[Rr]esult*/
 36 | [Bb]uild[Ll]og.*
 37 | 
 38 | # NUNIT
 39 | *.VisualState.xml
 40 | TestResult.xml
 41 | 
 42 | # Build Results of an ATL Project
 43 | [Dd]ebugPS/
 44 | [Rr]eleasePS/
 45 | dlldata.c
 46 | 
 47 | # DNX
 48 | project.lock.json
 49 | project.fragment.lock.json
 50 | artifacts/
 51 | 
 52 | *_i.c
 53 | *_p.c
 54 | *_i.h
 55 | *.ilk
 56 | *.meta
 57 | *.obj
 58 | *.pch
 59 | *.pdb
 60 | *.pgc
 61 | *.pgd
 62 | *.rsp
 63 | *.sbr
 64 | *.tlb
 65 | *.tli
 66 | *.tlh
 67 | *.tmp
 68 | *.tmp_proj
 69 | *.log
 70 | *.vspscc
 71 | *.vssscc
 72 | .builds
 73 | *.pidb
 74 | *.svclog
 75 | *.scc
 76 | 
 77 | # Chutzpah Test files
 78 | _Chutzpah*
 79 | 
 80 | # Visual C++ cache files
 81 | ipch/
 82 | *.aps
 83 | *.ncb
 84 | *.opendb
 85 | *.opensdf
 86 | *.sdf
 87 | *.cachefile
 88 | *.VC.db
 89 | *.VC.VC.opendb
 90 | 
 91 | # Visual Studio profiler
 92 | *.psess
 93 | *.vsp
 94 | *.vspx
 95 | *.sap
 96 | 
 97 | # TFS 2012 Local Workspace
 98 | $tf/
 99 | 
100 | # Guidance Automation Toolkit
101 | *.gpState
102 | 
103 | # ReSharper is a .NET coding add-in
104 | _ReSharper*/
105 | *.[Rr]e[Ss]harper
106 | *.DotSettings.user
107 | 
108 | # JustCode is a .NET coding add-in
109 | .JustCode
110 | 
111 | # TeamCity is a build add-in
112 | _TeamCity*
113 | 
114 | # DotCover is a Code Coverage Tool
115 | *.dotCover
116 | 
117 | # NCrunch
118 | _NCrunch_*
119 | .*crunch*.local.xml
120 | nCrunchTemp_*
121 | 
122 | # MightyMoose
123 | *.mm.*
124 | AutoTest.Net/
125 | 
126 | # Web workbench (sass)
127 | .sass-cache/
128 | 
129 | # Installshield output folder
130 | [Ee]xpress/
131 | 
132 | # Click-Once directory
133 | publish/
134 | 
135 | # Publish Web Output
136 | *.[Pp]ublish.xml
137 | *.azurePubxml
138 | # TODO: Comment the next line if you want to checkin your web deploy settings
139 | # but database connection strings (with potential passwords) will be unencrypted
140 | #*.pubxml
141 | *.publishproj
142 | 
143 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
144 | # checkin your Azure Web App publish settings, but sensitive information contained
145 | # in these scripts will be unencrypted
146 | PublishScripts/
147 | 
148 | # NuGet Packages
149 | *.nupkg
150 | # The packages folder can be ignored because of Package Restore
151 | **/packages/*
152 | # except build/, which is used as an MSBuild target.
153 | !**/packages/build/
154 | # Uncomment if necessary however generally it will be regenerated when needed
155 | #!**/packages/repositories.config
156 | # NuGet v3's project.json files produces more ignoreable files
157 | *.nuget.props
158 | *.nuget.targets
159 | 
160 | # Microsoft Azure Build Output
161 | csx/
162 | *.build.csdef
163 | 
164 | # Microsoft Azure Emulator
165 | ecf/
166 | rcf/
167 | 
168 | # Windows Store app package directories and files
169 | AppPackages/
170 | BundleArtifacts/
171 | Package.StoreAssociation.xml
172 | _pkginfo.txt
173 | 
174 | # Visual Studio cache files
175 | # files ending in .cache can be ignored
176 | *.[Cc]ache
177 | # but keep track of directories ending in .cache
178 | !*.[Cc]ache/
179 | 
180 | # Others
181 | ClientBin/
182 | ~$*
183 | *~
184 | *.dbmdl
185 | *.dbproj.schemaview
186 | *.jfm
187 | *.pfx
188 | *.publishsettings
189 | node_modules/
190 | orleans.codegen.cs
191 | 
192 | # Since there are multiple workflows, uncomment next line to ignore bower_components
193 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
194 | #bower_components/
195 | 
196 | # RIA/Silverlight projects
197 | Generated_Code/
198 | 
199 | # Backup & report files from converting an old project file
200 | # to a newer Visual Studio version. Backup files are not needed,
201 | # because we have git ;-)
202 | _UpgradeReport_Files/
203 | Backup*/
204 | UpgradeLog*.XML
205 | UpgradeLog*.htm
206 | 
207 | # SQL Server files
208 | *.mdf
209 | *.ldf
210 | 
211 | # Business Intelligence projects
212 | *.rdl.data
213 | *.bim.layout
214 | *.bim_*.settings
215 | 
216 | # Microsoft Fakes
217 | FakesAssemblies/
218 | 
219 | # GhostDoc plugin setting file
220 | *.GhostDoc.xml
221 | 
222 | # Node.js Tools for Visual Studio
223 | .ntvs_analysis.dat
224 | 
225 | # Visual Studio 6 build log
226 | *.plg
227 | 
228 | # Visual Studio 6 workspace options file
229 | *.opt
230 | 
231 | # Visual Studio LightSwitch build output
232 | **/*.HTMLClient/GeneratedArtifacts
233 | **/*.DesktopClient/GeneratedArtifacts
234 | **/*.DesktopClient/ModelManifest.xml
235 | **/*.Server/GeneratedArtifacts
236 | **/*.Server/ModelManifest.xml
237 | _Pvt_Extensions
238 | 
239 | # Paket dependency manager
240 | .paket/paket.exe
241 | paket-files/
242 | 
243 | # FAKE - F# Make
244 | .fake/
245 | 
246 | # JetBrains Rider
247 | .idea/
248 | *.sln.iml
249 | 
250 | # CodeRush
251 | .cr/
252 | 
253 | # Python Tools for Visual Studio (PTVS)
254 | __pycache__/
255 | *.pyc
256 | .ipynb_checkpoints
257 | 
258 | *.list
259 | *.jar
260 | *.out
261 | *.newcluster
262 | *.backup
263 | *.exe
264 | /CodeGraphData
265 | readme
266 | *.cluster
267 | *.ipynb
268 | 


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |     "jupyter.jupyterServerType": "local",
3 |     "[python]": {
4 |         "editor.defaultFormatter": "ms-python.black-formatter"
5 |     },
6 |     "python.formatting.provider": "none"
7 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # PredFull
  2 | 
  3 | __Visit [http://predfull.com/](http://predfull.com/) to try online prediction__
  4 | 
  5 | > This work was published in Analytical Chemistry: [`Full-Spectrum Prediction of Peptides Tandem Mass Spectra using Deep Neural Network`](https://pubs.acs.org/doi/10.1021/acs.analchem.9b04867)
  6 | >
  7 | > Kaiyuan Liu, Sujun Li, Lei Wang, Yuzhen Ye, Haixu Tang
  8 | 
  9 | The first model for predicting complete tandem mass spectra from peptide sequences, using a deep convolutional neural network trained on over 2 million experimental spectra.
 10 | 
 11 | Free for academic uses.
 12 | 
 13 | ## Update History
 14 | 
 15 | * 2022.05.19: Support input peptide of any length
 16 | * 2021.05.18: Support predicting peptides with oxidized methionine.
 17 | * 2021.01.01: Update example results.
 18 | * 2020.08.22: Fixed performance issues.
 19 | * 2020.05.25: Support predicting non-tryptic peptides.
 20 | * 2019.09.01: First version.
 21 | 
 22 | ## Method
 23 | 
 24 | Based on the structure of the residual convolutional networks. Current precision (bin size): 0.1 Th.
 25 | 
 26 | ![model](imgs/model.png)
 27 | 
 28 | ## How to use
 29 | 
 30 | __In addition to cloning this project, you must download `pm.h5` from [google drive](https://drive.google.com/drive/folders/1Ca3HdV-w8TZPRa9KhPBbjrTtGSmtEIsn?usp=sharing) and place it in this folder.__
 31 | 
 32 | ### Important Notes
 33 | 
 34 | * The only modification (PTM) supported is **oxidation on Methionine**, otherwise only UNMODIFIED peptides are allowed. To indicate an oxidized methionine, use the format "M(O)".
 35 | * This model assumes a __FIXED__ carbamidomethyl on C
 36 | * The length of input peptides is __NOT__ limited; however, expect poor performance for peptides longer than 30
 37 | * The prediction will NOT output peaks with M/z > 2000
 38 | * Predicted peaks that are weaker than STRONGEST_PEAK / 1000 are regarded as noise and are omitted from the final output.
 39 | 
 40 | ### Required Packages
 41 | 
 42 | We recommend installing the dependencies via [Anaconda](https://www.anaconda.com/distribution/)
 43 | 
 44 | * Python >= 3.7
 45 | * Tensorflow >= 2.3.0
 46 | * Pandas >= 0.20
 47 | * pyteomics
 48 | * lxml
 49 | 
 50 | __TensorFlow must be version 2.3.0 or newer! A compatibility bug in TensorFlow prevents versions before 2.3.0 from loading the model correctly. We'll release a new model once the TensorFlow team fixes this.__
 51 | 
 52 | ### Input format
 53 | 
 54 | The required input format is TSV, with the following columns:
 55 | 
 56 | Peptide | Charge | Type | NCE
 57 | ------- | ------ | ---- | ---
 58 | AAAAAAAAAVSR | 2 | HCD | 25
 59 | AAGAAESEEDFLR | 2 | HCD | 25
 60 | AAPAPTASSTININTSTSK | 2 | HCD | 25
 61 | AAPAPM(O)NTSTSK | 2 | HCD | 25
 62 | 
 63 | The 'Peptide' and 'Charge' columns are self-explanatory. The 'Type' must be HCD or ETD (in uppercase). NCE means normalized collision energy, set to 25 as default. Note that in the above examples the last peptide has an oxidized methionine, which is the only modification currently supported. Check `example.tsv` for examples.
 64 | 
 65 | ### Usage
 66 | 
 67 | Simply run:
 68 | 
 69 | `python predfull.py --input example.tsv --model pm.h5 --output example_prediction.mgf`
 70 | 
 71 | The output file is in MGF format
 72 | 
 73 | * --input: the input file
 74 | * --output: the output path
 75 | * --model: the pretrained model
 76 | 
 77 | ## Prediction Examples
 78 | 
 79 | __Note that intensities are shown as square-rooted values__
 80 | 
 81 | ![example 1](imgs/hcd2.png)
 82 | 
 83 | ![example 2](imgs/hcd1.png)
 84 | 
 85 | ## Performance Evaluation
 86 | 
 87 | We provide sample data on [google drive](https://drive.google.com/drive/folders/1Ca3HdV-w8TZPRa9KhPBbjrTtGSmtEIsn?usp=sharing) and code for you to evaluate the prediction performance. The `hcd_testingset.mgf` file on google drive contains ground truth spectra (randomly sampled from [NIST Human Synthetic Peptide Spectral Library](https://chemdata.nist.gov/dokuwiki/doku.php?id=peptidew:lib:kustersynselected20170530)) that correspond to items in `example.tsv`, while the `example_prediction.mgf` file contains pre-run predictions.
 88 | 
 89 | To evaluate the similarity, first download the ground truth reference file `hcd_testingset.mgf` from [google drive](https://drive.google.com/drive/folders/1Ca3HdV-w8TZPRa9KhPBbjrTtGSmtEIsn?usp=sharing), then run:
 90 | 
 91 | `python compare_performance.py --real hcd_testingset.mgf --pred example_prediction.mgf`
 92 | 
 93 | * --real: the ground truth file
 94 | * --pred: the prediction file
 95 | 
 96 | You should get an average similarity of about 0.789 using these two pre-given MGF files.
 97 | 
 98 | __Make sure that items in `example.tsv` and `hcd_testingset.mgf` are in the same order! Don't permute, add, or delete items unless you realign them yourself.__
 99 | 
100 | ## How to build & train the model
101 | 
102 | For those who are interested in reproducing this model, we provide example code in `train_model.py` to build and train the model.
103 | 


--------------------------------------------------------------------------------
/compare_performance.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import math
  3 | import numpy as np
  4 | from pyteomics import mgf, mass
  5 | 
  6 | 
  7 | def norm(x): return np.linalg.norm(x)
  8 | 
  9 | 
 10 | def cosine(u, v): return np.dot(u, v) / max(norm(u) * norm(v), 1e-16)
 11 | 
 12 | 
 13 | DIMENSION = 20000
 14 | BIN_SIZE = 0.1
 15 | 
 16 | 
 17 | def spectrum2vector(mz_list, itensity_list, mass, bin_size, charge):
 18 |     itensity_list = itensity_list / np.max(itensity_list)
 19 | 
 20 |     vector = np.zeros(DIMENSION, dtype='float32')
 21 | 
 22 |     mz_list = np.asarray(mz_list)
 23 | 
 24 |     indexes = mz_list / bin_size
 25 |     indexes = np.around(indexes).astype('int32')
 26 | 
 27 |     for i, index in enumerate(indexes):
 28 |         if index >= DIMENSION: continue
 29 |         vector[index] += itensity_list[i]
 30 | 
 31 |     # normalize
 32 |     vector = np.sqrt(vector)
 33 | 
 34 |     # remove precursors, including isotropic precursor peaks
 35 |     for delta in (0, 1, 2):
 36 |         precursor_mz = mass + delta / charge
 37 |         if precursor_mz > 0 and precursor_mz < 2000:
 38 |             vector[round(precursor_mz / bin_size)] = 0
 39 | 
 40 |     return vector
 41 | 
 42 | 
 43 | # ratio constants for NCE
 44 | cr = {1: 1, 2: 0.9, 3: 0.85, 4: 0.8, 5: 0.75, 6: 0.75, 7: 0.75, 8: 0.75}
 45 | 
 46 | 
 47 | def parse_spectra(sps):
 48 |     db = []
 49 | 
 50 |     for sp in sps:
 51 |         param = sp['params']
 52 | 
 53 |         c = int(str(param['charge'][0])[0])
 54 | 
 55 |         if 'seq' in param:
 56 |             pep = param['seq']
 57 |         else:
 58 |             pep = param['title']
 59 | 
 60 |         if 'pepmass' in param:
 61 |             mass = param['pepmass'][0]
 62 |         else:
 63 |             mass = float(param['parent'])
 64 | 
 65 |         if 'hcd' in param:
 66 |             try:
 67 |                 hcd = param['hcd']
 68 |                 if hcd[-1] == '%':
 69 |                     hcd = float(hcd)
 70 |                 elif hcd[-2:] == 'eV':
 71 |                     hcd = float(hcd[:-2])
 72 |                     hcd = hcd * 500 * cr[c] / mass
 73 |                 else:
 74 |                     raise Exception("Invalid type!")
 75 |             except:
 76 |                 hcd = 0
 77 |         else:
 78 |             hcd = 0
 79 | 
 80 |         mz = sp['m/z array']
 81 |         it = sp['intensity array']
 82 | 
 83 |         db.append({'pep': pep, 'charge': c,
 84 |                    'mass': mass, 'mz': mz, 'it': it, 'nce': hcd})
 85 | 
 86 |     return db
 87 | 
 88 | 
 89 | def readmgf(fn):
 90 |     file = open(fn, "r")
 91 |     data = mgf.read(file, convert_arrays=1, read_charges=False,
 92 |                     dtype='float32', use_index=False)
 93 | 
 94 |     codes = parse_spectra(data)
 95 |     return codes
 96 | 
 97 | 
 98 | parser = argparse.ArgumentParser()
 99 | parser.add_argument('--real', type=str,
100 |                     help='Real MGF file path', default='hcd_testingset.mgf')
101 | parser.add_argument('--pred', type=str,
102 |                     help='predicted MGF file path', default='example_prediction.mgf')
103 | 
104 | args = parser.parse_args()
105 | 
106 | print('Reading', args.real)
107 | real_vectors = [spectrum2vector(sp['mz'], sp['it'], sp['mass'], BIN_SIZE,
108 |                                sp['charge']) for sp in readmgf(args.real)]
109 | 
110 | print('Reading', args.pred)
111 | pred_vectors = [spectrum2vector(sp['mz'], sp['it'], sp['mass'], BIN_SIZE,
112 |                                sp['charge']) for sp in readmgf(args.pred)]
113 | 
114 | similarites = [cosine(sp1, sp2)
115 |                for sp1, sp2 in zip(real_vectors, pred_vectors)]
116 | 
117 | print('Average Cosine similarites:', np.mean(similarites))
118 | 


--------------------------------------------------------------------------------
/coord_tf.py:
--------------------------------------------------------------------------------
  1 | from tensorflow.keras.layers import Layer, InputSpec
  2 | from tensorflow.keras import backend as K
  3 | from tensorflow.keras.utils import get_custom_objects
  4 | 
  5 | 
  6 | class _CoordinateChannel(Layer):
  7 |     """ Adds Coordinate Channels to the input tensor.
  8 | 
  9 |     # Arguments
 10 |         rank: An integer, the rank of the input data-uniform,
 11 |             e.g. "2" for 2D convolution.
 12 |         use_radius: Boolean flag to determine whether the
 13 |             radius coordinate should be added for 2D rank
 14 |             inputs or not.
 15 |         data_format: A string,
 16 |             one of `"channels_last"` or `"channels_first"`.
 17 |             The ordering of the dimensions in the inputs.
 18 |             `"channels_last"` corresponds to inputs with shape
 19 |             `(batch, ..., channels)` while `"channels_first"` corresponds to
 20 |             inputs with shape `(batch, channels, ...)`.
 21 |             It defaults to the `image_data_format` value found in your
 22 |             Keras config file at `~/.keras/keras.json`.
 23 |             If you never set it, then it will be "channels_last".
 24 | 
 25 |     # Input shape
 26 |         ND tensor with shape:
 27 |         `(samples, channels, *)`
 28 |         if `data_format` is `"channels_first"`
 29 |         or ND tensor with shape:
 30 |         `(samples, *, channels)`
 31 |         if `data_format` is `"channels_last"`.
 32 | 
 33 |     # Output shape
 34 |         ND tensor with shape:
 35 |         `(samples, channels + 2, *)`
 36 |         if `data_format` is `"channels_first"`
 37 |         or 5D tensor with shape:
 38 |         `(samples, *, channels + 2)`
 39 |         if `data_format` is `"channels_last"`.
 40 | 
 41 |     # References:
 42 |         - [An Intriguing Failing of Convolutional Neural Networks and the CoordConv Solution](https://arxiv.org/abs/1807.03247)
 43 |     """
 44 | 
 45 |     def __init__(self, rank,
 46 |                  use_radius=False,
 47 |                  data_format=None,
 48 |                  **kwargs):
 49 |         super(_CoordinateChannel, self).__init__(**kwargs)
 50 | 
 51 |         if data_format not in [None, 'channels_first', 'channels_last']:
 52 |             raise ValueError('`data_format` must be either "channels_last", "channels_first" '
 53 |                              'or None.')
 54 | 
 55 |         self.rank = rank
 56 |         self.use_radius = use_radius
 57 |         self.data_format = K.image_data_format() if data_format is None else data_format
 58 |         self.axis = 1 if K.image_data_format() == 'channels_first' else -1
 59 | 
 60 |         self.input_spec = InputSpec(min_ndim=2)
 61 |         self.supports_masking = True
 62 | 
 63 |     def build(self, input_shape):
 64 |         assert len(input_shape) >= 2
 65 |         input_dim = input_shape[self.axis]
 66 | 
 67 |         self.input_spec = InputSpec(min_ndim=self.rank + 2,
 68 |                                     axes={self.axis: input_dim})
 69 |         self.built = True
 70 | 
 71 |     def call(self, inputs, training=None, mask=None):
 72 |         input_shape = K.shape(inputs)
 73 | 
 74 |         if self.rank == 1:
 75 |             input_shape = [input_shape[i] for i in range(3)]
 76 |             batch_shape, dim, channels = input_shape
 77 | 
 78 |             xx_range = K.tile(K.expand_dims(K.arange(0, dim), axis=0),
 79 |                               K.stack([batch_shape, 1]))
 80 |             xx_range = K.expand_dims(xx_range, axis=-1)
 81 | 
 82 |             xx_channels = K.cast(xx_range, K.dtype(inputs))
 83 |             xx_channels = xx_channels / K.cast(dim - 1, K.dtype(inputs))
 84 |             xx_channels = (xx_channels * 2) - 1.
 85 | 
 86 |             outputs = K.concatenate([inputs, xx_channels], axis=-1)
 87 | 
 88 |         if self.rank == 2:
 89 |             if self.data_format == 'channels_first':
 90 |                 inputs = K.permute_dimensions(inputs, [0, 2, 3, 1])
 91 |                 input_shape = K.shape(inputs)
 92 | 
 93 |             input_shape = [input_shape[i] for i in range(4)]
 94 |             batch_shape, dim1, dim2, channels = input_shape
 95 | 
 96 |             xx_ones = K.ones(K.stack([batch_shape, dim2]), dtype='int32')
 97 |             xx_ones = K.expand_dims(xx_ones, axis=-1)
 98 | 
 99 |             xx_range = K.tile(K.expand_dims(K.arange(0, dim1), axis=0),
100 |                               K.stack([batch_shape, 1]))
101 |             xx_range = K.expand_dims(xx_range, axis=1)
102 |             xx_channels = K.batch_dot(xx_ones, xx_range, axes=[2, 1])
103 |             xx_channels = K.expand_dims(xx_channels, axis=-1)
104 |             xx_channels = K.permute_dimensions(xx_channels, [0, 2, 1, 3])
105 | 
106 |             yy_ones = K.ones(K.stack([batch_shape, dim1]), dtype='int32')
107 |             yy_ones = K.expand_dims(yy_ones, axis=1)
108 | 
109 |             yy_range = K.tile(K.expand_dims(K.arange(0, dim2), axis=0),
110 |                               K.stack([batch_shape, 1]))
111 |             yy_range = K.expand_dims(yy_range, axis=-1)
112 | 
113 |             yy_channels = K.batch_dot(yy_range, yy_ones, axes=[2, 1])
114 |             yy_channels = K.expand_dims(yy_channels, axis=-1)
115 |             yy_channels = K.permute_dimensions(yy_channels, [0, 2, 1, 3])
116 | 
117 |             xx_channels = K.cast(xx_channels, K.floatx())
118 |             xx_channels = xx_channels / K.cast(dim1 - 1, K.floatx())
119 |             xx_channels = (xx_channels * 2) - 1.
120 | 
121 |             yy_channels = K.cast(yy_channels, K.floatx())
122 |             yy_channels = yy_channels / K.cast(dim2 - 1, K.floatx())
123 |             yy_channels = (yy_channels * 2) - 1.
124 | 
125 |             outputs = K.concatenate([inputs, xx_channels, yy_channels], axis=-1)
126 | 
127 |             if self.use_radius:
128 |                 rr = K.sqrt(K.square(xx_channels - 0.5) +
129 |                             K.square(yy_channels - 0.5))
130 |                 outputs = K.concatenate([outputs, rr], axis=-1)
131 | 
132 |             if self.data_format == 'channels_first':
133 |                 outputs = K.permute_dimensions(outputs, [0, 3, 1, 2])
134 | 
135 |         if self.rank == 3:
136 |             if self.data_format == 'channels_first':
137 |                 inputs = K.permute_dimensions(inputs, [0, 2, 3, 4, 1])
138 |                 input_shape = K.shape(inputs)
139 | 
140 |             input_shape = [input_shape[i] for i in range(5)]
141 |             batch_shape, dim1, dim2, dim3, channels = input_shape
142 | 
143 |             xx_ones = K.ones(K.stack([batch_shape, dim3]), dtype='int32')
144 |             xx_ones = K.expand_dims(xx_ones, axis=-1)
145 | 
146 |             xx_range = K.tile(K.expand_dims(K.arange(0, dim2), axis=0),
147 |                               K.stack([batch_shape, 1]))
148 |             xx_range = K.expand_dims(xx_range, axis=1)
149 | 
150 |             xx_channels = K.batch_dot(xx_ones, xx_range, axes=[2, 1])
151 |             xx_channels = K.expand_dims(xx_channels, axis=-1)
152 |             xx_channels = K.permute_dimensions(xx_channels, [0, 2, 1, 3])
153 | 
154 |             xx_channels = K.expand_dims(xx_channels, axis=1)
155 |             xx_channels = K.tile(xx_channels,
156 |                                  [1, dim1, 1, 1, 1])
157 | 
158 |             yy_ones = K.ones(K.stack([batch_shape, dim2]), dtype='int32')
159 |             yy_ones = K.expand_dims(yy_ones, axis=1)
160 | 
161 |             yy_range = K.tile(K.expand_dims(K.arange(0, dim3), axis=0),
162 |                               K.stack([batch_shape, 1]))
163 |             yy_range = K.expand_dims(yy_range, axis=-1)
164 | 
165 |             yy_channels = K.batch_dot(yy_range, yy_ones, axes=[2, 1])
166 |             yy_channels = K.expand_dims(yy_channels, axis=-1)
167 |             yy_channels = K.permute_dimensions(yy_channels, [0, 2, 1, 3])
168 | 
169 |             yy_channels = K.expand_dims(yy_channels, axis=1)
170 |             yy_channels = K.tile(yy_channels,
171 |                                  [1, dim1, 1, 1, 1])
172 | 
173 |             zz_range = K.tile(K.expand_dims(K.arange(0, dim1), axis=0),
174 |                               K.stack([batch_shape, 1]))
175 |             zz_range = K.expand_dims(zz_range, axis=-1)
176 |             zz_range = K.expand_dims(zz_range, axis=-1)
177 | 
178 |             zz_channels = K.tile(zz_range,
179 |                                  [1, 1, dim2, dim3])
180 |             zz_channels = K.expand_dims(zz_channels, axis=-1)
181 | 
182 |             xx_channels = K.cast(xx_channels, K.floatx())
183 |             xx_channels = xx_channels / K.cast(dim2 - 1, K.floatx())
184 |             xx_channels = xx_channels * 2 - 1.
185 | 
186 |             yy_channels = K.cast(yy_channels, K.floatx())
187 |             yy_channels = yy_channels / K.cast(dim3 - 1, K.floatx())
188 |             yy_channels = yy_channels * 2 - 1.
189 | 
190 |             zz_channels = K.cast(zz_channels, K.floatx())
191 |             zz_channels = zz_channels / K.cast(dim1 - 1, K.floatx())
192 |             zz_channels = zz_channels * 2 - 1.
193 | 
194 |             outputs = K.concatenate([inputs, zz_channels, xx_channels, yy_channels],
195 |                                     axis=-1)
196 | 
197 |             if self.data_format == 'channels_first':
198 |                 outputs = K.permute_dimensions(outputs, [0, 4, 1, 2, 3])
199 | 
200 |         return outputs
201 | 
202 |     def compute_output_shape(self, input_shape):
203 |         assert input_shape and len(input_shape) >= 2
204 |         assert input_shape[self.axis]
205 | 
206 |         if self.use_radius and self.rank == 2:
207 |             channel_count = 3
208 |         else:
209 |             channel_count = self.rank
210 | 
211 |         output_shape = list(input_shape)
212 |         output_shape[self.axis] = input_shape[self.axis] + channel_count
213 |         return tuple(output_shape)
214 | 
215 |     def get_config(self):
216 |         config = {
217 |             'rank': self.rank,
218 |             'use_radius': self.use_radius,
219 |             'data_format': self.data_format
220 |         }
221 |         base_config = super(_CoordinateChannel, self).get_config()
222 |         return dict(list(base_config.items()) + list(config.items()))
223 | 
224 | 
class CoordinateChannel1D(_CoordinateChannel):
    """ Appends a single coordinate channel to a rank-1 (sequence) input.

    # Arguments
        data_format: A string,
            one of `"channels_last"` or `"channels_first"`,
            or None to use the `image_data_format` value from the
            Keras config file at `~/.keras/keras.json`
            ("channels_last" if it was never set).

    # Input shape
        3D tensor with shape: `(batch_size, steps, input_dim)`

    # Output shape
        3D tensor with shape: `(batch_size, steps, input_dim + 1)`

    # References:
        - [An Intriguing Failing of Convolutional Neural Networks and the CoordConv Solution](https://arxiv.org/abs/1807.03247)
    """

    def __init__(self, data_format=None, **kwargs):
        # Rank and radius are fixed for the 1D variant.
        super().__init__(rank=1, use_radius=False,
                         data_format=data_format, **kwargs)

    def get_config(self):
        """Return the base config without the arguments fixed by this class."""
        fixed_arguments = ('rank', 'use_radius')
        base_config = super().get_config()
        return {key: value for key, value in base_config.items()
                if key not in fixed_arguments}
262 | 
263 | 
class CoordinateChannel2D(_CoordinateChannel):
    """ Appends coordinate channels to a rank-2 (image-like) input.

    # Arguments
        use_radius: Boolean flag; when set, a radius channel is
            appended in addition to the two coordinate channels.
        data_format: A string,
            one of `"channels_last"` or `"channels_first"`,
            or None to use the `image_data_format` value from the
            Keras config file at `~/.keras/keras.json`
            ("channels_last" if it was never set).

    # Input shape
        4D tensor with shape:
        `(samples, channels, rows, cols)`
        if `data_format` is `"channels_first"`
        or 4D tensor with shape:
        `(samples, rows, cols, channels)`
        if `data_format` is `"channels_last"`.

    # Output shape
        4D tensor with shape:
        `(samples, channels + 2/3, rows, cols)`
        if `data_format` is `"channels_first"`
        or 4D tensor with shape:
        `(samples, rows, cols, channels + 2/3)`
        if `data_format` is `"channels_last"`.

        If `use_radius` is set, 3 additional channels are added,
        otherwise only 2.

    # References:
        - [An Intriguing Failing of Convolutional Neural Networks and the CoordConv Solution](https://arxiv.org/abs/1807.03247)
    """

    def __init__(self, use_radius=False,
                 data_format=None,
                 **kwargs):
        # Only the rank is fixed for the 2D variant.
        super().__init__(rank=2, use_radius=use_radius,
                         data_format=data_format, **kwargs)

    def get_config(self):
        """Return the base config without the rank fixed by this class."""
        base_config = super().get_config()
        return {key: value for key, value in base_config.items()
                if key != 'rank'}
318 | 
319 | 
class CoordinateChannel3D(_CoordinateChannel):
    """ Appends three coordinate channels to a rank-3 (volumetric) input.

    # Arguments
        data_format: A string,
            one of `"channels_last"` or `"channels_first"`,
            or None to use the `image_data_format` value from the
            Keras config file at `~/.keras/keras.json`
            ("channels_last" if it was never set).

    # Input shape
        5D tensor with shape:
        `(samples, channels, conv_dim1, conv_dim2, conv_dim3)`
        if `data_format` is `"channels_first"`
        or 5D tensor with shape:
        `(samples, conv_dim1, conv_dim2, conv_dim3, channels)`
        if `data_format` is `"channels_last"`.

    # Output shape
        5D tensor with shape:
        `(samples, channels + 3, conv_dim1, conv_dim2, conv_dim3)`
        if `data_format` is `"channels_first"`
        or 5D tensor with shape:
        `(samples, conv_dim1, conv_dim2, conv_dim3, channels + 3)`
        if `data_format` is `"channels_last"`.

    # References:
        - [An Intriguing Failing of Convolutional Neural Networks and the CoordConv Solution](https://arxiv.org/abs/1807.03247)
    """

    def __init__(self, data_format=None,
                 **kwargs):
        # Rank and radius are fixed for the 3D variant.
        super().__init__(rank=3, use_radius=False,
                         data_format=data_format, **kwargs)

    def get_config(self):
        """Return the base config without the arguments fixed by this class."""
        fixed_arguments = ('rank', 'use_radius')
        base_config = super().get_config()
        return {key: value for key, value in base_config.items()
                if key not in fixed_arguments}
373 | 
374 | 
# Make the coordinate layers available by name in the Keras custom-object
# registry.
get_custom_objects().update(
    CoordinateChannel1D=CoordinateChannel1D,
    CoordinateChannel2D=CoordinateChannel2D,
    CoordinateChannel3D=CoordinateChannel3D,
)
378 | 


--------------------------------------------------------------------------------
/imgs/hcd1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lkytal/PredFull/fc3641e0bc511e2947be7e33021ed871140db030/imgs/hcd1.png


--------------------------------------------------------------------------------
/imgs/hcd2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lkytal/PredFull/fc3641e0bc511e2947be7e33021ed871140db030/imgs/hcd2.png


--------------------------------------------------------------------------------
/imgs/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lkytal/PredFull/fc3641e0bc511e2947be7e33021ed871140db030/imgs/model.png


--------------------------------------------------------------------------------
/predfull.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import numpy as np
  4 | import pandas as pd
  5 | import math
  6 | from pyteomics import mgf, mass
  7 | import argparse
  8 | 
  9 | import tensorflow.keras as k
 10 | from tensorflow.keras import backend as K
 11 | from tensorflow.keras.layers import Layer, InputSpec
 12 | from tensorflow.keras.layers import (
 13 |     Conv1D,
 14 |     MaxPooling1D,
 15 |     Dense,
 16 |     Add,
 17 |     Flatten,
 18 |     Activation,
 19 |     BatchNormalization,
 20 |     LayerNormalization,
 21 | )
 22 | from tensorflow.keras import Model, Input
 23 | 
 24 | from coord_tf import CoordinateChannel2D, CoordinateChannel1D
 25 | 
 26 | # Hyper Parameters
 27 | precision = 0.1
 28 | MZ_START = 0
 29 | SPEC_DIMENSION = 20000
 30 | 
 31 | PRECURSOR_SCALE = 20000.0
 32 | LENGTH_SCALE = 1000
 33 | 
 34 | MAX_CHARGE = 30
 35 | 
 36 | mono = {
 37 |     "G": 57.021464,
 38 |     "A": 71.037114,
 39 |     "S": 87.032029,
 40 |     "P": 97.052764,
 41 |     "V": 99.068414,
 42 |     "T": 101.04768,
 43 |     "C": 160.03019,
 44 |     "L": 113.08406,
 45 |     "I": 113.08406,
 46 |     "D": 115.02694,
 47 |     "Q": 128.05858,
 48 |     "K": 128.09496,
 49 |     "E": 129.04259,
 50 |     "M": 131.04048,
 51 |     "m": 147.0354,
 52 |     "H": 137.05891,
 53 |     "F": 147.06441,
 54 |     "R": 156.10111,
 55 |     "Y": 163.06333,
 56 |     "N": 114.04293,
 57 |     "W": 186.07931,
 58 |     "O": 147.03538,
 59 | }
 60 | 
 61 | ave_mass = {
 62 |     "A": 71.0788,
 63 |     "R": 156.1875,
 64 |     "N": 114.1038,
 65 |     "D": 115.0886,
 66 |     "C": 160.1598,
 67 |     "E": 129.1155,
 68 |     "Q": 128.1307,
 69 |     "G": 57.0519,
 70 |     "H": 137.1411,
 71 |     "I": 113.1594,
 72 |     "L": 113.1594,
 73 |     "K": 128.1741,
 74 |     "M": 131.1926,
 75 |     "F": 147.1766,
 76 |     "P": 97.1167,
 77 |     "S": 87.0782,
 78 |     "T": 101.1051,
 79 |     "W": 186.2132,
 80 |     "Y": 163.1760,
 81 |     "V": 99.1326,
 82 | }
 83 | 
 84 | Alist = list("ACDEFGHIKLMNPQRSTVWYZ")
 85 | encoding_dimension = len(Alist) + 3
 86 | 
 87 | charMap = {"*": 0, "]": len(Alist) + 1, "[": len(Alist) + 2}
 88 | for i, a in enumerate(Alist):
 89 |     charMap[a] = i + 1
 90 | 
 91 | 
 92 | # help functions
 93 | def mz2pos(mz, pre=precision):
 94 |     return int(round((mz - MZ_START) / pre))
 95 | 
 96 | 
 97 | def pos2mz(pos, pre=precision):
 98 |     return pos * pre + MZ_START
 99 | 
100 | 
def asnp(x):
    """Return `x` as a NumPy array (no copy when it already is one)."""
    as_array = np.asarray(x)
    return as_array
103 | 
104 | 
def asnp32(x):
    """Return `x` as a NumPy array of dtype float32."""
    return np.asarray(x, dtype=np.float32)
107 | 
108 | 
def f2(x):
    """Format `x` as fixed-point text with two decimals."""
    return f"{x:.2f}"
111 | 
112 | 
def f4(x):
    """Format `x` as fixed-point text with four decimals."""
    return f"{x:.4f}"
115 | 
116 | 
# compute precursor mass
def fastmass(pep, ion_type, charge, mod=None, cam=True):
    """Return the m/z of `pep` from pyteomics plus fixed/variable mod shifts.

    Args:
        pep: plain amino-acid sequence.
        ion_type: ion type passed through to `mass.fast_mass`.
        charge: charge state passed through to `mass.fast_mass`; also the
            divisor for the per-residue shifts below.
        mod: optional NumPy array of per-residue modification codes.
            Entries equal to 1 each add 15.995 / charge; negative entries
            have their magnitude added without charge scaling.
        cam: when true, add 57.021 / charge for every "C" in `pep`.
    """
    total = mass.fast_mass(pep, ion_type=ion_type, charge=charge)

    if cam:
        total += 57.021 * pep.count("C") / charge

    if mod is None:
        return total

    total += 15.995 * np.sum(mod == 1) / charge
    # Negative codes: subtracting their (negative) sum adds the magnitude.
    total += -np.sum(mod[mod < 0])
    return total
129 | 
130 | 
131 | # help function to parse modifications
132 | 
133 | 
def getmod(pep):
    """Split an annotated peptide string into sequence and modifications.

    Recognised annotations:
      * ``(O)`` or ``(ox)`` after a residue: that residue gets code ``1``.
      * ``(X)`` with ``X`` an upper-case letter: that residue gets ``-2``.
      * ``+<number>`` / ``-<number>``: signed mass shift added to the
        preceding residue, or to the N-terminus when no residue precedes it.

    Any other character is taken as a residue. A bracket annotation that
    appears before the first residue has nothing to attach to and is dropped.

    Args:
        pep: annotated peptide string.

    Returns:
        ``(sequence, mod, nmod)`` where ``sequence`` is the bare residue
        string, ``mod`` is a float array with one entry per residue and
        ``nmod`` is the summed N-terminal mass shift (``0`` if none).

    Raises:
        ValueError: on an unrecognised bracket annotation or a sign that is
            not followed by a valid number.
    """
    mod = np.zeros(len(pep))

    # Fast path: nothing but letters means no annotations at all.
    if pep.isalpha():
        return pep, mod, 0

    number_chars = ".1234567890"
    substitution_letters = "ASDFGHJKLZXCVBNMQWERTYUIOP"

    seq = []
    nmod = 0
    pos = 0
    total = len(pep)

    while pos < total:
        ch = pep[pos]
        last = len(seq) - 1  # residue an annotation attaches to; -1 = none yet

        if ch == "(":
            if pep.startswith("(O)", pos):
                code, width = 1, 3
            elif pep.startswith("(ox)", pos):
                code, width = 1, 4
            elif (
                pos + 2 < total
                and pep[pos + 2] == ")"
                and pep[pos + 1] in substitution_letters
            ):
                code, width = -2, 3
            else:
                raise ValueError("unknown mod: " + pep[pos:])

            if last >= 0:
                mod[last] = code
            pos += width

        elif ch == "+" or ch == "-":
            sign = 1 if ch == "+" else -1

            # Consume the run of digits/dots that forms the number.
            end = pos + 1
            while end < total and pep[end] in number_chars:
                end += 1

            try:
                shift = sign * float(pep[pos + 1:end])
            except ValueError:
                raise ValueError("invalid mass shift: " + pep[pos:]) from None

            if last == -1:  # N-term mod
                nmod += shift
            else:
                mod[last] += shift
            pos = end

        else:
            seq.append(ch)
            pos += 1

    return "".join(seq), mod[: len(seq)], nmod
179 | 
180 | 
181 | META_SHAPE = (3, 30)  # (charge, ftype, other(mass, nce))
182 | INPUT_DIMENSION = encoding_dimension + 2 + 3
183 | INPUT_SHAPE = [-1, INPUT_DIMENSION]
184 | 
185 | 
186 | # embed input item into a matrix
def embed(
    spectrum, shape=INPUT_SHAPE, mass_scale=200, embedding=None, ignore=False, pep=None
):
    """Write the feature matrix for one peptide and return it.

    Args:
        spectrum: dict with key "pep" (used when `pep` is None) and an
            optional "mod" sequence of per-residue modification codes.
        shape: shape of the matrix allocated when `embedding` is None.
        mass_scale: divisor applied to each residue's monoisotopic mass.
        embedding: optional pre-allocated array that is filled in place.
        ignore: unused.
        pep: optional sequence overriding `spectrum["pep"]`.

    Returns:
        The filled `embedding` array (the same object when one was passed).
    """
    if embedding is None:
        embedding = np.zeros(shape, dtype="float32")

    sequence = spectrum["pep"] if pep is None else pep
    sequence = sequence.replace("L", "I")
    length = len(sequence)

    end_col = encoding_dimension - 1
    mass_col = encoding_dimension
    position_col = encoding_dimension + 1
    mod_base_col = encoding_dimension + 2

    embedding[length, end_col] = 1  # ending pos
    for row, residue in enumerate(sequence):
        embedding[row, charMap[residue]] = 1  # one-hot residue column
        embedding[row, mass_col] = mono[residue] / mass_scale

    # position info
    embedding[:length, position_col] = np.arange(length) / LENGTH_SCALE
    embedding[length + 1, 0] = 1  # padding info

    if "mod" in spectrum:
        for row, code in enumerate(spectrum["mod"]):
            embedding[row, mod_base_col + int(code)] = 1

    return embedding
212 | 
213 | 
214 | # preprocess function for inputs
def preprocessor(batch):
    """Turn a list of spectrum dicts into the `(embedding, meta)` input pair.

    `meta[i, 0]` is a one-hot of `charge - 1`, `meta[i, 1]` a one-hot of the
    fragmentation type index, `meta[i, 2, 0]` the singly-charged mass divided
    by `PRECURSOR_SCALE` and `meta[i, 2, -1]` the NCE divided by 100 (0.25
    when the NCE is missing or 0).
    """
    count = len(batch)
    embedding = np.zeros((count, *INPUT_SHAPE), dtype="float32")
    meta = np.zeros((count, *META_SHAPE), dtype="float32")

    for row, sp in enumerate(batch):
        sequence = sp["pep"]

        embed(sp, embedding=embedding[row])
        meta[row, 0, sp["charge"] - 1] = 1  # charge
        meta[row, 1, sp["type"]] = 1  # ftype
        meta[row, 2, 0] = (
            fastmass(sequence, ion_type="M", charge=1) / PRECURSOR_SCALE
        )

        has_nce = "nce" in sp and not sp["nce"] == 0
        meta[row, 2, -1] = sp["nce"] / 100.0 if has_nce else 0.25

    return (embedding, meta)
234 | 
235 | 
236 | # generator for inputs
class input_generator(k.utils.Sequence):
    """Keras `Sequence` that serves pre-processed batches of spectra.

    Args:
        spectra: list of spectrum dicts.
        processor: callable mapping a list of spectra to model inputs.
        batch_size: number of spectra per batch.
        shuffle: truthy to reshuffle `spectra` in `on_epoch_begin`.
    """

    def __init__(self, spectra, processor, batch_size, shuffle=1):
        # Initialise the Keras base class; newer Keras versions expect this.
        super().__init__()
        self.spectra = spectra
        self.processor = processor
        self.batch_size = batch_size
        self.shuffle = shuffle

    def on_epoch_begin(self, epoch):
        # NOTE(review): Keras `Sequence` documents `on_epoch_end` as its
        # hook; confirm that something actually calls this method.
        if epoch > 0 and self.shuffle:
            np.random.shuffle(self.spectra)

    def __len__(self):
        """Number of batches, counting a final partial batch."""
        return math.ceil(len(self.spectra) / self.batch_size)

    def __getitem__(self, idx):
        """Return batch `idx` as a 1-tuple holding the processed inputs."""
        start_idx = idx * self.batch_size
        end_idx = min(start_idx + self.batch_size, len(self.spectra))

        return (self.processor(self.spectra[start_idx:end_idx]),)
256 | 
257 | 
258 | # functions that transfer predictions into mgf format
def sparse(x, y, th=0.0002):
    """Normalise `y` to a maximum of 1 and keep only entries above `th`.

    Args:
        x: positions (e.g. m/z values), same length as `y`.
        y: intensities.
        th: relative-intensity threshold; entries `<= th` are dropped.

    Returns:
        `(x_kept, y_kept)` as float32 arrays. Both are empty when `y` is
        empty or its maximum is 0. The caller's `y` is never modified.
    """
    x = np.asarray(x, dtype="float32")
    y = np.asarray(y, dtype="float32")

    if y.size == 0:
        return x[:0], y[:0]

    peak = np.max(y)
    if peak == 0:
        # Nothing to normalise against; avoid a 0/0 division.
        return x[:0], y[:0]

    # Out-of-place division: np.asarray may alias the caller's array.
    scaled = y / peak
    keep = scaled > th

    return x[keep], scaled[keep]
266 | 
267 | 
def tomgf(sp, y):
    """Render one predicted spectrum as an MGF "BEGIN IONS ... END IONS" entry.

    Bins of `y` at or beyond `ceil(mass * charge / precision)` are set to 0
    in place before the peaks are extracted with `sparse`. Each peak line is
    "<m/z to 2 decimals> <intensity * 1000 to 4 decimals>".
    """
    header_lines = [
        "BEGIN IONS",
        f"TITLE={sp['title']}",
        f"PEPTIDE={sp['title']}",
        f"CHARGE={sp['charge']}+",
        f"PEPMASS={sp['mass']}",
    ]
    head = "\n".join(header_lines) + "\n"

    cutoff = min(math.ceil(sp["mass"] * sp["charge"] / precision), len(y))
    y[cutoff:] = 0

    # m/z value of every bin, computed from the integer bin index
    imz = np.arange(0, SPEC_DIMENSION, dtype="int32") * precision + MZ_START
    mzs, its = sparse(imz, y)

    peak_text = "\n".join(
        f"{f2(mz)} {f4(it * 1000)}" for mz, it in zip(mzs, its)
    )

    return head + peak_text + "\nEND IONS"
289 | 
290 | 
parser = argparse.ArgumentParser()
parser.add_argument("--input", type=str, help="input file path", default="example.tsv")
# Parsed as int: it is multiplied and compared numerically below.
parser.add_argument("--batch_size", type=int, help="batch size per loop", default=256)
parser.add_argument(
    "--output", type=str, help="output file path", default="example_prediction.mgf"
)
parser.add_argument("--model", type=str, help="model file path", default="pm.h5")

args = parser.parse_args()

K.clear_session()

pm = k.models.load_model(args.model, compile=False)
pm.compile(
    optimizer=k.optimizers.Adam(learning_rate=0.0003), loss="cosine_similarity"
)

# fragmentation types
types = {"un": 0, "cid": 1, "etd": 2, "hcd": 3, "ethcd": 4, "etcid": 5}

# read inputs
inputs = []
for item in pd.read_csv(args.input, sep="\t").itertuples():
    if item.Charge < 1 or item.Charge > MAX_CHARGE:
        print("input", item.Peptide, "exceed max charge of", MAX_CHARGE, ", ignored")
        continue

    pep, mod, nterm_mod = getmod(item.Peptide)

    if nterm_mod != 0:
        print("input", item.Peptide, "has N-term modification, ignored")
        continue

    # Any per-residue code other than 0 (none) or 1 (oxidation) is unsupported.
    if np.any((mod != 0) & (mod != 1)):
        print("Only Oxidation modification is supported, ignored", item.Peptide)
        continue

    frag_type = types.get(str(item.Type).lower())
    if frag_type is None:
        print("input", item.Peptide, "has unknown fragmentation type", item.Type, ", ignored")
        continue

    inputs.append(
        {
            "pep": pep,
            "mod": mod,
            "charge": item.Charge,
            "title": item.Peptide,
            "nce": item.NCE,
            "type": frag_type,
            "mass": fastmass(pep, "M", item.Charge, mod=mod),
        }
    )

    INPUT_SHAPE[0] = max(
        INPUT_SHAPE[0], len(pep) + 2
    )  # update xshape to match max input peptide

# Predict in chunks of `loop_size` spectra so only one chunk of dense
# predictions is held in memory at a time.
batch_per_loop = 64
loop_size = args.batch_size * batch_per_loop

with open(args.output, "w+") as f:
    for start in range(0, len(inputs), loop_size):
        sliced_spectra = inputs[start : start + loop_size]

        y = pm.predict(
            input_generator(sliced_spectra, preprocessor, batch_size=args.batch_size),
            verbose=1,
        )
        y = np.square(y)

        f.writelines("%s\n\n" % tomgf(sp, yi) for sp, yi in zip(sliced_spectra, y))

print("Prediction finished")
365 | 


--------------------------------------------------------------------------------
/train_model.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import numpy as np
  4 | import math
  5 | from pyteomics import mgf, mass
  6 | import argparse
  7 | 
  8 | import tensorflow as tf
  9 | import tensorflow.keras as k
 10 | from tensorflow.keras import backend as K
 11 | from tensorflow.keras.layers import (
 12 |     Conv1D,
 13 |     MaxPooling1D,
 14 |     Dense,
 15 |     Add,
 16 |     Flatten,
 17 |     Activation,
 18 |     BatchNormalization,
 19 | )
 20 | from tensorflow.keras import Model, Input
 21 | 
 22 | from coord_tf import CoordinateChannel2D, CoordinateChannel1D
 23 | 
 24 | # hyper parameter and constants
 25 | 
 26 | SPECTRA_DIMENSION = 20000
 27 | BIN_SIZE = 0.1
 28 | MAX_PEPTIDE_LENGTH = 30
 29 | MAX_MZ = 2000
 30 | LENGTH_SCALE = 1000
 31 | PRECURSOR_SCALE = 20000.0
 32 | 
 33 | 
 34 | mono = {
 35 |     "G": 57.021464,
 36 |     "A": 71.037114,
 37 |     "S": 87.032029,
 38 |     "P": 97.052764,
 39 |     "V": 99.068414,
 40 |     "T": 101.04768,
 41 |     "C": 160.03019,
 42 |     "L": 113.08406,
 43 |     "I": 113.08406,
 44 |     "D": 115.02694,
 45 |     "Q": 128.05858,
 46 |     "K": 128.09496,
 47 |     "E": 129.04259,
 48 |     "M": 131.04048,
 49 |     "m": 147.0354,
 50 |     "H": 137.05891,
 51 |     "F": 147.06441,
 52 |     "R": 156.10111,
 53 |     "Y": 163.06333,
 54 |     "N": 114.04293,
 55 |     "W": 186.07931,
 56 |     "O": 147.03538,
 57 | }
 58 | 
 59 | ave_mass = {
 60 |     "A": 71.0788,
 61 |     "R": 156.1875,
 62 |     "N": 114.1038,
 63 |     "D": 115.0886,
 64 |     "C": 160.1598,
 65 |     "E": 129.1155,
 66 |     "Q": 128.1307,
 67 |     "G": 57.0519,
 68 |     "H": 137.1411,
 69 |     "I": 113.1594,
 70 |     "L": 113.1594,
 71 |     "K": 128.1741,
 72 |     "M": 131.1926,
 73 |     "F": 147.1766,
 74 |     "P": 97.1167,
 75 |     "S": 87.0782,
 76 |     "T": 101.1051,
 77 |     "W": 186.2132,
 78 |     "Y": 163.1760,
 79 |     "V": 99.1326,
 80 | }
 81 | 
 82 | Alist = list("ACDEFGHIKLMNPQRSTVWYZ")
 83 | ENCODE_DIMENSION = len(Alist) + 3
 84 | 
 85 | charMap = {"@": 0, "[": 21}
 86 | for i, a in enumerate(Alist):
 87 |     charMap[a] = i + 1
 88 | 
 89 | # help functions
 90 | 
 91 | 
 92 | def asnp(x):
 93 |     return np.asarray(x)
 94 | 
 95 | 
 96 | def asnp32(x):
 97 |     return np.asarray(x, dtype="float32")
 98 | 
 99 | 
# compute precursor mass
def fastmass(pep, ion_type, charge, mod=None, cam=True):
    """Return the m/z of `pep` from pyteomics plus fixed/variable mod shifts.

    Args:
        pep: plain amino-acid sequence.
        ion_type: ion type passed through to `mass.fast_mass`.
        charge: charge state passed through to `mass.fast_mass`; also the
            divisor for the per-residue shifts below.
        mod: optional NumPy array of per-residue modification codes.
            Entries equal to 1 each add 15.995 / charge; negative entries
            have their magnitude added without charge scaling.
        cam: when true, add 57.021 / charge for every "C" in `pep`.
    """
    total = mass.fast_mass(pep, ion_type=ion_type, charge=charge)

    if cam:
        total += 57.021 * pep.count("C") / charge

    if mod is None:
        return total

    total += 15.995 * np.sum(mod == 1) / charge
    # Negative codes: subtracting their (negative) sum adds the magnitude.
    total += -np.sum(mod[mod < 0])
    return total
112 | 
113 | 
114 | INPUT_LENGTH = MAX_PEPTIDE_LENGTH + 2
115 | INPUT_DIMENSION = ENCODE_DIMENSION + 2 + 3
116 | META_SHAPE = (3, 30)
117 | 
118 | 
119 | # embed input item into a matrix
def embed(spectrum, embedding, mass_scale=200):
    """Fill `embedding` in place with the features of `spectrum["pep"]`.

    Per residue row: a one-hot residue column, the monoisotopic mass divided
    by `mass_scale`, and the residue index divided by `LENGTH_SCALE`. The row
    after the last residue gets the end marker and the row after that the
    padding marker. Returns `embedding`.
    """
    sequence = spectrum["pep"].replace("L", "I")
    length = len(sequence)

    end_col = ENCODE_DIMENSION - 1
    mass_col = ENCODE_DIMENSION
    position_col = ENCODE_DIMENSION + 1

    embedding[length, end_col] = 1  # ending pos
    for row, residue in enumerate(sequence):
        embedding[row, charMap[residue]] = 1  # one-hot residue column
        embedding[row, mass_col] = mono[residue] / mass_scale

    # position info
    embedding[:length, position_col] = np.arange(length) / LENGTH_SCALE

    embedding[length + 1, 0] = 1  # padding info

    return embedding
136 | 
137 | 
def preprocessor(batch):
    """Turn a list of spectrum dicts into the `(embedding, meta)` input pair.

    `meta[i, 0]` is a one-hot of `charge - 1`, `meta[i, 1]` a one-hot of the
    fragmentation type index, `meta[i, 2, 0]` the singly-charged mass divided
    by `PRECURSOR_SCALE` and `meta[i, 2, -1]` the NCE divided by 100 (0.25
    when the NCE is missing or 0).

    Raises:
        ValueError: if a peptide is longer than `MAX_PEPTIDE_LENGTH`.
    """
    batch_size = len(batch)
    embedding = np.zeros((batch_size, INPUT_LENGTH, INPUT_DIMENSION), dtype="float32")
    meta = np.zeros((batch_size, *META_SHAPE), dtype="float32")

    for i, sp in enumerate(batch):
        pep = sp["pep"]

        if len(pep) > MAX_PEPTIDE_LENGTH:
            # A bare string cannot be raised in Python 3.
            raise ValueError("input too long")

        embed(sp, embedding=embedding[i])
        meta[i][0][sp["charge"] - 1] = 1  # charge
        meta[i][1][sp["type"]] = 1  # ftype
        meta[i][2][0] = fastmass(pep, ion_type="M", charge=1) / PRECURSOR_SCALE

        if "nce" not in sp or sp["nce"] == 0:
            meta[i][2][-1] = 0.25
        else:
            meta[i][2][-1] = sp["nce"] / 100.0

    return (embedding, meta)
160 | 
161 | 
162 | # read inputs
def _nce_from_hcd(raw, charge, precursor_mz):
    """Convert an "hcd" parameter to an NCE value; 0 when it cannot be parsed.

    "<number>%" is returned as that number. "<number>eV" is converted with
    `eV * 500 * ratio[charge] / precursor_mz`.
    """
    # ratio constants for NCE
    cr = {1: 1, 2: 0.9, 3: 0.85, 4: 0.8, 5: 0.75, 6: 0.75, 7: 0.75, 8: 0.75}

    try:
        text = str(raw).strip()
        if text.endswith("%"):
            # Strip the unit before converting; float("30%") would raise.
            return float(text[:-1])
        if text.endswith("eV"):
            return float(text[:-2]) * 500 * cr[charge] / precursor_mz
    except (ValueError, KeyError, TypeError, ZeroDivisionError):
        pass  # malformed number, unsupported charge or bad precursor

    return 0


def parse_spectra(sps, spec_type=3):
    """Convert parsed MGF spectra into the dicts used for training.

    Args:
        sps: iterable of spectra, each a mapping with "params",
            "m/z array" and "intensity array".
        spec_type: fragmentation type index stored under "type".

    Returns:
        List of dicts with keys "pep", "charge", "mass", "mz", "it", "nce"
        and "type". "pep" comes from the "seq" parameter, falling back to
        "title". "mass" comes from "pepmass", falling back to "parent".
        "nce" is 0 when no usable "hcd" parameter is present.
    """
    db = []

    for sp in sps:
        param = sp["params"]

        # first character of the first charge entry, e.g. "2+" -> 2
        c = int(str(param["charge"][0])[0])

        pep = param["seq"] if "seq" in param else param["title"]

        if "pepmass" in param:
            precursor = param["pepmass"][0]
        else:
            precursor = float(param["parent"])

        nce = _nce_from_hcd(param["hcd"], c, precursor) if "hcd" in param else 0

        db.append(
            {
                "pep": pep,
                "charge": c,
                "mass": precursor,
                "mz": sp["m/z array"],
                "it": sp["intensity array"],
                "nce": nce,
                "type": spec_type,
            }
        )

    return db
215 | 
216 | 
217 | spec_types = {"unknown": 0, "cid": 1, "etd": 2, "hcd": 3, "ethcd": 4, "etcid": 5}
218 | 
219 | 
def readmgf(fn, type="hcd"):
    """Read the MGF file `fn` and return its spectra as training dicts.

    Args:
        fn: path of the MGF file.
        type: fragmentation type name; must be a key of `spec_types`.

    Returns:
        The list produced by `parse_spectra`.

    Raises:
        KeyError: if `type` is not a known fragmentation type.
    """
    # Resolve the type first so a bad name fails before the file is opened.
    spec_type = spec_types[type]

    # `with` closes the file even when reading or parsing raises.
    with open(fn, "r") as file:
        data = mgf.read(
            file, convert_arrays=1, read_charges=False, dtype="float32", use_index=False
        )
        return parse_spectra(data, spec_type=spec_type)
229 | 
230 | 
def spectrum2vector(mz_list, itensity_list, mass, bin_size, charge):
    """Bin a peak list into a fixed-length, square-root-scaled vector.

    Intensities are divided by their maximum, summed into bins of width
    `bin_size`, square-rooted, and the bins of the precursor and its +1/+2
    isotope peaks are cleared.

    Args:
        mz_list: peak m/z values.
        itensity_list: peak intensities, same length as `mz_list`.
        mass: precursor m/z.
        bin_size: bin width in m/z units.
        charge: precursor charge, used for the isotope spacing.

    Returns:
        float32 vector of length `SPECTRA_DIMENSION`. Peaks whose bin falls
        outside the vector are ignored.
    """
    vector = np.zeros(SPECTRA_DIMENSION, dtype="float32")

    mz_list = np.asarray(mz_list)
    intensities = np.asarray(itensity_list, dtype="float32")

    if mz_list.size > 0:
        peak = np.max(intensities)
        if peak > 0:
            intensities = intensities / peak

        indexes = np.around(mz_list / bin_size).astype("int64")

        # Drop peaks outside the vector: too-large indexes would raise and
        # negative ones would wrap around to the high-mass end.
        in_range = (indexes >= 0) & (indexes < SPECTRA_DIMENSION)

        # np.add.at accumulates correctly when several peaks share a bin.
        np.add.at(vector, indexes[in_range], intensities[in_range])

    # normalize
    vector = np.sqrt(vector)

    # remove precursors, including isotopic peaks
    for delta in (0, 1, 2):
        precursor_mz = mass + delta / charge
        index = int(round(precursor_mz / bin_size))
        if precursor_mz > 0 and 0 <= index < SPECTRA_DIMENSION:
            vector[index] = 0

    return vector
254 | 
255 | 
256 | # building the model
257 | 
258 | 
def res_block(x, layers, kernel=(3,), act="relu", se=0, **kws):
    """Residual 1D convolution block with optional squeeze-and-excitation.

    Args:
        x: rank-3 input tensor.
        layers: number of output channels.
        kernel: kernel size of the main convolution.
        act: activation applied after the residual addition.
        se: 1 to gate the convolved features with a channel-wise
            squeeze-and-excitation branch.
        **kws: extra keyword arguments for the main convolution.

    Returns:
        The activated sum of the shortcut and the convolved features.
    """
    normalizer = BatchNormalization
    ConvLayer = k.layers.Conv1D
    GlobalPoolingLayer = k.layers.GlobalAveragePooling1D
    assert K.ndim(x) == 3

    raw_x = x  # backup input

    x = ConvLayer(layers, kernel_size=kernel, padding="same", **kws)(x)
    # Zero-initialised gamma makes the block start out as its shortcut.
    x = normalizer(gamma_initializer="zeros")(x)

    if se == 1:
        x2 = GlobalPoolingLayer()(x)
        x2 = Dense(max(4, layers // 16), activation="relu")(x2)
        x2 = Dense(layers, activation="sigmoid")(x2)
        x2 = k.layers.Reshape((1, -1))(x2)

        x = k.layers.Multiply()([x, x2])

    # Project the shortcut when the INPUT channel count differs from
    # `layers`; the convolved `x` always has `layers` channels already.
    if K.int_shape(raw_x)[-1] != layers:
        raw_x = ConvLayer(layers, kernel_size=1, padding="same")(raw_x)
        raw_x = normalizer()(raw_x)

    x = Add()([raw_x, x])

    return Activation(act)(x)  # final activation
289 | 
290 | 
def build(act="relu"):
    """Assemble the spectrum prediction model.

    The meta input is flattened, passed through a Dense(8) layer, repeated
    along the sequence axis and concatenated with the sequence embedding.
    Eight parallel convolutions (kernel sizes 2-9, 64 channels each) are
    added to a 1x1 projection, followed by eight SE residual blocks
    (kernel 3) and three plain residual blocks (kernel 1). A 1x1 convolution
    to `SPECTRA_DIMENSION` channels with sigmoid is then averaged over the
    sequence axis.

    Args:
        act: activation used after the first addition and in all residual
            blocks.

    Returns:
        The uncompiled Keras model with inputs `[embedding, meta]`.
    """
    seq_in = Input(shape=(INPUT_LENGTH, INPUT_DIMENSION), name="enbedding_input")
    meta_in = Input(shape=(*META_SHAPE,), name="meta_input")

    flat_meta = k.layers.Flatten()(meta_in)
    meta_vec = k.layers.Dense(8, activation="relu")(flat_meta)
    meta_vec = k.layers.Reshape((1, -1))(meta_vec)
    meta_vec = tf.repeat(meta_vec, K.shape(seq_in)[1], axis=1)
    x = k.layers.Concatenate(axis=-1)([seq_in, meta_vec])

    x = CoordinateChannel1D()(x)  # add positional information

    def conv_normal(tensor, channel, kernel, padding="same"):
        tensor = Conv1D(channel, kernel_size=kernel, padding=padding)(tensor)
        return BatchNormalization(gamma_initializer="zeros")(tensor)

    # 8 branches x 64 channels = 512, matching the 1x1 projection below.
    branches = []
    for width in range(2, 10):
        branches.append(conv_normal(x, 64, width))
    features = k.layers.Concatenate(axis=-1)(branches)

    x = Conv1D(512, kernel_size=1, padding="same")(x)
    x = BatchNormalization()(x)

    x = Add()([x, features])
    x = Activation(act)(x)

    # (kernel size, squeeze-and-excitation flag) for each residual block
    block_plan = [(3, 1)] * 8 + [(1, 0)] * 3
    for kernel, use_se in block_plan:
        x = res_block(x, 512, kernel, act=act, se=use_se)

    x = k.layers.Conv1D(SPECTRA_DIMENSION, kernel_size=1, padding="valid")(x)
    x = Activation("sigmoid")(x)
    x = k.layers.GlobalAveragePooling1D(name="spectrum")(x)

    return k.models.Model(
        inputs=[seq_in, meta_in], outputs=x, name="predfull_model"
    )
329 | 
330 | 
parser = argparse.ArgumentParser()
parser.add_argument(
    "--mgf", type=str, help="training spectra (mgf) file path", default="hcd_testingset.mgf"
)
parser.add_argument(
    "--out", type=str, help="filename to save the trained model", default="trained.h5"
)

args = parser.parse_args()

K.clear_session()

pm = build()
pm.compile(
    optimizer=k.optimizers.Adam(learning_rate=0.0003), loss="cosine_similarity"
)
# summary() prints by itself and returns None, so it is not wrapped in print().
pm.summary()


print("Reading mgf...", args.mgf)
spectra = readmgf(args.mgf, type="hcd")

# Training targets: one binned vector per spectrum.
y = [
    spectrum2vector(sp["mz"], sp["it"], sp["mass"], BIN_SIZE, sp["charge"])
    for sp in spectra
]

x = preprocessor(spectra)

pm.fit(x=x, y=np.asarray(y, dtype="float32"), epochs=50, verbose=1)
pm.save(args.out)
360 | 


--------------------------------------------------------------------------------