├── .gitattributes
├── LICENSE
├── README.md
├── deepnovo_cython_setup.py
├── deepnovo_main.py
├── aa_workflow_step_4_2.py
├── deepnovo_cython_modules.pyx
├── deepnovo_preprocess.py
├── deepnovo_config.py
├── plot.py
├── deepnovo_worker_test.py
├── deepnovo_postprocess.py
├── aa_workflow.py
├── deepnovo_worker_db.py
├── aa_workflow_step_5.py
└── deepnovo_worker_io.py


/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | DeepNovoAA is publicly available for non-commercial uses.
2 | Copyright (C) 2020. Authors. All rights reserved.
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DeepNovo-AA
2 | 
3 | ## General information
4 | 
5 | - Publication: Personalized deep learning of individual immunopeptidomes to identify neoantigens for cancer vaccines. Nature Machine Intelligence, 2020. (https://www.nature.com/articles/s42256-020-00260-4)
6 | 
7 | - To run the workflow on an example dataset, follow step-by-step instructions and Python scripts in the file `aa_workflow.py`.
8 | 


--------------------------------------------------------------------------------
/deepnovo_cython_setup.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Hieu Tran. All Rights Reserved.
 2 | #
 3 | # DeepNovo is publicly available for non-commercial uses.
 4 | # ==============================================================================
 5 | 
 6 | """TODO(nh2tran): docstring."""
 7 | 
 8 | import os
 9 | from distutils.core import setup
10 | from Cython.Build import cythonize
11 | import numpy
12 | 
# Translate the Cython source into extension-module definitions.
extension_modules = cythonize("deepnovo_cython_modules.pyx")

# Header search path: the NumPy include directory and its `numpy`
# sub-directory, needed because the .pyx source does `cimport numpy`.
numpy_include_dir = numpy.get_include()
header_dirs = [numpy_include_dir, os.path.join(numpy_include_dir, 'numpy')]

setup(ext_modules=extension_modules, include_dirs=header_dirs)
15 | 


--------------------------------------------------------------------------------
/deepnovo_main.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Hieu Tran. All Rights Reserved.
 2 | #
 3 | # DeepNovo is publicly available for non-commercial uses.
 4 | # ==============================================================================
 5 | 
 6 | """TODO(nh2tran): docstring."""
 7 | 
 8 | from __future__ import absolute_import
 9 | from __future__ import division
10 | from __future__ import print_function
11 | 
12 | import tensorflow as tf
13 | 
14 | import deepnovo_config
15 | import deepnovo_model
16 | import deepnovo_worker_db
17 | import deepnovo_worker_denovo
18 | import deepnovo_worker_io
19 | import deepnovo_worker_test
20 | import deepnovo_main_modules
21 | 
22 | 
def main(_):
  """Run the DeepNovo routine selected by `deepnovo_config.FLAGS`.

  The flags are checked in this order and only the first one that is set is
  executed: knapsack_build, train, test_true_feeding, decode, search_denovo,
  search_db, search_hybrid, test. If none of them is set, an error message is
  printed and the process exits with status 1.

  Args:
    _: ignored positional argument (presumably the leftover argv that the
      launcher passes to main -- TODO confirm).
  """

  # Imported locally because this module has no file-level `import sys`; the
  # original error branch therefore failed with NameError instead of exiting.
  import sys

  print("main()")

  if deepnovo_config.FLAGS.knapsack_build:
    deepnovo_main_modules.knapsack_build()
  elif deepnovo_config.FLAGS.train:
    deepnovo_main_modules.train()
  elif deepnovo_config.FLAGS.test_true_feeding:
    deepnovo_main_modules.test_true_feeding()
  elif deepnovo_config.FLAGS.decode:
    deepnovo_main_modules.decode()
  elif deepnovo_config.FLAGS.search_denovo:
    model = deepnovo_model.ModelInference()
    model.build_model()
    worker_io = deepnovo_worker_io.WorkerIO(
        input_spectrum_file=deepnovo_config.denovo_input_spectrum_file,
        input_feature_file=deepnovo_config.denovo_input_feature_file,
        output_file=deepnovo_config.denovo_output_file)
    worker_denovo = deepnovo_worker_denovo.WorkerDenovo()
    worker_denovo.search_denovo(model, worker_io)
  elif deepnovo_config.FLAGS.search_db:
    model = deepnovo_model.ModelInference()
    model.build_model()
    worker_io = deepnovo_worker_io.WorkerIO(
        input_spectrum_file=deepnovo_config.db_input_spectrum_file,
        input_feature_file=deepnovo_config.db_input_feature_file,
        output_file=deepnovo_config.db_output_file)
    worker_db = deepnovo_worker_db.WorkerDB(
        db_fasta_file=deepnovo_config.db_fasta_file)
    worker_db.build_db()
    worker_db.search_db(model, worker_io)
  elif deepnovo_config.FLAGS.search_hybrid:
    model = deepnovo_model.ModelInference()
    model.build_model()
    # Stage 1: de novo search; its predictions seed the database search.
    worker_io = deepnovo_worker_io.WorkerIO(
        input_spectrum_file=deepnovo_config.hybrid_input_spectrum_file,
        input_feature_file=deepnovo_config.hybrid_input_feature_file,
        output_file=deepnovo_config.hybrid_denovo_file)
    worker_denovo = deepnovo_worker_denovo.WorkerDenovo()
    predicted_denovo_list = worker_denovo.search_denovo(model, worker_io)
    # Stage 2: database search on the same inputs, writing to the hybrid
    # output file and receiving predicted_denovo_list from stage 1.
    worker_io = deepnovo_worker_io.WorkerIO(
        input_spectrum_file=deepnovo_config.hybrid_input_spectrum_file,
        input_feature_file=deepnovo_config.hybrid_input_feature_file,
        output_file=deepnovo_config.hybrid_output_file)
    worker_db = deepnovo_worker_db.WorkerDB(
        db_fasta_file=deepnovo_config.hybrid_fasta_file)
    worker_db.build_db()
    worker_db.search_db(model, worker_io, predicted_denovo_list)
  elif deepnovo_config.FLAGS.test:
    # test 1%FDR
    #~ worker_db = deepnovo_worker_db.WorkerDB()
    #~ worker_db.build_db()
    #~ worker_test = deepnovo_worker_test.WorkerTest()
    #~ worker_test.test_accuracy(worker_db.peptide_list)
    worker_test = deepnovo_worker_test.WorkerTest()
    worker_test.test_accuracy()
  else:
    print("ERROR: wrong option!")
    # Non-zero status so that calling scripts can detect the failure.
    sys.exit(1)
86 | 
87 | 
# Script entry point: only runs when this file is executed directly.
# NOTE(review): tf.app.run() is presumably the TensorFlow 1.x launcher that
# parses the command-line flags and then calls main() -- confirm against the
# TensorFlow version this project pins.
if __name__ == "__main__":
  tf.app.run()
90 | 


--------------------------------------------------------------------------------
/aa_workflow_step_4_2.py:
--------------------------------------------------------------------------------
  1 | from __future__ import absolute_import
  2 | from __future__ import division
  3 | from __future__ import print_function
  4 | 
  5 | import csv
  6 | from Bio import SeqIO
  7 | from Bio.SeqIO import FastaIO
  8 | 
  9 | 
 10 | def drop_mod(peptide):
 11 |   peptide = peptide.replace("M(Oxidation)", "M")
 12 |   peptide = peptide.replace("N(Deamidation)", "N")
 13 |   peptide = peptide.replace("Q(Deamidation)", "Q")
 14 |   return peptide
 15 | 
 16 | 
 17 | def drop_mod_peaks(peptide):
 18 |   peptide = peptide.replace("M(+15.99)", "M")
 19 |   peptide = peptide.replace("N(+.98)", "N")
 20 |   peptide = peptide.replace("Q(+.98)", "Q")
 21 |   return peptide
 22 | 
 23 | 
 24 | def change_I_to_L(string):
 25 |     return string.replace('I', 'L')
 26 | 
 27 | 
 28 | def preprocess(denovo_file, db_fasta_file, labeled_feature_file, peptide_list_fasta):
 29 |   """Remove denovo peptides that exist in the database fasta file.
 30 |      Combine db and denovo into a peptide list file for PEAKS X DB search round 2.
 31 | 
 32 |      Usage:
 33 |        denovo_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus.minlen5.denovo_only"
 34 |        db_fasta_file = "data.fasta/uniprot_sprot.human.plus_contaminants.fasta"
 35 |        labeled_feature_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.labeled"
 36 |        peptide_list_fasta = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/aa_workflow.step_4.peptide_list.fasta"
 37 |   """
 38 | 
 39 |   print("".join(["="] * 80)) # section-separating line
 40 |   print("preprocess()")
 41 | 
 42 |   print("denovo_file =", denovo_file)
 43 |   print("db_fasta_file =", db_fasta_file)
 44 |   print("labeled_feature_file =", labeled_feature_file)
 45 |   print("peptide_list_fasta =", peptide_list_fasta)
 46 | 
 47 |   denovo_peptide_set = set()
 48 |   with open(denovo_file, 'r') as fr:
 49 |       reader = csv.reader(fr, delimiter='\t')
 50 |       names = next(reader)
 51 |       seq_index = names.index('predicted_sequence')
 52 |       for line in reader:
 53 |           if not line[seq_index]:
 54 |               continue
 55 |           peptide = line[seq_index]
 56 |           peptide = drop_mod(peptide)
 57 |           peptide = ''.join(peptide.split(','))
 58 |           if peptide in denovo_peptide_set:
 59 |               continue
 60 |           else:
 61 |               denovo_peptide_set.add(peptide)
 62 |   print("Number of top-scoring denovo peptides: {}".format(len(denovo_peptide_set)))
 63 | 
 64 |   with open(db_fasta_file, 'r') as input_fasta_handle:
 65 |       record_list = list(SeqIO.parse(input_fasta_handle, "fasta"))
 66 |       print("Number of protein sequences: ", len(record_list))
 67 |   human_protein_list = [str(record.seq) for record in record_list]
 68 | 
 69 |   # remove denovo peptides that exist in the database fasta file
 70 |   to_L_protein_list = [change_I_to_L(protein) for protein in human_protein_list]
 71 |   pure_denovo_seq_set = set()
 72 |   for i, peptide in enumerate(denovo_peptide_set):
 73 |       peptide_string = change_I_to_L(peptide)
 74 |       indb = False
 75 |       for protein in to_L_protein_list:
 76 |           if peptide_string in protein:
 77 |               indb = True
 78 |               break
 79 |       if not indb:
 80 |           pure_denovo_seq_set.add(peptide)
 81 |       if i % 1000 == 0:
 82 |           print("processing {}".format(i))
 83 |   print("Number of denovo peptides not in database: {}".format(len(pure_denovo_seq_set)))
 84 | 
 85 |   db_peptide_set = set()
 86 |   with open(labeled_feature_file, 'r') as input_handle:
 87 |     csv_reader = csv.DictReader(input_handle, delimiter=',')
 88 |     for row in csv_reader:
 89 |       peptide = drop_mod_peaks(row['seq'])
 90 |       db_peptide_set.add(peptide)
 91 | 
 92 |   with open(peptide_list_fasta, 'w') as output_handle:
 93 |     counter = 0
 94 |     for peptide in db_peptide_set:
 95 |       counter += 1
 96 |       output_handle.write(">DB|db_{}\n".format(counter))
 97 |       output_handle.write(peptide + '\n')
 98 |     counter = 0
 99 |     for peptide in pure_denovo_seq_set:
100 |       counter += 1
101 |       output_handle.write(">DENOVO|denovo_{}\n".format(counter))
102 |       output_handle.write(''.join(peptide) + '\n')
103 | 
104 |   num_db_peptides = len(db_peptide_set)
105 |   num_denovo_peptides = len(pure_denovo_seq_set)
106 |   print("num_db_peptides =", num_db_peptides)
107 |   print("num_denovo_peptides =", num_denovo_peptides)
108 | 
109 | 
def postprocess(psm_file, output_denovo_peptide_file):
  """Collect the de novo peptides among the PSMs of PEAKS X DB search round 2.

     A PSM row counts as de novo when its 'Accession' field equals 'DENOVO';
     the unique 'Peptide' values of those rows (mass-shift tags on M/N/Q
     dropped) are written one per line.

     Args:
       psm_file: comma-separated PSM file with 'Peptide' and 'Accession' columns.
       output_denovo_peptide_file: output path for the unique de novo peptides.

     Usage:
       psm_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/aa_workflow.step_4.psm.csv"
       output_denovo_peptide_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/aa_workflow.step_4.output_peptide_list"
  """

  print("=" * 80) # section-separating line
  print("postprocess()")

  print("psm_file =", psm_file)
  print("output_denovo_peptide_file =", output_denovo_peptide_file)

  num_denovo_psm = 0
  denovo_peptide_set = set()
  with open(psm_file, 'r') as psm_handle:
    for row in csv.DictReader(psm_handle, delimiter=','):
      peptide = drop_mod_peaks(row['Peptide'])
      if drop_mod_peaks(row['Accession']) != 'DENOVO':
        continue
      num_denovo_psm += 1
      denovo_peptide_set.add(peptide)

  with open(output_denovo_peptide_file, 'w') as output_handle:
    for peptide in denovo_peptide_set:
      output_handle.write(peptide + '\n')

  print("num_denovo_peptides =", len(denovo_peptide_set))
  print("num_denovo_psm =", num_denovo_psm)
142 | 


--------------------------------------------------------------------------------
/deepnovo_cython_modules.pyx:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Hieu Tran. All Rights Reserved.
  2 | #
  3 | # DeepNovo is publicly available for non-commercial uses.
  4 | # ==============================================================================
  5 | 
  6 | """TODO(nh2tran): docstring."""
  7 | 
  8 | from __future__ import absolute_import
  9 | from __future__ import division
 10 | from __future__ import print_function
 11 | 
 12 | import sys
 13 | 
 14 | import numpy as np
 15 | cimport numpy as np
 16 | cimport cython
 17 | 
 18 | import deepnovo_config
 19 | 
# Module-level copies of deepnovo_config values, read once when this module is
# imported. The `cdef` ones are stored as typed C variables (C int / C float)
# instead of Python objects.
mass_ID_np = deepnovo_config.mass_ID_np  # left as a Python object; added to prefix_mass in get_location()
cdef int GO_ID = deepnovo_config.GO_ID
cdef int EOS_ID = deepnovo_config.EOS_ID
cdef float mass_H2O = deepnovo_config.mass_H2O  # subtracted to form the "-H2O" ion masses
cdef float mass_NH3 = deepnovo_config.mass_NH3  # subtracted to form the "-NH3" ion masses
cdef float mass_H = deepnovo_config.mass_H
cdef int SPECTRUM_RESOLUTION = deepnovo_config.SPECTRUM_RESOLUTION  # multiplier turning a mass into a bin index
cdef int WINDOW_SIZE = deepnovo_config.WINDOW_SIZE  # number of bins copied around each ion location
cdef int vocab_size = deepnovo_config.vocab_size
cdef int num_ion = deepnovo_config.num_ion
cdef int neighbor_size = deepnovo_config.neighbor_size
cdef int MZ_SIZE = deepnovo_config.MZ_SIZE  # upper bound used for window locations in get_location()
 32 | 
 33 | 
 34 | @cython.boundscheck(False) # turn off bounds-checking
 35 | @cython.wraparound(False) # turn off negative index wrapping
 36 | cdef void copy_values(float[:,:,:] candidate_intensity_view, float[:,:] spectrum_view, int[:,:] location_sub, int i1, int i2):
 37 |   cdef int j
 38 |   cdef int neighbor
 39 |   cdef int i1_start = neighbor_size * i1
 40 |   for neighbor in range(neighbor_size):
 41 |     for j in range(WINDOW_SIZE):
 42 |       candidate_intensity_view[i2, i1_start + neighbor, j] = spectrum_view[neighbor, location_sub[i1, i2] + j]
 43 | 
 44 | 
 45 | @cython.boundscheck(False) # turn off bounds-checking
 46 | @cython.wraparound(False) # turn off negative index wrapping
 47 | def get_location(peptide_mass, prefix_mass, direction):
 48 |   if direction == 0:
 49 |     candidate_b_mass = prefix_mass + mass_ID_np
 50 |     candidate_y_mass = peptide_mass - candidate_b_mass
 51 |   elif direction == 1:
 52 |     candidate_y_mass = prefix_mass + mass_ID_np
 53 |     candidate_b_mass = peptide_mass - candidate_y_mass
 54 |   
 55 |   # b-ions
 56 |   candidate_b_H2O = candidate_b_mass - mass_H2O
 57 |   candidate_b_NH3 = candidate_b_mass - mass_NH3
 58 |   candidate_b_plus2_charge1 = ((candidate_b_mass + 2 * mass_H) / 2
 59 |                                - mass_H)
 60 | 
 61 |   # y-ions
 62 |   candidate_y_H2O = candidate_y_mass - mass_H2O
 63 |   candidate_y_NH3 = candidate_y_mass - mass_NH3
 64 |   candidate_y_plus2_charge1 = ((candidate_y_mass + 2 * mass_H) / 2
 65 |                                - mass_H)
 66 | 
 67 |   # ion_2
 68 |   #~   b_ions = [candidate_b_mass]
 69 |   #~   y_ions = [candidate_y_mass]
 70 |   #~   ion_mass_list = b_ions + y_ions
 71 | 
 72 |   # ion_8
 73 |   b_ions = [candidate_b_mass,
 74 |             candidate_b_H2O,
 75 |             candidate_b_NH3,
 76 |             candidate_b_plus2_charge1]
 77 |   y_ions = [candidate_y_mass,
 78 |             candidate_y_H2O,
 79 |             candidate_y_NH3,
 80 |             candidate_y_plus2_charge1]
 81 |   ion_mass_list = b_ions + y_ions
 82 |   ion_mass = np.array(ion_mass_list, dtype=np.float32)
 83 | 
 84 |   # ion locations
 85 |   location_sub50 = np.rint(ion_mass * SPECTRUM_RESOLUTION).astype(np.int32) # TODO(nh2tran): line-too-long
 86 |   # location_sub50 = np.int32(ion_mass * SPECTRUM_RESOLUTION)
 87 |   location_sub50 -= (WINDOW_SIZE // 2)
 88 |   location_plus50 = location_sub50 + WINDOW_SIZE
 89 |   ion_id_rows, aa_id_cols = np.nonzero(np.logical_and(
 90 |       location_sub50 >= 0,
 91 |       location_plus50 <= MZ_SIZE))
 92 |   return ion_id_rows, aa_id_cols, location_sub50, location_plus50
 93 | 
@cython.boundscheck(False) # turn off bounds-checking
@cython.wraparound(False) # turn off negative index wrapping
def get_candidate_intensity(float[:,:] spectrum_original, peptide_mass, prefix_mass, direction):
  """Extract the spectrum windows around every candidate ion.

  Args:
    spectrum_original: 2-D float memoryview; copy_values() reads
      `neighbor_size` rows from it.
    peptide_mass, prefix_mass, direction: forwarded unchanged to
      get_location().

  Returns:
    float32 array of shape (vocab_size, neighbor_size*num_ion, WINDOW_SIZE).
    Entries whose window does not fit in the spectrum, and the first three
    vocabulary ids, stay zero. If the maximum exceeds 1.0 the whole array is
    divided by that maximum.
  """
  ion_id_rows, aa_id_cols, location_sub50, location_plus50 = get_location(peptide_mass, prefix_mass, direction)
  # candidate_intensity
  candidate_intensity = np.zeros(shape=(vocab_size,
                                        neighbor_size*num_ion,
                                        WINDOW_SIZE),
                                 dtype=np.float32)
  # Typed memoryviews give copy_values() C-level access to the arrays.
  cdef int [:,:] location_sub50_view = location_sub50
  # NOTE(review): location_plus50_view is not used below.
  cdef int [:,:] location_plus50_view = location_plus50
  cdef float [:,:,:] candidate_intensity_view = candidate_intensity
  cdef int[:] row = ion_id_rows.astype(np.int32)
  cdef int[:] col = aa_id_cols.astype(np.int32)
  cdef int index
  # Only (ion, amino acid) pairs returned by get_location() are visited, i.e.
  # those whose window lies inside the spectrum.
  for index in range(ion_id_rows.size):
    # Skip vocabulary ids 0-2 (presumably the PAD/GO/EOS symbols -- confirm
    # against deepnovo_config).
    if col[index] < 3:
      continue
    copy_values(candidate_intensity_view, spectrum_original, location_sub50_view, row[index], col[index])
  # PAD/GO/EOS
  # candidate_intensity[deepnovo_config.PAD_ID].fill(0.0)
  # candidate_intensity[FIRST_LABEL].fill(0.0)
  # candidate_intensity[LAST_LABEL].fill(0.0)
  #~ b_ion_count = len(b_ions)
  #~ if (direction==0):
    #~ candidate_intensity[LAST_LABEL,b_ion_count:].fill(0.0)
  #~ elif (direction==1):
    #~ candidate_intensity[LAST_LABEL,:b_ion_count].fill(0.0)

  #~ for aa_id in ([LAST_LABEL] + range(3,deepnovo_config.vocab_size)):
    #~ for ion_id in range(deepnovo_config.num_ion):
      #~ location_sub50 = location_sub50_list[ion_id][aa_id]
      #~ #
      #~ if (location_sub50 > 0):
        #~ candidate_intensity[aa_id,ion_id] = spectrum_original[location_sub50:location_sub50+deepnovo_config.WINDOW_SIZE]

  # Normalization to [0, 1]: applied only when the maximum exceeds 1.0
  max_intensity = np.max(candidate_intensity)
  if max_intensity > 1.0:
    candidate_intensity /= max_intensity
  # Normalization to N(0,1): tf.image.per_image_whitening (disabled)
#~   adjusted_stddev = max(np.std(candidate_intensity), 1.0/math.sqrt(candidate_intensity.size))
#~   candidate_intensity = (candidate_intensity-np.mean(candidate_intensity)) / adjusted_stddev
  return candidate_intensity
139 | 
140 | 
def process_spectrum(spectrum_mz_list, spectrum_intensity_list, peptide_mass):
  """Turn a peak list into three fixed-length intensity vectors.

  Args:
    spectrum_mz_list: peak m/z values.
    spectrum_intensity_list: peak intensities, in the same order as
      spectrum_mz_list.
    peptide_mass: peptide mass used for the complement peaks and the
      terminal marker peaks.

  Returns:
    (spectrum_holder, spectrum_original_forward, spectrum_original_backward),
    each a float32 array of shape (1, deepnovo_config.MZ_SIZE).
    spectrum_holder holds the binned peaks, the complement peaks and the
    N-/C-terminal markers. The two "original" arrays are copies taken before
    the complement peaks are added, each with its own marker peaks.
  """

  # neutral mass, location, assuming ion charge z=1
  charge = 1.0
  spectrum_mz = np.array(spectrum_mz_list, dtype=np.float32)
  neutral_mass = spectrum_mz - charge*deepnovo_config.mass_H
  neutral_mass_location = np.rint(neutral_mass * deepnovo_config.SPECTRUM_RESOLUTION).astype(np.int32) # TODO(nh2tran): line-too-long
  cdef int [:] neutral_mass_location_view = neutral_mass_location

  # intensity
  spectrum_intensity = np.array(spectrum_intensity_list, dtype=np.float32)
  # log-transform
#~   spectrum_intensity = np.log(spectrum_intensity)
  # find max intensity value for normalization and to assign to special locations
  spectrum_intensity_max = np.max(spectrum_intensity)
  # no normalization for each individual spectrum, we'll do it for multi-spectra
#~   norm_intensity = spectrum_intensity / spectrum_intensity_max
  norm_intensity = spectrum_intensity
  cdef float [:] norm_intensity_view = norm_intensity

  # fill spectrum holders
  spectrum_holder = np.zeros(shape=(1, deepnovo_config.MZ_SIZE), dtype=np.float32)
  # writes through spectrum_holder_view modify spectrum_holder itself
  cdef float [:,:] spectrum_holder_view = spectrum_holder
  # note that different peaks may fall into the same location; the active
  # code keeps the maximum intensity per location (the "+=" variant is the
  # commented-out line below)
  # NOTE(review): locations are not range-checked against MZ_SIZE here --
  # confirm that callers filter out peaks outside the covered mass range.
  cdef int index
  for index in range(neutral_mass_location.size):
#~     spectrum_holder_view[neutral_mass_location_view[index]] += norm_intensity_view[index] # TODO(nh2tran): line-too-long
    spectrum_holder_view[0, neutral_mass_location_view[index]] = max(spectrum_holder_view[0, neutral_mass_location_view[index]], # TODO(nh2tran): line-too-long
                                                                     norm_intensity_view[index]) # TODO(nh2tran): line-too-long
  # snapshots taken before the complement peaks are added
  spectrum_original_forward = np.copy(spectrum_holder)
  spectrum_original_backward = np.copy(spectrum_holder)

  # add complement: each peak is mirrored to (peptide_mass - neutral_mass) and
  # its intensity is accumulated (+=) there; only positive locations are used
  complement_mass = peptide_mass - neutral_mass
  complement_mass_location = np.rint(complement_mass * deepnovo_config.SPECTRUM_RESOLUTION).astype(np.int32) # TODO(nh2tran): line-too-long
  cdef int [:] complement_mass_location_view = complement_mass_location
#~   cdef int index
  for index in np.nonzero(complement_mass_location > 0)[0]:
    spectrum_holder_view[0, complement_mass_location_view[index]] += norm_intensity_view[index] # TODO(nh2tran): line-too-long

  # peptide_mass: marker peak (at the spectrum's maximum intensity) in both
  # "original" arrays
  spectrum_original_forward[0, int(round(peptide_mass * deepnovo_config.SPECTRUM_RESOLUTION))] = spectrum_intensity_max # 1.0 # TODO(nh2tran): line-too-long
  spectrum_original_backward[0, int(round(peptide_mass * deepnovo_config.SPECTRUM_RESOLUTION))] = spectrum_intensity_max # 1.0 # TODO(nh2tran): line-too-long

  # N-terminal, b-ion, peptide_mass_C
  # append N-terminal
  mass_N = deepnovo_config.mass_N_terminus - deepnovo_config.mass_H
  spectrum_holder[0, int(round(mass_N * deepnovo_config.SPECTRUM_RESOLUTION))] = spectrum_intensity_max # 1.0
  # append peptide_mass_C
  mass_C = deepnovo_config.mass_C_terminus + deepnovo_config.mass_H
  peptide_mass_C = peptide_mass - mass_C
  spectrum_holder[0, int(round(peptide_mass_C * deepnovo_config.SPECTRUM_RESOLUTION))] = spectrum_intensity_max # 1.0 # TODO(nh2tran): line-too-long
  spectrum_original_forward[0, int(round(peptide_mass_C * deepnovo_config.SPECTRUM_RESOLUTION))] = spectrum_intensity_max # 1.0 # TODO(nh2tran): line-too-long

  # C-terminal, y-ion, peptide_mass_N
  # append C-terminal (mass_C / mass_N are recomputed with the same values as above)
  mass_C = deepnovo_config.mass_C_terminus + deepnovo_config.mass_H
  spectrum_holder[0, int(round(mass_C * deepnovo_config.SPECTRUM_RESOLUTION))] = spectrum_intensity_max # 1.0
  # append peptide_mass_N
  mass_N = deepnovo_config.mass_N_terminus - deepnovo_config.mass_H
  peptide_mass_N = peptide_mass - mass_N
  spectrum_holder[0, int(round(peptide_mass_N * deepnovo_config.SPECTRUM_RESOLUTION))] = spectrum_intensity_max# 1.0 # TODO(nh2tran): line-too-long
  spectrum_original_backward[0, int(round(peptide_mass_N * deepnovo_config.SPECTRUM_RESOLUTION))] = spectrum_intensity_max # 1.0 # TODO(nh2tran): line-too-long

  return spectrum_holder, spectrum_original_forward, spectrum_original_backward
207 | 


--------------------------------------------------------------------------------
/deepnovo_preprocess.py:
--------------------------------------------------------------------------------
  1 | from __future__ import absolute_import
  2 | from __future__ import division
  3 | from __future__ import print_function
  4 | 
  5 | import math
  6 | import os
  7 | import random
  8 | import sys
  9 | import time
 10 | import re
 11 | 
 12 | import csv
 13 | import numpy as np
 14 | random.seed(0)
 15 | np.random.seed(0)
 16 | 
 17 | from Bio import SeqIO
 18 | from Bio.SeqIO import FastaIO
 19 | 
 20 | import deepnovo_config
 21 | 
 22 | 
 23 | 
 24 | 
 25 | 
 26 | 
 27 | # write multi-line fasta file into single-line format
 28 | def write_fasta_1line(input_fasta_file, output_fasta_file):
 29 |   with open(input_fasta_file, "r")  as handle:
 30 |     record_list = list(SeqIO.parse(handle, "fasta"))
 31 |     print(input_fasta_file)
 32 |     print("Number of protein sequences: ", len(record_list))
 33 |   with open(output_fasta_file, "w") as handle:
 34 |     fasta_writer = FastaIO.FastaWriter(handle, wrap=None)
 35 |     fasta_writer.write_file(record_list)
 36 | 
 37 | # ~ input_fasta_file = "data/uniprot.human_all_isoforms.fasta"
 38 | # ~ output_fasta_file = input_fasta_file + ".1line"
 39 | # ~ write_fasta_1line(input_fasta_file, output_fasta_file)
 40 | 
 41 | 
 42 | # randomly split a feature file into train/valid/test files for training
 43 | def split_feature_training(input_feature_file, proportion):
 44 |   print("split_feature_training()")
 45 | 
 46 |   print("input_feature_file = ", input_feature_file)
 47 |   print("proportion = ", proportion)
 48 | 
 49 |   output_file_train = input_feature_file + ".train"
 50 |   output_file_valid = input_feature_file + ".valid"
 51 |   output_file_test = input_feature_file + ".test"
 52 |   print("output_file_train =", output_file_train)
 53 |   print("output_file_valid =", output_file_valid)
 54 |   print("output_file_test =", output_file_test)
 55 | 
 56 | 
 57 |   num_total = 0
 58 |   num_train = 0
 59 |   num_valid = 0
 60 |   num_test = 0
 61 | 
 62 |   # read and write header line
 63 |   csv_reader = csv.DictReader(open(input_feature_file))
 64 |   csv_writer_train = csv.DictWriter(open(output_file_train, mode='w'), csv_reader.fieldnames)
 65 |   csv_writer_valid = csv.DictWriter(open(output_file_valid, mode='w'), csv_reader.fieldnames)
 66 |   csv_writer_test = csv.DictWriter(open(output_file_test, mode='w'), csv_reader.fieldnames)
 67 |   csv_writer_train.writeheader()
 68 |   csv_writer_valid.writeheader()
 69 |   csv_writer_test.writeheader()
 70 | 
 71 |   # iterate over feature rows
 72 |   # use random numbers 0/1/2 to assign rows to writers train/valid/test
 73 |   for row in csv_reader:
 74 |     num_total += 1
 75 |     random_num = np.random.choice(a=3, size=1, p=proportion)
 76 |     if random_num == 0:
 77 |       csv_writer = csv_writer_train
 78 |       num_train += 1
 79 |     elif random_num == 1:
 80 |       csv_writer = csv_writer_valid
 81 |       num_valid += 1
 82 |     else:
 83 |       csv_writer = csv_writer_test
 84 |       num_test += 1
 85 |     csv_writer.writerow(row)
 86 | 
 87 |   print("num_total =", num_total)
 88 |   print("num_train =", num_train)
 89 |   print("num_valid =", num_valid)
 90 |   print("num_test =", num_test)
 91 | 
 92 | # ~ input_feature_file = "data.training/aa.hla.bassani.nature_2016.mel_15/feature.csv.labeled.mass_corrected"
 93 | # ~ proportion = [0.90, 0.05, 0.05]
 94 | # ~ split_feature_training(input_feature_file, proportion)
 95 | 
 96 | 
 97 | # randomly split a feature file into train/valid/test files for training
 98 | # train/valid/test do NOT SHARE PEPTIDES
 99 | def split_feature_training_noshare(input_feature_file, proportion):
100 |   """Randomly split a feature file into train/valid/test files for training.
101 |      train/valid/test do NOT SHARE PEPTIDES.
102 | 
103 |      Usage:
104 |        input_feature_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.labeled.mass_corrected"
105 |        proportion = [0.90, 0.05, 0.05]
106 |        split_feature_training_noshare(input_feature_file, proportion)
107 |   """
108 | 
109 |   print("split_feature_training_noshare()")
110 | 
111 |   print("input_feature_file = ", input_feature_file)
112 |   print("proportion = ", proportion)
113 | 
114 |   output_file_train = input_feature_file + ".train" + ".noshare"
115 |   output_file_valid = input_feature_file + ".valid" + ".noshare"
116 |   output_file_test = input_feature_file + ".test" + ".noshare"
117 |   print("output_file_train =", output_file_train)
118 |   print("output_file_valid =", output_file_valid)
119 |   print("output_file_test =", output_file_test)
120 | 
121 |   num_total = 0
122 |   num_unique = 0
123 |   num_train = 0
124 |   num_valid = 0
125 |   num_test = 0
126 | 
127 |   peptide_train_list = []
128 |   peptide_valid_list = []
129 |   peptide_test_list = []
130 | 
131 |   # read and write header line
132 |   csv_reader = csv.DictReader(open(input_feature_file))
133 |   csv_writer_train = csv.DictWriter(open(output_file_train, mode='w'), csv_reader.fieldnames)
134 |   csv_writer_valid = csv.DictWriter(open(output_file_valid, mode='w'), csv_reader.fieldnames)
135 |   csv_writer_test = csv.DictWriter(open(output_file_test, mode='w'), csv_reader.fieldnames)
136 |   csv_writer_train.writeheader()
137 |   csv_writer_valid.writeheader()
138 |   csv_writer_test.writeheader()
139 | 
140 |   # iterate over feature rows
141 |   # if the peptide already exists, use the corresponding writer
142 |   # if not, use random numbers 0/1/2 to assign writers train/valid/test
143 |   for row in csv_reader:
144 |     num_total += 1
145 |     peptide = row['seq']
146 |     if (peptide in peptide_train_list):
147 |       csv_writer = csv_writer_train
148 |       num_train += 1
149 |     elif (peptide in peptide_valid_list):
150 |       csv_writer = csv_writer_valid
151 |       num_valid += 1
152 |     elif (peptide in peptide_test_list):
153 |       csv_writer = csv_writer_test
154 |       num_test += 1
155 |     else:
156 |       num_unique += 1
157 |       random_num = np.random.choice(a=3, size=1, p=proportion)
158 |       if random_num == 0:
159 |         peptide_train_list.append(peptide)
160 |         csv_writer = csv_writer_train
161 |         num_train += 1
162 |       elif random_num == 1:
163 |         peptide_valid_list.append(peptide)
164 |         csv_writer = csv_writer_valid
165 |         num_valid += 1
166 |       else:
167 |         peptide_test_list.append(peptide)
168 |         csv_writer = csv_writer_test
169 |         num_test += 1
170 |     csv_writer.writerow(row)
171 | 
172 |   print("num_total =", num_total)
173 |   print("num_unique =", num_unique)
174 |   print("num_train =", num_train)
175 |   print("num_valid =", num_valid)
176 |   print("num_test =", num_test)
177 | 
178 | 
179 | # calculate peptide mass = N-terminus + amino acids + C-terminus
def compute_peptide_mass(peptide):
  """Return the mass of `peptide`: N-terminus + amino acids + C-terminus.

  `peptide` is iterated item by item and each item is looked up in
  deepnovo_config.mass_AA.
  """

  n_terminus_mass = deepnovo_config.mass_N_terminus
  residue_mass = sum(deepnovo_config.mass_AA[aa] for aa in peptide)
  return n_terminus_mass + residue_mass + deepnovo_config.mass_C_terminus
189 | 
190 | # ~ peptide = 'AAAAAAALQAK'
191 | # ~ print(compute_peptide_mass(peptide))
192 | 
193 | 
194 | # parse peptide sequence with modifications
195 | # C(+57.02) >> C(Carbamidomethylation)
196 | # M(+15.99) >> M(Oxidation)
197 | # NQ(+.98) >> NQ(Deamidation)
def parse_sequence_with_mod(raw_sequence):
  """Split a raw peptide string into a list of (possibly modified) residues.

  Each character becomes one list item. A supported mass-shift tag directly
  after its residue is merged into that item under its modification name:
    C(+57.02) >> C(Carbamidomethylation)
    M(+15.99) >> M(Oxidation)
    N(+.98)   >> N(Deamidation)
    Q(+.98)   >> Q(Deamidation)

  Any other "(" -- an unsupported tag, a tag on the wrong residue, a second
  tag on the same residue, or a tag with no residue before it -- is reported
  as an unknown modification and the process exits via sys.exit().

  Args:
    raw_sequence: peptide string, e.g. 'RHM(+15.99)GIGKR'.

  Returns:
    List of residue strings, e.g.
    ['R', 'H', 'M(Oxidation)', 'G', 'I', 'G', 'K', 'R'].
  """

  # (residue, mass-shift tag, replacement token)
  known_modifications = (
      ("C", "(+57.02)", "C(Carbamidomethylation)"),
      ("M", "(+15.99)", "M(Oxidation)"),
      ("N", "(+.98)", "N(Deamidation)"),
      ("Q", "(+.98)", "Q(Deamidation)"),
  )

  raw_sequence_len = len(raw_sequence)
  index = 0
  peptide = []
  while index < raw_sequence_len:
    if raw_sequence[index] != "(":
      peptide.append(raw_sequence[index])
      index += 1
      continue

    # A tag at the very start has no residue to attach to; previously this
    # raised IndexError on peptide[-1], now it takes the error path below.
    previous_residue = peptide[-1] if peptide else None
    for residue, mass_tag, modified_residue in known_modifications:
      if previous_residue == residue and raw_sequence.startswith(mass_tag, index):
        peptide[-1] = modified_residue
        index += len(mass_tag)
        break
    else:  # unknown modification
      print("ERROR: unknown modification!")
      print("raw_sequence = ", raw_sequence)
      sys.exit()

  return peptide
227 | 
228 | # ~ raw_sequence = 'RHM(+15.99)GIGKR'
229 | # ~ print(parse_sequence_with_mod(raw_sequence))
230 | 
231 | 
232 | # calculate ppm of precursor_mz against peptide_mz
233 | # ppm / 1e6 = (precursor_mz - peptide_mz) / peptide_mz 
def calculate_mass_shift_ppm(input_feature_file):
  """Calculate the mean ppm of precursor_mz against peptide_mz.

     For every row: ppm / 1e6 = (precursor_mz - peptide_mz) / peptide_mz,
     where peptide_mz is computed from the parsed 'seq' column and the
     charge in the 'z' column.

     Usage:
       input_feature_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.labeled"
       ppm = calculate_mass_shift_ppm(input_feature_file)

     Args:
       input_feature_file: path of a csv file with columns 'seq', 'm/z', 'z'.

     Returns:
       The mean of the per-row ppm values, as returned by np.mean.
  """

  print("calculate_mass_shift_ppm()")

  print("input_feature_file = ", input_feature_file)

  precursor_ppm_list = []
  # Context manager so the input file is closed even if a row fails to parse.
  with open(input_feature_file) as input_handle:
    csv_reader = csv.DictReader(input_handle)
    for row in csv_reader:
      peptide = parse_sequence_with_mod(row['seq'])
      precursor_mz = float(row['m/z'])
      precursor_charge = float(row['z'])
      peptide_mass = compute_peptide_mass(peptide)
      peptide_mz = (peptide_mass + precursor_charge * deepnovo_config.mass_H) / precursor_charge
      precursor_ppm = (precursor_mz - peptide_mz) / peptide_mz * 1e6
      precursor_ppm_list.append(precursor_ppm)
  mean_precursor_ppm = np.mean(precursor_ppm_list)

  print("mean_precursor_ppm =", mean_precursor_ppm)
  return mean_precursor_ppm
261 | 
262 | 
263 | # correct precursor_mz given ppm
264 | # corrected_mz = precursor_mz / (1 + ppm / 1e6)
def correct_mass_shift_ppm(input_feature_file, ppm):
  """Correct precursor_mz given ppm: corrected_mz = precursor_mz / (1 + ppm / 1e6).

     Writes a copy of the feature file, with the 'm/z' column corrected, to
     input_feature_file + ".mass_corrected". All other columns are copied.

     Usage:
       input_feature_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv"
       correct_mass_shift_ppm(input_feature_file, ppm)

     Args:
       input_feature_file: path of a csv file that has an 'm/z' column.
       ppm: mass shift in parts per million.
  """

  print("correct_mass_shift_ppm()")

  print("input_feature_file = ", input_feature_file)
  print("ppm =", ppm)

  output_feature_file = input_feature_file + ".mass_corrected"
  print("output_feature_file =", output_feature_file)

  # The divisor is the same for every row.
  correction_factor = 1 + ppm / 1e6

  # Context managers guarantee both files are closed (and the output is
  # flushed to disk) when the function returns or raises.
  with open(input_feature_file) as input_handle:
    csv_reader = csv.DictReader(input_handle)
    with open(output_feature_file, mode='w') as output_handle:
      csv_writer = csv.DictWriter(output_handle, csv_reader.fieldnames)
      csv_writer.writeheader()
      for row in csv_reader:
        row['m/z'] = float(row['m/z']) / correction_factor
        csv_writer.writerow(row)
289 | 
290 | 
291 | # split a feature file into labeled and unlabeled files
def split_feature_unlabel(input_feature_file):
  """Split a feature file into labeled and unlabeled files.

     Rows whose 'seq' column is empty go to input_feature_file + ".unlabeled".
     All other rows go to input_feature_file + ".labeled". Both outputs get
     the header of the input file.

     Usage:
       input_feature_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv"
       split_feature_unlabel(input_feature_file)

     Args:
       input_feature_file: path of a csv file that has a 'seq' column.
  """

  print('=' * 80) # section-separating line
  print("split_feature_unlabel()")
  print("input_feature_file =", input_feature_file)

  output_file_labeled = input_feature_file + ".labeled"
  output_file_unlabeled = input_feature_file + ".unlabeled"
  print("output_file_labeled =", output_file_labeled)
  print("output_file_unlabeled =", output_file_unlabeled)

  num_labeled = 0
  num_unlabeled = 0

  # Context managers guarantee all three files are closed (and the outputs
  # flushed) when the function returns or raises.
  with open(input_feature_file) as input_handle:
    csv_reader = csv.DictReader(input_handle)
    with open(output_file_labeled, mode='w') as labeled_handle:
      with open(output_file_unlabeled, mode='w') as unlabeled_handle:
        # read and write header line
        csv_writer_labeled = csv.DictWriter(labeled_handle, csv_reader.fieldnames)
        csv_writer_unlabeled = csv.DictWriter(unlabeled_handle, csv_reader.fieldnames)
        csv_writer_labeled.writeheader()
        csv_writer_unlabeled.writeheader()

        # iterate over feature rows
        # unlabeled features have empty peptide sequence
        for row in csv_reader:
          if row['seq'] == '':
            csv_writer_unlabeled.writerow(row)
            num_unlabeled += 1
          else:
            csv_writer_labeled.writerow(row)
            num_labeled += 1

  print("num_labeled =", num_labeled)
  print("num_unlabeled =", num_unlabeled)
333 | 
334 | 
335 | # merge multiple mgf files into one, adding fraction ID to scan ID
def merge_mgf_file(input_file_list, fraction_list, output_file):
  """Concatenate several mgf files, tagging every scan id with its fraction.

     Each input line that contains "SCANS=" is rewritten as
     "SCANS=F<fraction>:<scan>". Every other line is copied unchanged.
     Files and fractions are paired by position.

     Usage:
       folder_path = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/"
       fraction_list = range(0, 10+1)
       merge_mgf_file(
           input_file_list=[folder_path + "export_" + str(i) + ".mgf" for i in fraction_list],
           fraction_list=fraction_list,
           output_file=folder_path + "spectrum.mgf")
  """

  print("merge_mgf_file()")

  num_spectra = 0
  with open(output_file, mode="w") as merged_handle:
    for mgf_path, fraction_id in zip(input_file_list, fraction_list):
      print("input_file = ", os.path.join(mgf_path))
      with open(mgf_path, mode="r") as mgf_handle:
        for mgf_line in mgf_handle:
          if "SCANS=" not in mgf_line:
            # not a scan line: copy through untouched
            merged_handle.write(mgf_line)
            continue
          # a spectrum found: prefix its scan id with the fraction id
          num_spectra += 1
          scan_id = re.split('=|\n|\r', mgf_line)[1]
          merged_handle.write("SCANS=F{0}:{1}\n".format(fraction_id, scan_id))
  print("output_file = {0:s}".format(output_file))
  print("counter = {0:d}".format(num_spectra))
366 | 
367 | 
368 | # merge multiple feature files into one, adding fraction ID to feature & scan ID
def merge_feature_file(input_file_list, fraction_list, output_file):
  """Merge multiple feature files into one, adding fraction ID to feature & scan ID.

     The output header is the header of the first input file. In every row,
     'spec_group_id' and each ';'-separated entry of 'scans' get the prefix
     "F<fraction>:". Files and fractions are paired by position.

     Usage:
       folder_path = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/"
       fraction_list = range(0, 10+1)
       merge_feature_file(
           input_file_list=[folder_path + "export_" + str(i) + ".csv" for i in fraction_list],
           fraction_list=fraction_list,
           output_file=folder_path + "feature.csv")

     Args:
       input_file_list: non-empty list of csv paths with 'spec_group_id'
         and 'scans' columns.
       fraction_list: fraction ids, one per input file.
       output_file: path of the merged csv file to write.
  """

  print("merge_feature_file()")

  # read header line from the first file; the handle is closed right away
  with open(input_file_list[0]) as header_handle:
    fieldnames = csv.DictReader(header_handle).fieldnames

  counter = 0
  # Context managers guarantee the output is flushed and every input file
  # is closed as soon as it has been consumed.
  with open(output_file, mode='w') as output_handle:
    csv_writer = csv.DictWriter(output_handle, fieldnames)
    csv_writer.writeheader()

    # iterate over feature files and their rows
    for input_file, fraction in zip(input_file_list, fraction_list):
      print("input_file = ", os.path.join(input_file))
      fraction_prefix = "F" + str(fraction) + ":"
      with open(input_file) as input_handle:
        for row in csv.DictReader(input_handle):
          counter += 1
          # add fraction to feature id
          row['spec_group_id'] = fraction_prefix + row['spec_group_id']
          # add fraction to every scan id
          row['scans'] = ";".join(
              fraction_prefix + scan for scan in row['scans'].split(';'))
          csv_writer.writerow(row)
  print("output_file = {0:s}".format(output_file))
  print("counter = {0:d}".format(counter))
407 | 
408 | 
409 | 


--------------------------------------------------------------------------------
/deepnovo_config.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Hieu Tran. All Rights Reserved.
  2 | #
  3 | # DeepNovo is publicly available for non-commercial uses.
  4 | # ==============================================================================
  5 | 
  6 | """TODO(nh2tran): docstring."""
  7 | 
  8 | from __future__ import absolute_import
  9 | from __future__ import division
 10 | from __future__ import print_function
 11 | 
 12 | import numpy as np
 13 | import tensorflow as tf
 14 | 
 15 | 
 16 | # ==============================================================================
 17 | # FLAGS (options) for this app
 18 | # ==============================================================================
 19 | 
 20 | 
 21 | tf.app.flags.DEFINE_string("train_dir", # flag_name
 22 |                            "train", # default_value
 23 |                            "Training directory.") # docstring
 24 | 
 25 | tf.app.flags.DEFINE_boolean("reset_step",
 26 |                             False, # default_value
 27 |                             "Set to true to reset the global step after loading a pretrained model.")
 28 | 
 29 | tf.app.flags.DEFINE_integer("direction",
 30 |                             2,
 31 |                             "Set to 0/1/2 for Forward/Backward/Bi-directional.")
 32 | 
 33 | tf.app.flags.DEFINE_boolean("use_intensity",
 34 |                             True,
 35 |                             "Set to True to use intensity-model.")
 36 | 
 37 | tf.app.flags.DEFINE_boolean("shared",
 38 |                             False,
 39 |                             "Set to True to use shared weights.")
 40 | 
 41 | tf.app.flags.DEFINE_boolean("use_lstm",
 42 |                             True,
 43 |                             "Set to True to use lstm-model.")
 44 | 
 45 | tf.app.flags.DEFINE_boolean("lstm_kmer",
 46 |                             False,
 47 |                             "Set to True to use lstm model on k-mers instead of full sequence.")
 48 | 
 49 | tf.app.flags.DEFINE_boolean("knapsack_build",
 50 |                             False,
 51 |                             "Set to True to build knapsack matrix.")
 52 | 
 53 | tf.app.flags.DEFINE_boolean("train",
 54 |                             False,
 55 |                             "Set to True for training.")
 56 | 
 57 | tf.app.flags.DEFINE_boolean("test_true_feeding",
 58 |                             False,
 59 |                             "Set to True for testing.")
 60 | 
 61 | tf.app.flags.DEFINE_boolean("decode",
 62 |                             False,
 63 |                             "Set to True for decoding.")
 64 | 
 65 | tf.app.flags.DEFINE_boolean("beam_search",
 66 |                             False,
 67 |                             "Set to True for beam search.")
 68 | 
 69 | tf.app.flags.DEFINE_integer("beam_size",
 70 |                             5,
 71 |                             "Number of optimal paths to search during decoding.")
 72 | 
 73 | tf.app.flags.DEFINE_boolean("search_db",
 74 |                             False,
 75 |                             "Set to True to do a database search.")
 76 | 
 77 | tf.app.flags.DEFINE_boolean("search_denovo",
 78 |                             False,
 79 |                             "Set to True to do a denovo search.")
 80 | 
 81 | tf.app.flags.DEFINE_boolean("search_hybrid",
 82 |                             False,
 83 |                             "Set to True to do a hybrid, db+denovo, search.")
 84 | 
 85 | tf.app.flags.DEFINE_boolean("test",
 86 |                             False,
 87 |                             "Set to True to test the prediction accuracy.")
 88 | 
 89 | tf.app.flags.DEFINE_boolean("header_seq",
 90 |                             True,
 91 |                             "Set to False if peptide sequence is not provided.")
 92 | 
 93 | tf.app.flags.DEFINE_boolean("decoy",
 94 |                             False,
 95 |                             "Set to True to search decoy database.")
 96 | 
 97 | tf.app.flags.DEFINE_integer("multiprocessor",
 98 |                             1,
 99 |                             "Use multi processors to read data during training.")
100 | 
101 | 
102 | # I/O arguments
103 | tf.app.flags.DEFINE_string("train_spectrum",
104 |                            "train_spectrum",
105 |                            "Spectrum mgf file to train a new model.")
106 | tf.app.flags.DEFINE_string("train_feature",
107 |                            "train_feature",
108 |                            "Feature csv file to train a new model.")
109 | tf.app.flags.DEFINE_string("valid_spectrum",
110 |                            "valid_spectrum",
111 |                            "Spectrum mgf file for validation during training.")
112 | tf.app.flags.DEFINE_string("valid_feature",
113 |                            "valid_feature",
114 |                            "Feature csv file for validation during training.")
115 | tf.app.flags.DEFINE_string("test_spectrum",
116 |                            "test_spectrum",
117 |                            "Spectrum mgf file for testing.")
118 | tf.app.flags.DEFINE_string("test_feature",
119 |                            "test_feature",
120 |                            "Feature csv file for testing.")
121 | tf.app.flags.DEFINE_string("denovo_spectrum",
122 |                            "denovo_spectrum",
123 |                            "Spectrum mgf file to perform de novo sequencing.")
124 | tf.app.flags.DEFINE_string("denovo_feature",
125 |                            "denovo_feature",
126 |                            "Feature csv file to perform de novo sequencing.")
127 | tf.app.flags.DEFINE_string("target_file",
128 |                            "target_file",
129 |                            "Target file to calculate the prediction accuracy.")
130 | tf.app.flags.DEFINE_string("predicted_file",
131 |                            "predicted_file",
132 |                            "Predicted file to calculate the prediction accuracy.")
133 | 
134 | 
135 | FLAGS = tf.app.flags.FLAGS
136 | 
137 | 
138 | # ==============================================================================
139 | # GLOBAL VARIABLES for VOCABULARY
140 | # ==============================================================================
141 | 
142 | 
143 | # Special vocabulary symbols - we always put them at the start.
# Special vocabulary symbols and their fixed ids; they occupy the first
# three positions of the vocabulary.
_PAD, _GO, _EOS = "_PAD", "_GO", "_EOS"
_START_VOCAB = [_PAD, _GO, _EOS]
PAD_ID, GO_ID, EOS_ID = 0, 1, 2

# id -> token. 'C(Carbamidomethylation)' is deliberately left out (it was
# commented out after 'C' in the original list).
vocab_reverse = _START_VOCAB + [
    'A', 'R', 'N', 'N(Deamidation)', 'D', 'C',
    'E', 'Q', 'Q(Deamidation)', 'G', 'H', 'I',
    'L', 'K', 'M', 'M(Oxidation)', 'F', 'P',
    'S', 'T', 'W', 'Y', 'V',
]
print("vocab_reverse ", vocab_reverse)

# token -> id
vocab = {token: token_id for token_id, token in enumerate(vocab_reverse)}
print("vocab ", vocab)

vocab_size = len(vocab_reverse)
print("vocab_size ", vocab_size)
187 | 
188 | 
189 | # ==============================================================================
190 | # GLOBAL VARIABLES for THEORETICAL MASS
191 | # ==============================================================================
192 | 
193 | 
# Masses of small groups and of the peptide termini.
mass_H = 1.0078
mass_H2O = 18.0106
mass_NH3 = 17.0265
mass_N_terminus = 1.0078
mass_C_terminus = 17.0027
mass_CO = 27.9949

# Mass of the three special symbols.
mass_AA = {
    '_PAD': 0.0,
    '_GO': mass_N_terminus-mass_H,
    '_EOS': mass_C_terminus+mass_H,
}
# Residue mass of every amino acid token, modified forms included.
# 'C(Carbamidomethylation)' is disabled; the values previously noted for it
# were 160.03065 for C(+57.02) and 161.01919 for C(+58.01) ("orbi").
mass_AA.update({
    'A': 71.03711,
    'R': 156.10111,
    'N': 114.04293,
    'N(Deamidation)': 115.02695,
    'D': 115.02694,
    'C': 103.00919,
    'E': 129.04259,
    'Q': 128.05858,
    'Q(Deamidation)': 129.0426,
    'G': 57.02146,
    'H': 137.05891,
    'I': 113.08406,
    'L': 113.08406,
    'K': 128.09496,
    'M': 131.04049,
    'M(Oxidation)': 147.0354,
    'F': 147.06841,
    'P': 97.05276,
    'S': 87.03203,
    'T': 101.04768,
    'W': 186.07931,
    'Y': 163.06333,
    'V': 99.06841,
})

# Masses indexed by vocabulary id, as a list and as a float32 array.
mass_ID = [mass_AA[token] for token in vocab_reverse]
mass_ID_np = np.array(mass_ID, dtype=np.float32)

# Mass of 'G' (57.02146), kept as the minimum amino acid mass.
mass_AA_min = mass_AA["G"]
235 | 
236 | 
237 | # ==============================================================================
238 | # GLOBAL VARIABLES for PRECISION, RESOLUTION, temp-Limits of MASS & LEN
239 | # ==============================================================================
240 | 
241 | 
242 | # if change, need to re-compile cython_speedup << NO NEED
243 | # ~ SPECTRUM_RESOLUTION = 10 # bins for 1.0 Da = precision 0.1 Da
244 | # ~ SPECTRUM_RESOLUTION = 20 # bins for 1.0 Da = precision 0.05 Da
245 | # ~ SPECTRUM_RESOLUTION = 40 # bins for 1.0 Da = precision 0.025 Da
# Number of bins per 1.0 Da; 50 bins means a precision of 0.02 Da.
# Values noted as alternatives: 10 (0.1 Da), 20 (0.05 Da), 40 (0.025 Da),
# 100 (0.01 Da).
# NOTE: an older remark asked to re-compile cython_speedup after changing
# this value; it has since been marked "NO NEED".
SPECTRUM_RESOLUTION = 50
print("SPECTRUM_RESOLUTION ", SPECTRUM_RESOLUTION)

# Window of 10 bins. The same "NO NEED" remark about re-compiling applies.
WINDOW_SIZE = 10
print("WINDOW_SIZE ", WINDOW_SIZE)

# Peptides with mass > MZ_MAX are meant to be skipped.
MZ_MAX = 3000.0
# Number of bins up to MZ_MAX: 3000 * 50 = 150000 with the values above.
MZ_SIZE = int(MZ_MAX * SPECTRUM_RESOLUTION)

# Knapsack masses are integers in units of 1 / KNAPSACK_AA_RESOLUTION Da
# (0.0001 Da).
KNAPSACK_AA_RESOLUTION = 10000
# Minimum amino acid mass (57.02146 Da) in knapsack units.
mass_AA_min_round = int(round(mass_AA_min * KNAPSACK_AA_RESOLUTION))
# 100 knapsack units = 0.01 Da.
KNAPSACK_MASS_PRECISION_TOLERANCE = 100
num_position = 0

PRECURSOR_MASS_PRECISION_TOLERANCE = 0.01

# ONLY for accuracy evaluation.
AA_MATCH_PRECISION = 0.1

# Only defined during training or test_true_feeding:
#   peptides longer than MAX_LEN are meant to be skipped;
#   peptides are assigned to buckets of the same length for efficient padding.
if FLAGS.train or FLAGS.test_true_feeding:
  MAX_LEN = 30
  _buckets = [12, 22, 32]
  print("MAX_LEN ", MAX_LEN)
  print("_buckets ", _buckets)
278 | 
279 | 
280 | # ==============================================================================
281 | # HYPER-PARAMETERS of the NEURAL NETWORKS
282 | # ==============================================================================
283 | 
284 | 
# Network sizes and regularisation. The original comment next to num_ion
# also mentions the value 2.
num_ion = 8
l2_weight = 0.0
embedding_size = 512
num_layers, num_units = 1, 512
keep_conv, keep_dense = 0.75, 0.5
max_gradient_norm = 5.0

# DIA model parameters
neighbor_size = 5  # allow up to ? spectra, including the main spectrum
dia_window = 20.0  # the window size of MS2 scan in Dalton
focal_loss = True

# Training schedule.
batch_size = 32
epoch_stop = 10
steps_per_checkpoint = 100

# Stack sizes. test_stack_size is for test_true_feeding. The former
# decode_stack_size (for beam_search) is deprecated and no longer defined.
train_stack_size, valid_stack_size, test_stack_size = 1000, 5000, 5000

# Echo the settings, in the same order as before.
print("num_ion ", num_ion)
print("l2_weight ", l2_weight)
print("embedding_size ", embedding_size)
print("num_layers ", num_layers)
print("num_units ", num_units)
print("keep_conv ", keep_conv)
print("keep_dense ", keep_dense)
print("max_gradient_norm ", max_gradient_norm)
print("batch_size ", batch_size)
print("epoch_stop ", epoch_stop)
print("train_stack_size ", train_stack_size)
print("valid_stack_size ", valid_stack_size)
print("test_stack_size ", test_stack_size)
print("steps_per_checkpoint ", steps_per_checkpoint)
329 | 
330 | 
331 | # ==============================================================================
332 | # INPUT/OUTPUT FILES
333 | # ==============================================================================
334 | 
335 | 
336 | # pre-built knapsack matrix
# pre-built knapsack matrix
knapsack_file = "knapsack.npy"

# training/testing files, taken from the command-line flags, e.g.
#   --train_spectrum data.training/aa.hla.bassani.nature_2016.mel_16.class_1/spectrum.mgf
#   --train_feature  data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.labeled.mass_corrected.train.noshare
input_spectrum_file_train, input_feature_file_train = (
    FLAGS.train_spectrum, FLAGS.train_feature)
input_spectrum_file_valid, input_feature_file_valid = (
    FLAGS.valid_spectrum, FLAGS.valid_feature)
input_spectrum_file_test, input_feature_file_test = (
    FLAGS.test_spectrum, FLAGS.test_feature)

# denovo files; the output is written next to the input feature file, e.g.
#   --denovo_feature data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.mass_corrected
denovo_input_spectrum_file, denovo_input_feature_file = (
    FLAGS.denovo_spectrum, FLAGS.denovo_feature)
denovo_output_file = denovo_input_feature_file + ".deepnovo_denovo"

# test accuracy: every report file name is derived from the predicted file
predicted_format = "deepnovo"
target_file, predicted_file = FLAGS.target_file, FLAGS.predicted_file
accuracy_file, denovo_only_file, scan2fea_file, multifea_file = tuple(
    predicted_file + _suffix
    for _suffix in (".accuracy", ".denovo_only", ".scan2fea", ".multifea"))
370 | 
371 | # feature file column format
# feature file column format: 0-based column indices in file order;
# col_num comes last and has the value 8
(col_feature_id,
 col_precursor_mz,
 col_precursor_charge,
 col_rt_mean,
 col_raw_sequence,
 col_scan_list,
 col_ms1_list,
 col_feature_area,
 col_num) = range(9)

# predicted file column format: 0-based column indices in file order
(pcol_feature_id,
 pcol_feature_area,
 pcol_sequence,
 pcol_score,
 pcol_position_score,
 pcol_precursor_mz,
 pcol_precursor_charge,
 pcol_protein_id,
 pcol_scan_list_middle,
 pcol_scan_list_original,
 pcol_score_max) = range(11)
394 | 
395 | # ==============================================================================
396 | # DB SEARCH PARAMETERS
397 | # ==============================================================================
398 | 
399 | 
# Input format and digestion settings.
data_format, cleavage_rule = "mgf", "trypsin"
num_missed_cleavage = 2

# Modifications: amino acids with a fixed / variable modification, and
# num_mod.
fixed_mod_list, var_mod_list = ['C'], ['N', 'Q', 'M']
num_mod = 3

# Precursor tolerances: absolute (Da) and relative. The relative one is
# 15 ppm stored as a fraction; it is noted as a replacement for the absolute
# 0.01 Da, and "20 better" is the original author's remark.
precursor_mass_tolerance = 0.01
precursor_mass_ppm = 15.0/1000000

topk_output = 1
409 | 
410 | # db files
411 | # ~ db_fasta_file = "data/uniprot_sprot.human.db_decoy.fasta"
412 | # ~ db_input_spectrum_file = "data.training/dia.pecan.hela.2018_03_29/testing.spectrum.mgf"
413 | # ~ db_input_feature_file = "data.training/dia.abrf.2018_03_27/testing.feature.csv.2k"
414 | # ~ db_output_file = db_input_feature_file + ".deepnovo_db"
415 | # ~ if FLAGS.decoy:  
416 |   # ~ db_output_file += ".decoy"
417 | 
418 | # hybrid files
419 | # ~ hybrid_fasta_file = "data/uniprot_sprot.human.db_decoy.fasta"
420 | # ~ hybrid_input_spectrum_file = "data.training/dia.abrf.2018_03_27/prediction.spectrum.mgf"
421 | # ~ hybrid_input_feature_file = "data.training/dia.abrf.2018_03_27/prediction.feature.csv.part1"
422 | # ~ hybrid_denovo_file = hybrid_input_feature_file + ".deepnovo_hybrid_denovo"
423 | # ~ hybrid_output_file = hybrid_input_feature_file + ".deepnovo_hybrid"
424 | # ~ if FLAGS.decoy:
425 |   # ~ hybrid_output_file += ".decoy"
426 | 
427 | 


--------------------------------------------------------------------------------
/plot.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import math
  3 | import numpy as np
  4 | import pandas as pd
  5 | import csv
  6 | import matplotlib
  7 | matplotlib.use('Agg')
  8 | import matplotlib.pyplot as pyplot
  9 | from matplotlib_venn import venn2
 10 | from matplotlib_venn import venn3
 11 | matplotlib.rcParams.update({'font.size': 11})
 12 | from scipy import stats
 13 | 
 14 | 
 15 | # ~ # TEMP
 16 | # ~ file_path = "step_5.output_neoantigen_criteria.xlsx"
 17 | # ~ value_list = pd.read_excel(file_path, sheetname='5_targets_152_candidates')['total_abundance'].values
 18 | # ~ fig, ax = pyplot.subplots()
 19 | # ~ pyplot.boxplot([value_list], labels=['total_abundance'])
 20 | # ~ ax.set_yscale('log')
 21 | # ~ ax.set_ylabel('Total abundance of supporting PSMs')
 22 | # ~ ax.spines["top"].set_visible(False)
 23 | # ~ ax.spines["right"].set_visible(False)
 24 | # ~ # GRLAFFLKY
 25 | # ~ pyplot.plot([1], [134464000], color='red', marker='o', markersize=6)
 26 | # ~ pyplot.savefig("temp.png")
 27 | 
 28 | 
 29 | def read_netmhcpan_csv(input_file, num_allele):
 30 | 
 31 |   best_nM_list = []
 32 |   best_rank_list = []
 33 |   with open(input_file, 'r') as input_handle:
 34 |     csv_reader = csv.DictReader(input_handle, delimiter=',')
 35 |     for row in csv_reader:
 36 |       best_nM = min([float(row['nM' + str(x)]) for x in range(1, num_allele+1)])
 37 |       best_rank = min([float(row['Rank' + str(x)]) for x in range(1, num_allele+1)])
 38 |       best_nM_list.append(best_nM)
 39 |       best_rank_list.append(best_rank)
 40 |   return best_nM_list, best_rank_list
 41 | 
 42 | 
 43 | def draw_figure2_boxplot_netmhcpan():
 44 | 
 45 |   num_allele = 4
 46 |   denovo_path = "deepnovo.aa.figure_2g.netmhcpan_denovo.csv"
 47 |   db_path = "deepnovo.aa.figure_2g.netmhcpan_db.csv"
 48 |   iedb_path = "deepnovo.aa.figure_2g.netmhcpan_iedb.csv"
 49 |   denovo_nM_list, denovo_rank_list = read_netmhcpan_csv(denovo_path, num_allele)
 50 |   db_nM_list, db_rank_list = read_netmhcpan_csv(db_path, num_allele)
 51 |   iedb_nM_list, iedb_rank_list = read_netmhcpan_csv(iedb_path, num_allele)
 52 | 
 53 |   # ~ colors = ['red', 'dodgerblue', 'lightgrey']
 54 |   # ~ nM_list = [denovo_nM_list, db_nM_list, iedb_nM_list]
 55 |   # ~ print([len(x) for x in nM_list])
 56 |   # ~ fig, ax = pyplot.subplots()
 57 |   # ~ nM_plot = pyplot.boxplot(nM_list, labels=['De novo', 'Database', 'IEDB'], patch_artist=True)
 58 |   # ~ for patch, color in zip(nM_plot['boxes'], colors):
 59 |     # ~ patch.set_facecolor(color)
 60 |   # ~ ax.set_yscale('log')
 61 |   # ~ ax.set_ylabel('Binding affinity (nM, log-scale)')
 62 |   # ~ ax.spines["top"].set_visible(False)
 63 |   # ~ ax.spines["right"].set_visible(False)
 64 |   # ~ # 500-nM threshold
 65 |   # ~ pyplot.plot([0, 6], [500, 500], color='black', linestyle='--', linewidth=1)
 66 |   # ~ pyplot.savefig("figure2.boxplot_nM.png")
 67 | 
 68 |   # ~ colors = ['red', 'dodgerblue']
 69 |   colors = ['red', 'dodgerblue', 'lightgrey']
 70 |   # ~ rank_list = [denovo_rank_list, db_rank_list]
 71 |   rank_list = [denovo_rank_list, db_rank_list, iedb_rank_list]
 72 |   print([len(x) for x in rank_list])
 73 |   fig, ax = pyplot.subplots()
 74 |   # ~ rank_plot = pyplot.boxplot(rank_list, labels=['De novo', 'Database'], patch_artist=True)
 75 |   rank_plot = pyplot.boxplot(rank_list, labels=['De novo', 'Database', 'IEDB'], patch_artist=True)
 76 |   for patch, color in zip(rank_plot['boxes'], colors):
 77 |     patch.set_facecolor(color)
 78 |   ax.set_yscale('log')
 79 |   ax.set_ylabel('Binding affinity rank (%, log-scale)')
 80 |   ax.spines["top"].set_visible(False)
 81 |   ax.spines["right"].set_visible(False)
 82 |   # 2% and 0.5% threshold
 83 |   pyplot.plot([0, 6], [2, 2], color='black', linestyle='--', linewidth=1)
 84 |   pyplot.plot([0, 6], [0.5, 0.5], color='black', linestyle='--', linewidth=1)
 85 |   pyplot.savefig("figure2.boxplot_rank.png")
 86 | 
 87 |   print("np.log(np.median(denovo_rank_list)) =", np.log(np.median(denovo_rank_list)))
 88 |   print("np.log(np.median(db_rank_list)) =", np.log(np.median(db_rank_list)))
 89 |   print("np.log(np.median(iedb_rank_list))", np.log(np.median(iedb_rank_list)))
 90 |   mannwhitneyu, pvalue = stats.mannwhitneyu(denovo_rank_list, iedb_rank_list)
 91 |   print("mannwhitneyu =", mannwhitneyu)
 92 |   print("pvalue =", pvalue)
 93 | 
 94 | # ~ draw_figure2_boxplot_netmhcpan()
 95 | 
 96 | 
 97 | def read_immuno_csv(input_file):
 98 | 
 99 |   score_list = []
100 |   with open(input_file, 'r') as input_handle:
101 |     csv_reader = csv.DictReader(input_handle, delimiter=',')
102 |     for row in csv_reader:
103 |       score_list.append(float(row['score']))
104 |   return score_list
105 | 
106 | 
def draw_figure2_boxplot_immuno():
  """Draw the boxplot of immunogenicity scores and save it as a png.

  Reads the de novo and database score files with read_immuno_csv(), draws
  one box per source with a dashed line at 0, and saves
  "figure2.boxplot_immuno.png". It also prints both medians and the
  Mann-Whitney U result of de novo against database.
  """

  denovo_score_list = read_immuno_csv("deepnovo.aa.figure_S5.mel_16.immuno_denovo.csv")
  db_score_list = read_immuno_csv("deepnovo.aa.figure_S5.mel_16.immuno_db.csv")

  box_colors = ['red', 'dodgerblue']
  box_labels = ['De novo', 'Database']
  score_list = [denovo_score_list, db_score_list]
  print([len(scores) for scores in score_list])

  fig, ax = pyplot.subplots()
  score_plot = pyplot.boxplot(score_list, labels=box_labels, patch_artist=True)
  for box, box_color in zip(score_plot['boxes'], box_colors):
    box.set_facecolor(box_color)
  ax.set_ylabel('Immunogenicity')
  for side in ("top", "right"):
    ax.spines[side].set_visible(False)
  # dashed reference line at score 0
  pyplot.plot([0, 6], [0., 0.], color='black', linestyle='--', linewidth=1)
  pyplot.savefig("figure2.boxplot_immuno.png")

  print("np.median(denovo_score_list) =", np.median(denovo_score_list))
  print("np.median(db_score_list) =", np.median(db_score_list))
  mannwhitneyu, pvalue = stats.mannwhitneyu(denovo_score_list, db_score_list)
  print("mannwhitneyu =", mannwhitneyu)
  print("pvalue =", pvalue)
141 | 
142 | # ~ draw_figure2_boxplot_immuno()
143 | 
144 | 
def draw_figure2_venn():
  """Draw a 3-set Venn diagram of de novo, database and IEDB peptides.

  Each peptide set is read from the sheet (and the column of the same name) of
  a hard-coded Excel workbook; the diagram is saved to "venn3.svg".
  """

  file_path = "temp.manuscript/deepnovo.aa.figure_2.step6.xlsx"
  sheet_names = ('denovo_peptide', 'db_peptide', 'iedb_peptide')
  peptide_sets = [set(pd.read_excel(file_path, sheet_name=name)[name].values)
                  for name in sheet_names]
  venn_plot = venn3(subsets=peptide_sets, set_labels=('De novo', 'Database', 'IEDB'))

  # (region id, colour, alpha) of the three regions that belong to one set only
  exclusive_regions = (('100', 'red', 0.75),
                       ('010', 'skyblue', 1.0),
                       ('001', 'grey', 0.5))
  for region_id, region_color, region_alpha in exclusive_regions:
    region_patch = venn_plot.get_patch_by_id(region_id)
    region_patch.set_color(region_color)
    region_patch.set_alpha(region_alpha)
  pyplot.savefig("venn3.svg")
160 | 
161 | # ~ draw_figure2_venn()
162 | 
163 | 
def plot_spectrum_array(spectrum_array, figure_name):
  """Plot each row of spectrum_array in its own vertically stacked subplot.

  The y-axis of every subplot is fixed to [0.0, 1.0]. The figure is shown,
  saved to figure_name, and then closed.

  Args:
    spectrum_array: object exposing .shape[0] and [row, :] indexing.
    figure_name: output path handed to Figure.savefig().
  """
  print("plot_spectrum_array()")

  canvas = plt.figure(1)
  num_rows = spectrum_array.shape[0]
  for panel, row in enumerate(range(num_rows), start=1):
    plt.subplot(num_rows, 1, panel)
    plt.plot(spectrum_array[row,:])
    plt.ylim((0.0,1.0))
  plt.show()
  canvas.savefig(figure_name)
  plt.close()
176 | 
177 | #~ plot_spectrum_array(np.load("spectrum_original_forward.npy"), "spectrum_original_forward.pdf")
178 | 
179 | 
def read_feature_id(input_file, split_char):
  """Collect the first field of every data line of a delimited text file.

  Args:
    input_file: path of the text file; its first line is skipped as a header.
    split_char: regular expression used with re.split() to split each line.

  Returns:
    A set holding the first field of each remaining line.
  """

  with open(input_file, 'r') as handle:
    handle.readline()  # skip the header line
    return {re.split(split_char, row)[0] for row in handle}
190 | 
191 | 
192 | # figure2.venn2/3.peaks_deepnovo.png
193 | #~ matplotlib.rcParams.update({'font.size': 16})
194 | #~ peaks_set = read_feature_id("data.training/dia.urine.2018_03_29/testing.feature.csv", ',|\r|\n')
195 | #~ peaks_set2 = read_feature_id("data.training/dia.urine.2018_03_29/peaks.denovo.csv.top.feature_id", '\t|\r|\n')
196 | #~ deepnovo_set = read_feature_id("data.training/dia.urine.2018_03_29/testing.unlabeled.csv.deepnovo_denovo.minlen_5.top", '\t|\r|\n')
197 | #~ set_labels = ("PEAKS DB", "PEAKS Denovo", "DeepNovo-DIA")
198 | #~ set_labels = ("", "", "")
199 | #~ venn_plot = venn3(subsets=[peaks_set, peaks_set2, deepnovo_set], set_labels=set_labels)
200 | #~ for text in venn_plot.set_labels:
201 |   #~ text.set_fontsize(16)
202 | #~ pyplot.savefig("figure2.venn2.peaks_18_20_deepnovo.png")
203 | 
204 | 
205 | # figure2.bar.aa/peptide.png
def draw_figure2_bar(y_value, y_label, figure_file):
  """Draw a three-bar chart ('Top 10k', 'Top 5k', 'Top 2k') and save it.

  Args:
    y_value: sequence of exactly three bar heights (the bars are unpacked into
      three names, so any other length raises ValueError).
    y_label: label of the y-axis.
    figure_file: output path handed to pyplot.savefig().
  """

  fig, ax = pyplot.subplots()
  x_value = range(1, len(y_value)+1)
  bar_10k, bar_5k, bar_2k = pyplot.bar(x_value, y_value, width=0.4, align='center')

  # (bar, face colour, alpha); alpha None keeps the default opacity
  bar_styles = ((bar_10k, 'g', 0.5),
                (bar_5k, 'lightskyblue', None),
                (bar_2k, 'blue', 0.7))
  for bar, face_color, alpha in bar_styles:
    bar.set_facecolor(face_color)
    if alpha is not None:
      bar.set_alpha(alpha)

  # print each value slightly above its bar
  for position, height in zip(x_value, y_value):
    ax.text(position-0.1, height + 3, str(height), color='black')

  ax.set_xticks(x_value)
  ax.set_xticklabels(['Top 10k', 'Top 5k', 'Top 2k'])
  ax.set_xlim([0, 4])
  ax.set_ylim([0, 100])
  ax.set_ylabel(y_label)
  for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
  ax.xaxis.set_ticks_position('bottom')
  ax.yaxis.set_ticks_position('left')
  pyplot.savefig(figure_file)
228 | 
229 | #~ denovo_only = [41.7, 22.1, 5.5]
230 | #~ draw_figure2_bar(denovo_only, 'Denovo only peptides on top of database (%)', 
231 |                  #~ 'figure2.bar.denovo_only.png')
232 | #~ aa_accuracy = [76.2, 83.6, 94.2]
233 | #~ draw_figure2_bar(aa_accuracy, 'Amino acid accuracy (%)', 'figure2.bar.aa.png')
234 | #~ peptide_accuracy = [41.4, 53.0, 79.9]
235 | #~ draw_figure2_bar(peptide_accuracy, 'Peptide accuracy (%)', 'figure2.bar.peptide.png')
236 | 
237 | 
def read_feature_accuracy(input_file, split_char):
  """Parse an accuracy report into a list of per-feature dictionaries.

  Args:
    input_file: path of the report; its first line is skipped as a header.
    split_char: regular expression used with re.split() to split each line.

  Returns:
    A list of dicts, one per data line, with the keys
      "feature_id"         : field 0, kept as a string
      "feature_area_log10" : log10 of field 1, where field 1 is floored at 1.0
      "predicted_score"    : field 4 as float
      "recall_AA"          : field 5 as float
      "predicted_len"      : field 6 as float
  """

  parsed_rows = []
  with open(input_file, 'r') as handle:
    handle.readline()  # skip the header line
    for row in handle:
      fields = re.split(split_char, row)
      parsed_rows.append({
          "feature_id": fields[0],
          # areas below 1.0 are floored so that the logarithm is never negative
          "feature_area_log10": math.log10(max(float(fields[1]), 1.0)),
          "predicted_score": float(fields[4]),
          "recall_AA": float(fields[5]),
          "predicted_len": float(fields[6]),
      })
  return parsed_rows
253 | 
254 | 
255 | # figure2.accuracy.area.png
def draw_figure2_accuracy_area(accuracy_file):
  """Plot feature proportion (bars) and amino acid accuracy (line) per abundance bin.

  Features are binned by log10 area into unit-wide bins centred on 3, 4, ..., 10.
  For every bin with a positive total predicted length the function plots
    - the share of all features falling into the bin (left axis, bars), and
    - 100 * sum(recall_AA) / sum(predicted_len) of the bin (right axis, line).
  The figure is saved to 'figure2.accuracy.area.png'.

  Args:
    accuracy_file: tab-separated report readable by read_feature_accuracy().
  """

  feature_list = read_feature_accuracy(accuracy_file, '\t|\r|\n')
  num_features = len(feature_list)
  x_value, y_accuracy, y_proportion = [], [], []
  for center in np.arange(3, 11, 1.0):
    in_bin = [f for f in feature_list
              if center-0.5 < f['feature_area_log10'] <= center+0.5]
    matched_aa = sum(f['recall_AA'] for f in in_bin)
    total_len = sum(f['predicted_len'] for f in in_bin)
    if not total_len > 0:
      continue  # bins without predicted amino acids are not plotted
    x_value.append(center)
    y_accuracy.append(100*matched_aa/total_len)
    y_proportion.append(100.0 * len(in_bin) / num_features)

  # left axis: proportion of features per bin
  fig, left_ax = pyplot.subplots()
  pyplot.bar(x_value, y_proportion, width=0.6, align='center', color='salmon', alpha=0.75)
  left_ax.set_xlabel('Feature abundance (log10 scale)')
  left_ax.set_ylabel('Proportion of features (%)', color='salmon')
  left_ax.tick_params('y', colors='salmon')
  left_ax.set_xlim([0, 12])
  left_ax.set_ylim([0, 100])
  left_ax.spines['top'].set_visible(False)
  left_ax.xaxis.set_ticks_position('bottom')
  for position, proportion in zip(x_value, y_proportion):
    if proportion > 0:
      left_ax.text(position-0.2, proportion + 2, str(round(proportion,1)),
                   fontsize=12, color='black')

  # right axis: amino acid accuracy per bin
  right_ax = left_ax.twinx()
  right_ax.plot(x_value, y_accuracy, '-o', linewidth=2.0, color='blue')
  right_ax.set_ylabel('Amino acid accuracy (%)', color='blue')
  right_ax.tick_params('y', colors='blue')
  right_ax.set_xlim([0, 12])
  right_ax.set_ylim([0, 100])
  for position, accuracy in zip(x_value, y_accuracy):
    if accuracy > 0:
      right_ax.text(position-0.2, accuracy + 2, str(round(accuracy,1)),
                    fontsize=12, color='black')

  pyplot.savefig('figure2.accuracy.area.png')
301 | 
302 | #~ accuracy_file = "data.training/dia.pecan.plasma.2018_03_29/testing_plasma.feature.csv.deepnovo_denovo.accuracy"
303 | #~ accuracy_file = "data.training/dia.hla.elife.jurkat_oxford/testing_jurkat_oxford.unlabeled.csv.deepnovo_denovo.accuracy"
304 | #~ accuracy_file = "Supplementary Table S6.txt"
305 | #~ draw_figure2_accuracy_area(accuracy_file)
306 | 
307 | 
def get_accuracy_score(accuracy_file):
  """Compute amino acid accuracy above a series of confidence cutoffs.

  The confidence of a feature is 100 * exp(predicted_score). For each cutoff
  10, 20, ..., 100 the accuracy is 100 * sum(recall_AA) / sum(predicted_len)
  over the features whose confidence is at least the cutoff; it is 0 when the
  summed predicted length is not positive.

  Args:
    accuracy_file: tab-separated report readable by read_feature_accuracy().

  Returns:
    (x_value, y_value): the list of cutoffs and the list of accuracies.
  """

  feature_list = read_feature_accuracy(accuracy_file, '\t|\r|\n')
  num_value = 10
  step = 100 // num_value
  x_value = list(range(step, step*num_value + 1, step))

  # pair every feature with its confidence once, instead of once per cutoff
  scored_features = [(100*math.exp(f['predicted_score']), f) for f in feature_list]

  y_value = []
  for cutoff in x_value:
    kept = [f for confidence, f in scored_features if cutoff <= confidence]
    matched_aa = sum(f['recall_AA'] for f in kept)
    total_len = sum(f['predicted_len'] for f in kept)
    y_value.append(100*matched_aa/total_len if total_len > 0 else 0)

  return x_value, y_value
327 | 
328 | 
329 | # figure2.accuracy.score.png
def draw_figure2_accuracy_score():
  """Plot amino acid accuracy against de novo confidence score for two models.

  Two hard-coded accuracy reports are evaluated with get_accuracy_score(): one
  plotted as 'Personalized model' and one as 'Generic model'. A dotted
  horizontal line marks the 95% accuracy level. The figure is saved to
  'figure2.accuracy.score.png'.
  """

  data_dir = "data.training/aa.hla.bassani.nature_2016.mel_15.class_1/"
  report_name = "feature.csv.labeled.mass_corrected.test.noshare.deepnovo_denovo.accuracy"
  x_test, y_test = get_accuracy_score(data_dir + report_name)
  x_test_generic, y_test_generic = get_accuracy_score(
      data_dir + "train.exclude_mel_15/" + report_name)

  fig, ax = pyplot.subplots()
  plot_cutoff_y, = pyplot.plot([0, 100], [95, 95], ':', linewidth=1.0,
                               color='black', markeredgecolor='black', alpha=1.0)
  plot_test_generic, = pyplot.plot(x_test_generic, y_test_generic, '--o', linewidth=1.0,
                                   color='orange', markeredgecolor='orange', alpha=1.0)
  plot_test, = pyplot.plot(x_test, y_test, '-s', linewidth=1.0,
                           color='red', markeredgecolor='red', alpha=1.0)
  pyplot.legend([plot_test, plot_test_generic, plot_cutoff_y],
                ['Personalized model', 'Generic model', '95% cutoff'],
                loc='lower right')
  pyplot.yticks([80, 83.5, 86.7, 90, 95, 100],
                ['80', '83.5', '86.7', '90', '95', '100'])

  ax.set_xlabel('De novo confidence score')
  ax.set_ylabel('Amino acid accuracy (%)')
  ax.set_xlim([0, 105])
  ax.set_ylim([80, 101])
  for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
  ax.xaxis.set_ticks_position('bottom')
  ax.yaxis.set_ticks_position('left')
  pyplot.savefig('figure2.accuracy.score.png')
354 |   # ~ pyplot.savefig('figure2.accuracy.score.svg')
355 | 
356 | # ~ draw_figure2_accuracy_score()
357 | 
358 | 
359 | 
360 | #~ db_file = "data.training/dia.pecan.plasma.2018_03_28/testing.feature.csv.deepnovo_denovo"
361 | #~ db_abundance = read_feature_abundance(db_file, '\t|\r|\n')
362 | #~ db_abundance_log10 = np.log10(np.array(db_abundance))
363 | #~ denovo_top_file = "data.training/dia.pecan.plasma.2018_03_28/testing.unlabeled.csv.deepnovo_denovo.len_5.7k"
364 | #~ denovo_abundance = read_feature_abundance(denovo_top_file, '\t|\r|\n')
365 | #~ denovo_abundance_log10 = np.log10(np.array(denovo_abundance))
366 | #~ print(len(db_abundance_log10))
367 | #~ print(len(denovo_abundance_log10))
368 | #~ n, bins, patches = pyplot.hist(db_abundance_log10, 50, facecolor='salmon', alpha=0.5)
369 | #~ n, bins, patches = pyplot.hist(denovo_abundance_log10, 50, facecolor='green', alpha=0.5)
370 | #~ pyplot.xlabel('Feature abundance (log10 scale)')
371 | #~ pyplot.ylabel('Number of features')
372 | #~ pyplot.savefig('figure2.hist.denovo.png')
373 | 


--------------------------------------------------------------------------------
/deepnovo_worker_test.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Hieu Tran. All Rights Reserved.
  2 | #
  3 | # DeepNovo is publicly available for non-commercial uses.
  4 | # ==============================================================================
  5 | 
  6 | """TODO(nh2tran): docstring."""
  7 | 
  8 | from __future__ import absolute_import
  9 | from __future__ import division
 10 | from __future__ import print_function
 11 | 
 12 | import re
 13 | import sys
 14 | 
 15 | import numpy as np
 16 | 
 17 | import deepnovo_config
 18 | 
 19 | class WorkerTest(object):
 20 |   """TODO(nh2tran): docstring.
 21 |      The WorkerTest should be stand-alone and separated from other workers.
 22 |   """
 23 | 
 24 | 
 25 |   def __init__(self):
 26 |     """TODO(nh2tran): docstring."""
 27 | 
 28 |     print("".join(["="] * 80)) # section-separating line
 29 |     print("WorkerTest.__init__()")
 30 | 
 31 |     # we currently use deepnovo_config to store both const & settings
 32 |     # the settings should be shown in __init__() to keep track carefully
 33 |     self.MZ_MAX = deepnovo_config.MZ_MAX
 34 | 
 35 |     self.target_file = deepnovo_config.target_file
 36 |     self.predicted_file = deepnovo_config.predicted_file
 37 |     self.predicted_format = deepnovo_config.predicted_format
 38 |     self.accuracy_file = deepnovo_config.accuracy_file
 39 |     self.denovo_only_file = deepnovo_config.denovo_only_file
 40 |     self.scan2fea_file = deepnovo_config.scan2fea_file
 41 |     self.multifea_file = deepnovo_config.multifea_file
 42 |     print("target_file = {0:s}".format(self.target_file))
 43 |     print("predicted_file = {0:s}".format(self.predicted_file))
 44 |     print("predicted_format = {0:s}".format(self.predicted_format))
 45 |     print("accuracy_file = {0:s}".format(self.accuracy_file))
 46 |     print("denovo_only_file = {0:s}".format(self.denovo_only_file))
 47 |     print("scan2fea_file = {0:s}".format(self.scan2fea_file))
 48 |     print("multifea_file = {0:s}".format(self.multifea_file))
 49 | 
 50 |     self.target_dict = {}
 51 |     self.predicted_list = []
 52 | 
 53 | 
 54 |   def test_accuracy(self, db_peptide_list=None):
 55 |     """TODO(nh2tran): docstring."""
 56 | 
 57 |     print("".join(["="] * 80)) # section-separating line
 58 |     print("WorkerTest.test_accuracy()")
 59 | 
 60 |     # write the accuracy of predicted peptides
 61 |     accuracy_handle = open(self.accuracy_file, 'w')
 62 |     header_list = ["feature_id",
 63 |                    "feature_area",
 64 |                    "target_sequence",
 65 |                    "predicted_sequence",
 66 |                    "predicted_score",
 67 |                    "recall_AA",
 68 |                    "predicted_len",
 69 |                    "target_len",
 70 |                    "scan_list_middle",
 71 |                    "scan_list_original"]
 72 |     header_row = "\t".join(header_list)
 73 |     print(header_row, file=accuracy_handle, end="\n")
 74 | 
 75 |     # write denovo_only peptides
 76 |     denovo_only_handle = open(self.denovo_only_file, 'w')
 77 |     header_list = ["feature_id",
 78 |                    "feature_area",
 79 |                    "predicted_sequence",
 80 |                    "predicted_score",
 81 |                    "predicted_score_max",
 82 |                    "scan_list_middle",
 83 |                    "scan_list_original"]
 84 |     header_row = "\t".join(header_list)
 85 |     print(header_row, file=denovo_only_handle, end="\n")
 86 | 
 87 |     self._get_target()
 88 |     target_count_total = len(self.target_dict)
 89 |     target_len_total = sum([len(x) for x in self.target_dict.itervalues()])
 90 | 
 91 |     # this part is tricky!
 92 |     # some target peptides are reported by PEAKS DB but not found in
 93 |     #   db_peptide_list due to mistakes in cleavage rules.
 94 |     # if db_peptide_list is given, we only consider those target peptides,
 95 |     #   otherwise, use all target peptides
 96 |     target_dict_db = {}
 97 |     if db_peptide_list is not None:
 98 |       for feature_id, target in self.target_dict.iteritems():
 99 |         target_simplied = target
100 |         # remove the extension 'mod' from variable modifications
101 |         target_simplied = ['M' if x=='M(Oxidation)' else x for x in target_simplied]
102 |         target_simplied = ['N' if x=='N(Deamidation)' else x for x in target_simplied]
103 |         target_simplied = ['Q' if x=='Q(Deamidation)' else x for x in target_simplied]
104 |         if target_simplied in db_peptide_list:
105 |           target_dict_db[feature_id] = target
106 |         else:
107 |           print("target not found: ", target_simplied)
108 |     else:
109 |       target_dict_db = self.target_dict
110 |     target_count_db = len(target_dict_db)
111 |     target_len_db = sum([len(x) for x in target_dict_db.itervalues()])
112 | 
113 |     # we also skip target peptides with precursor_mass > MZ_MAX
114 |     target_dict_db_mass = {}
115 |     for feature_id, peptide in target_dict_db.iteritems():
116 |       if self._compute_peptide_mass(peptide) <= self.MZ_MAX:
117 |         target_dict_db_mass[feature_id] = peptide
118 |     target_count_db_mass = len(target_dict_db_mass)
119 |     target_len_db_mass = sum([len(x) for x in target_dict_db_mass.itervalues()])
120 | 
121 |     # read predicted peptides from deepnovo or peaks
122 |     if deepnovo_config.predicted_format == "deepnovo":
123 |       self._get_predicted()
124 |     else:
125 |       self._get_predicted_peaks()
126 | 
127 |     # note that the prediction has already skipped precursor_mass > MZ_MAX
128 |     # we also skip predicted peptides whose feature_id's are not in target_dict_db_mass
129 |     predicted_count_mass = len(self.predicted_list)
130 |     predicted_count_mass_db = 0
131 |     predicted_len_mass_db = 0
132 |     predicted_only = 0
133 |     # the recall is calculated on remaining peptides
134 |     recall_AA_total = 0.0
135 |     recall_peptide_total = 0.0
136 | 
137 |     # record scan with multiple features
138 |     scan_dict = {}
139 | 
140 |     for index, predicted in enumerate(self.predicted_list):
141 | 
142 |       feature_id = predicted["feature_id"]
143 |       feature_area = str(predicted["feature_area"])
144 |       feature_scan_list_middle = predicted["scan_list_middle"]
145 |       feature_scan_list_original = predicted["scan_list_original"]
146 |       if feature_scan_list_original:
147 |         for scan in re.split(';|\r|\n', feature_scan_list_original):
148 |           if scan in scan_dict:
149 |             scan_dict[scan]["feature_count"] += 1
150 |             scan_dict[scan]["feature_list"].append(feature_id)
151 |           else:
152 |             scan_dict[scan] = {}
153 |             scan_dict[scan]["feature_count"] = 1
154 |             scan_dict[scan]["feature_list"] = [feature_id]
155 | 
156 |       if feature_id in target_dict_db_mass:
157 | 
158 |         predicted_count_mass_db += 1
159 | 
160 |         target = target_dict_db_mass[feature_id]
161 |         target_len= len(target)
162 | 
163 |         # if >= 1 denovo peptides reported, calculate the best accuracy
164 |         best_recall_AA = 0
165 |         best_predicted_sequence = predicted["sequence"][0]
166 |         best_predicted_score = predicted["score"][0]
167 |         for predicted_sequence, predicted_score in zip(predicted["sequence"], predicted["score"]):
168 |           predicted_AA_id = [deepnovo_config.vocab[x] for x in predicted_sequence]
169 |           target_AA_id = [deepnovo_config.vocab[x] for x in target]
170 |           recall_AA = self._match_AA_novor(target_AA_id, predicted_AA_id)
171 |           if (recall_AA > best_recall_AA
172 |               or (recall_AA == best_recall_AA and predicted_score > best_predicted_score)):
173 |             best_recall_AA = recall_AA
174 |             best_predicted_sequence = predicted_sequence[:]
175 |             best_predicted_score = predicted_score
176 |         recall_AA = best_recall_AA
177 |         predicted_sequence = best_predicted_sequence[:]
178 |         predicted_score = best_predicted_score
179 | 
180 |         recall_AA_total += recall_AA
181 |         if recall_AA == target_len:
182 |           recall_peptide_total += 1
183 |         predicted_len= len(predicted_sequence)
184 |         predicted_len_mass_db += predicted_len
185 | 
186 |         # convert to string format to print out
187 |         target_sequence = ",".join(target)
188 |         predicted_sequence = ",".join(predicted_sequence)
189 |         predicted_score = "{0:.2f}".format(predicted_score)
190 |         recall_AA = "{0:d}".format(recall_AA)
191 |         predicted_len = "{0:d}".format(predicted_len)
192 |         target_len = "{0:d}".format(target_len)
193 |         print_list = [feature_id,
194 |                       feature_area,
195 |                       target_sequence,
196 |                       predicted_sequence,
197 |                       predicted_score,
198 |                       recall_AA,
199 |                       predicted_len,
200 |                       target_len,
201 |                       feature_scan_list_middle,
202 |                       feature_scan_list_original]
203 |         print_row = "\t".join(print_list)
204 |         print(print_row, file=accuracy_handle, end="\n")
205 |       else:
206 |         predicted_only += 1
207 |         predicted_sequence = ';'.join([','.join(x) for x in predicted["sequence"]])
208 |         predicted_score = ';'.join(['{0:.2f}'.format(x) for x in predicted["score"]])
209 |         if predicted["score"]:
210 |           predicted_score_max = '{0:.2f}'.format(np.max(predicted["score"]))
211 |         else:
212 |           predicted_score_max = ''
213 |         print_list = [feature_id,
214 |                       feature_area,
215 |                       predicted_sequence,
216 |                       predicted_score,
217 |                       predicted_score_max,
218 |                       feature_scan_list_middle,
219 |                       feature_scan_list_original]
220 |         print_row = "\t".join(print_list)
221 |         print(print_row, file=denovo_only_handle, end="\n")
222 | 
223 |     accuracy_handle.close()
224 |     denovo_only_handle.close()
225 | 
226 |     multifea_dict = {}
227 |     for scan_id, value in scan_dict.iteritems():
228 |       feature_count = value["feature_count"]
229 |       feature_list = value["feature_list"]
230 |       if feature_count > 1:
231 |         for feature_id in feature_list:
232 |           if feature_id in multifea_dict:
233 |             multifea_dict[feature_id].append(scan_id + ':' + str(feature_count))
234 |           else:
235 |             multifea_dict[feature_id] = [scan_id + ':' + str(feature_count)]
236 | 
237 |     with open(self.scan2fea_file, 'w') as handle:
238 |       header_list = ["scan_id",
239 |                      "feature_count",
240 |                      "feature_list"]
241 |       header_row = "\t".join(header_list)
242 |       print(header_row, file=handle, end="\n")
243 |       for scan_id, value in scan_dict.iteritems():
244 |         print_list = [scan_id,
245 |                       str(value["feature_count"]),
246 |                       ";".join(value["feature_list"])]
247 |         print_row = "\t".join(print_list)
248 |         print(print_row, file=handle, end="\n")
249 | 
250 |     with open(self.multifea_file, 'w') as handle:
251 |       header_list = ["feature_id",
252 |                      "scan_list"]
253 |       header_row = "\t".join(header_list)
254 |       print(header_row, file=handle, end="\n")
255 |       for feature_id, scan_list in multifea_dict.iteritems():
256 |         print_list = [feature_id,
257 |                       ";".join(scan_list)]
258 |         print_row = "\t".join(print_list)
259 |         print(print_row, file=handle, end="\n")
260 | 
261 |     print("target_count_total = {0:d}".format(target_count_total))
262 |     print("target_len_total = {0:d}".format(target_len_total))
263 |     print("target_count_db = {0:d}".format(target_count_db))
264 |     print("target_len_db = {0:d}".format(target_len_db))
265 |     print("target_count_db_mass: {0:d}".format(target_count_db_mass))
266 |     print("target_len_db_mass: {0:d}".format(target_len_db_mass))
267 |     print()
268 | 
269 |     print("predicted_count_mass: {0:d}".format(predicted_count_mass))
270 |     print("predicted_count_mass_db: {0:d}".format(predicted_count_mass_db))
271 |     print("predicted_len_mass_db: {0:d}".format(predicted_len_mass_db))
272 |     print("predicted_only: {0:d}".format(predicted_only))
273 |     print()
274 | 
275 |     print("recall_AA_total = {0:.4f}".format(recall_AA_total / target_len_total))
276 |     print("recall_AA_db = {0:.4f}".format(recall_AA_total / target_len_db))
277 |     print("recall_AA_db_mass = {0:.4f}".format(recall_AA_total / target_len_db_mass))
278 |     print("recall_peptide_total = {0:.4f}".format(recall_peptide_total / target_count_total))
279 |     print("recall_peptide_db = {0:.4f}".format(recall_peptide_total / target_count_db))
280 |     print("recall_peptide_db_mass = {0:.4f}".format(recall_peptide_total / target_count_db_mass))
281 |     print("precision_AA_mass_db  = {0:.4f}".format(recall_AA_total / predicted_len_mass_db))
282 |     print("precision_peptide_mass_db  = {0:.4f}".format(recall_peptide_total / predicted_count_mass_db))
283 |   
284 |   
285 |   def _compute_peptide_mass(self, peptide):
286 |     """TODO(nh2tran): docstring.
287 |     """
288 | 
289 |     #~ print("".join(["="] * 80)) # section-separating line ===
290 |     #~ print("WorkerDB: _compute_peptide_mass()")
291 | 
292 |     peptide_mass = (deepnovo_config.mass_N_terminus
293 |                     + sum(deepnovo_config.mass_AA[aa] for aa in peptide)
294 |                     + deepnovo_config.mass_C_terminus)
295 | 
296 |     return peptide_mass
297 | 
298 | 
299 |   def _get_predicted(self):
300 |     """TODO(nh2tran): docstring."""
301 | 
302 |     print("".join(["="] * 80)) # section-separating line
303 |     print("WorkerTest._get_predicted()")
304 | 
305 |     predicted_list = []
306 |     col_feature_id = deepnovo_config.pcol_feature_id
307 |     col_feature_area = deepnovo_config.pcol_feature_area
308 |     col_sequence = deepnovo_config.pcol_sequence
309 |     col_score = deepnovo_config.pcol_score
310 |     col_scan_list_middle = deepnovo_config.pcol_scan_list_middle
311 |     col_scan_list_original = deepnovo_config.pcol_scan_list_original
312 |     with open(self.predicted_file, 'r') as handle:
313 |       # header
314 |       handle.readline()
315 |       for line in handle:
316 |         line_split = re.split('\t|\n', line)
317 |         predicted = {}
318 |         predicted["feature_id"] = line_split[col_feature_id]
319 |         predicted["feature_area"] = float(line_split[col_feature_area])
320 |         predicted["scan_list_middle"] = line_split[col_scan_list_middle]
321 |         predicted["scan_list_original"] = line_split[col_scan_list_original]
322 |         if line_split[col_sequence]: # not empty sequence
323 |           predicted["sequence"] = [re.split(',', x)
324 |                                    for x in re.split(';', line_split[col_sequence])]
325 |           predicted["score"] = [float(x)
326 |                                 for x in re.split(';', line_split[col_score])]
327 |         else: 
328 |           predicted["sequence"] = [[]]
329 |           predicted["score"] = [-999]
330 |         predicted_list.append(predicted)
331 | 
332 |     self.predicted_list = predicted_list
333 | 
334 | 
335 |   def _get_predicted_peaks(self):
336 |     """TODO(nh2tran): docstring."""
337 | 
338 |     print("".join(["="] * 80)) # section-separating line
339 |     print("WorkerTest._get_predicted_peaks()")
340 | 
341 |     predicted_list = []
342 |     col_fraction_id = 0
343 |     fraction_id_map = {'1':'1',
344 |                        '2':'10',
345 |                        '3':'11',
346 |                        '4':'12',
347 |                        '5':'2',
348 |                        '6':'3',
349 |                        '7':'4',
350 |                        '8':'5',
351 |                        '9':'6',
352 |                        '10':'7',
353 |                        '11':'8',
354 |                        '12':'9',
355 |                       }
356 |     col_scan_id = 1
357 |     col_sequence = 3
358 |     with open(self.predicted_file, 'r') as handle:
359 |       # header
360 |       handle.readline()
361 |       for line in handle:
362 |         line_split = re.split(',|\n', line)
363 |         predicted = {}
364 |         #~ predicted["feature_id"] = "F" + fraction_id_map[line_split[col_fraction_id]] + ":" + line_split[col_scan_id]
365 |         predicted["feature_id"] = "F" + line_split[col_fraction_id] + ":" + line_split[col_scan_id]
366 |         raw_sequence = line_split[col_sequence]
367 |         assert raw_sequence, "Error: wrong format."
368 |         predicted["sequence"] = self._parse_sequence(raw_sequence)
369 |         # skip peptides with precursor_mass > MZ_MAX
370 |         if self._compute_peptide_mass(predicted["sequence"]) > self.MZ_MAX:
371 |           continue
372 |         predicted["feature_area"] = 0
373 |         predicted["scan_list_middle"] = ""
374 |         predicted["scan_list_original"] = ""
375 |         predicted["sequence"] = [predicted["sequence"]]
376 |         predicted["score"] = [-999]
377 |         predicted_list.append(predicted)
378 | 
379 |     self.predicted_list = predicted_list
380 | 
381 | 
382 |   def _get_target(self):
383 |     """TODO(nh2tran): docstring."""
384 | 
385 |     print("".join(["="] * 80)) # section-separating line
386 |     print("WorkerTest._get_target()")
387 | 
388 |     target_dict = {}
389 |     with open(self.target_file, 'r') as handle:
390 |       header_line = handle.readline()
391 |       for line in handle:
392 |         line = re.split(',|\r|\n', line)
393 |         feature_id = line[0]
394 |         raw_sequence = line[deepnovo_config.col_raw_sequence]
395 |         assert raw_sequence, "Error: wrong target format."
396 |         peptide = self._parse_sequence(raw_sequence)
397 |         target_dict[feature_id] = peptide
398 |     self.target_dict = target_dict
399 | 
400 | 
401 |   def _parse_sequence(self, raw_sequence):
402 |     """TODO(nh2tran): docstring."""
403 | 
404 |     #~ print("".join(["="] * 80)) # section-separating line
405 |     #~ print("WorkerTest._parse_sequence()")
406 | 
407 |     raw_sequence_len = len(raw_sequence)
408 |     peptide = []
409 |     index = 0
410 |     while index < raw_sequence_len:
411 |       if raw_sequence[index] == "(":
412 |         if peptide[-1] == "C" and raw_sequence[index:index+8] == "(+57.02)":
413 |           peptide[-1] = "C(Carbamidomethylation)"
414 |           index += 8
415 |         elif peptide[-1] == 'M' and raw_sequence[index:index+8] == "(+15.99)":
416 |           peptide[-1] = 'M(Oxidation)'
417 |           index += 8
418 |         elif peptide[-1] == 'N' and raw_sequence[index:index+6] == "(+.98)":
419 |           peptide[-1] = 'N(Deamidation)'
420 |           index += 6
421 |         elif peptide[-1] == 'Q' and raw_sequence[index:index+6] == "(+.98)":
422 |           peptide[-1] = 'Q(Deamidation)'
423 |           index += 6
424 |         else: # unknown modification
425 |           print("ERROR: unknown modification!")
426 |           print("raw_sequence = ", raw_sequence)
427 |           sys.exit()
428 |       else:
429 |         peptide.append(raw_sequence[index])
430 |         index += 1
431 | 
432 |     return peptide
433 | 
434 | 
435 |   def _match_AA_novor(self, target, predicted):
436 |     """TODO(nh2tran): docstring."""
437 |   
438 |     #~ print("".join(["="] * 80)) # section-separating line
439 |     #~ print("WorkerTest._test_AA_match_novor()")
440 | 
441 |     num_match = 0
442 |     target_len = len(target)
443 |     predicted_len = len(predicted)
444 |     target_mass = [deepnovo_config.mass_ID[x] for x in target]
445 |     target_mass_cum = np.cumsum(target_mass)
446 |     predicted_mass = [deepnovo_config.mass_ID[x] for x in predicted]
447 |     predicted_mass_cum = np.cumsum(predicted_mass)
448 |   
449 |     i = 0
450 |     j = 0
451 |     while i < target_len and j < predicted_len:
452 |       if abs(target_mass_cum[i] - predicted_mass_cum[j]) < 0.5:
453 |         if abs(target_mass[i] - predicted_mass[j]) < 0.1:
454 |         #~ if  decoder_input[index_aa] == output[index_aa]:
455 |           num_match += 1
456 |         i += 1
457 |         j += 1
458 |       elif target_mass_cum[i] < predicted_mass_cum[j]:
459 |         i += 1
460 |       else:
461 |         j += 1
462 | 
463 |     return num_match
464 |   
465 | 


--------------------------------------------------------------------------------
/deepnovo_postprocess.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Hieu Tran. All Rights Reserved.
  2 | #
  3 | # DeepNovo is publicly available for non-commercial uses.
  4 | # ==============================================================================
  5 | 
  6 | """TODO(nh2tran): docstring."""
  7 | 
  8 | from __future__ import absolute_import
  9 | from __future__ import division
 10 | from __future__ import print_function
 11 | 
 12 | import os
 13 | import random
 14 | import sys
 15 | import re
 16 | 
 17 | from Bio import SeqIO
 18 | from Bio.SeqIO import FastaIO
 19 | import Levenshtein
 20 | 
 21 | import csv
 22 | import numpy as np
 23 | import math
 24 | import deepnovo_config
 25 | 
 26 | 
 27 | def compute_peptide_mass(peptide):
 28 |   """TODO(nh2tran): docstring.
 29 |   """
 30 | 
 31 |   #~ print("".join(["="] * 80)) # section-separating line ===
 32 |   #~ print("WorkerDB: _compute_peptide_mass()")
 33 | 
 34 |   peptide_mass = (deepnovo_config.mass_N_terminus
 35 |                   + sum(deepnovo_config.mass_AA[aa] for aa in peptide)
 36 |                   + deepnovo_config.mass_C_terminus)
 37 | 
 38 |   return peptide_mass
 39 | 
 40 | # ~ peptide = 'TASSQRLR'
 41 | # ~ print(peptide)
 42 | # ~ print(compute_peptide_mass(peptide))
 43 | 
 44 | 
 45 | def read_feature_accuracy(input_file):
 46 | 
 47 |   feature_list = []
 48 |   with open(input_file, 'r') as input_handle:
 49 |     csv_reader = csv.DictReader(input_handle, delimiter='\t')
 50 |     for row in csv_reader:
 51 |       feature = {}
 52 |       feature['feature_id'] = row['feature_id']
 53 |       feature['feature_area'] = math.log10(max(float(row['feature_area']), 1.0))
 54 |       feature['predicted_score'] = float(row['predicted_score'])
 55 |       feature['recall_AA'] = float(row['recall_AA'])
 56 |       feature['predicted_len'] = float(row['predicted_len'])
 57 |       feature_list.append(feature)
 58 |   return feature_list
 59 | 
 60 | 
 61 | def find_score_cutoff(accuracy_file, accuracy_cutoff):
 62 |   """TODO(nh2tran): docstring."""
 63 | 
 64 |   print("".join(["="] * 80)) # section-separating line
 65 |   print("find_score_cutoff()")
 66 | 
 67 |   print('accuracy_file =', accuracy_file)
 68 |   print('accuracy_cutoff =', accuracy_cutoff)
 69 | 
 70 |   feature_list = read_feature_accuracy(accuracy_file)
 71 |   feature_list_sorted = sorted(feature_list, key=lambda k: k['predicted_score'], reverse=True)
 72 |   recall_cumsum = np.cumsum([f['recall_AA'] for f in feature_list_sorted])
 73 |   predicted_len_cumsum = np.cumsum([f['predicted_len'] for f in feature_list_sorted])
 74 |   accuracy_cumsum = recall_cumsum / predicted_len_cumsum
 75 |   #cutoff_index = np.flatnonzero(accuracy_cumsum < accuracy_cutoff)[0]
 76 |   cutoff_index = np.flatnonzero(accuracy_cumsum >= accuracy_cutoff)[-1]
 77 |   cutoff_score = feature_list_sorted[cutoff_index]['predicted_score']
 78 |   print('cutoff_index = ', cutoff_index)
 79 |   print('cutoff_score = ', cutoff_score)
 80 |   print('cutoff_score = ', 100*math.exp(cutoff_score))
 81 | 
 82 |   return cutoff_score
 83 | 
 84 | 
 85 | def select_top_score(input_file, output_file, score_cutoff):
 86 |   """Select a threshold of de novo confidence scores to filter de novo results.
 87 |      The score threshold is calculated based on a 95% cutoff of the testing accuracy.
 88 | 
 89 |      Usage:
 90 |        accuracy_cutoff = 0.95
 91 |        accuracy_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.labeled.mass_corrected.test.noshare.deepnovo_denovo.accuracy"
 92 |        score_cutoff = find_score_cutoff(accuracy_file, accuracy_cutoff)
 93 |        input_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.mass_corrected.deepnovo_denovo"
 94 |        output_file = input_file + ".top95"
 95 |        select_top_score(input_file, output_file, score_cutoff)
 96 |   """
 97 | 
 98 |   print("".join(["="] * 80)) # section-separating line
 99 |   print("select_top_score()")
100 | 
101 |   print('input_file = ', input_file)
102 |   print('output_file = ', output_file)
103 |   print('score_cutoff = ', score_cutoff)
104 | 
105 |   total_feature = 0
106 |   select_feature = 0
107 |   with open(input_file, 'r') as input_handle:
108 |     with open(output_file, 'w') as output_handle:
109 |       csv_reader = csv.DictReader(input_handle, delimiter='\t')
110 |       csv_writer = csv.DictWriter(output_handle, csv_reader.fieldnames, delimiter='\t')
111 |       csv_writer.writeheader()
112 |       for row in csv_reader:
113 |         total_feature += 1
114 |         predicted_score = float(row['predicted_score']) if row['predicted_score'] else -999
115 |         if predicted_score >= score_cutoff:
116 |           select_feature += 1
117 |           csv_writer.writerow(row)
118 |   print('total_feature = ', total_feature)
119 |   print('select_feature = ', select_feature)
120 |           
121 | 
def convert_I_to_L(input_file, output_file):
  """Replace every I (Isoleucine) by L (Leucine) in 'predicted_sequence'.

     `input_file` is a tab-delimited file with a header. Each row is copied
     to `output_file` with 'predicted_sequence' converted and the original
     sequence kept in a new last column 'before_I_to_L'.

     Usage:
       input_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.mass_corrected.deepnovo_denovo.top95"
       output_file = input_file + ".I_to_L"
       convert_I_to_L(input_file, output_file)
  """

  print("=" * 80) # section-separating line
  print("convert_I_to_L()")

  print('input_file = ', input_file)
  print('output_file = ', output_file)

  with open(input_file, 'r') as input_handle, open(output_file, 'w') as output_handle:
    csv_reader = csv.DictReader(input_handle, delimiter='\t')
    # extend the reader's own header list so that reader and writer agree
    column_list = csv_reader.fieldnames
    column_list.append('before_I_to_L')
    csv_writer = csv.DictWriter(output_handle, column_list, delimiter='\t')
    csv_writer.writeheader()
    for row in csv_reader:
      original_sequence = row['predicted_sequence']
      row.update(before_I_to_L=original_sequence,
                 predicted_sequence=original_sequence.replace('I', 'L'))
      csv_writer.writerow(row)
148 |           
149 | 
def compute_distance(predicted_sequence, consensus_sequence):
  """Return the Levenshtein distance between two amino acid sequences.

  Each modified residue name (e.g. 'M(Oxidation)') is first replaced by a
  single lower-case letter so that it counts as one symbol in the edit
  distance; then each sequence is joined into one string.

  Args:
    predicted_sequence: iterable of amino acid strings.
    consensus_sequence: iterable of amino acid strings.

  Returns:
    Levenshtein.distance() of the two simplified strings.
  """

  # modified residue -> one-letter stand-in
  substitution_list = [('C(Carbamidomethylation)', 'c'),
                       ('M(Oxidation)', 'm'),
                       ('N(Deamidation)', 'n'),
                       ('Q(Deamidation)', 'q')]
  # the stand-ins must not collide with existing vocabulary entries
  for _, symbol in substitution_list:
    assert symbol not in deepnovo_config.vocab_reverse

  def simplify(sequence):
    for modification, symbol in substitution_list:
      sequence = [aa.replace(modification, symbol) for aa in sequence]
    return ''.join(sequence)

  return Levenshtein.distance(simplify(predicted_sequence),
                              simplify(consensus_sequence))
169 | 
170 | 
def correct_by_consensus(input_file, output_file):
  """Correct de novo sequencing errors as following:
       group predicted sequences of the same mass together;
       vote the consensus sequence;
       replace the predicted by the consensus to correct errors: AB-BA, Q-AG, N-GG, etc.

     `input_file` is a tab-delimited file with a header that contains the
     columns 'predicted_sequence' (comma-separated amino acids) and
     'predicted_score'. `output_file` gets the same columns plus
     'before_consensus', which keeps the sequence as it was before correction.
     Rows with an empty 'predicted_sequence' are counted but not written.
     Rows are written group by group, not in input order.

     Usage:
       input_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L"
       output_file = input_file + ".consensus"
       correct_by_consensus(input_file, output_file)
  """

  print("".join(["="] * 80)) # section-separating line
  print("correct_by_consensus()")

  print('input_file = ', input_file)
  print('output_file = ', output_file)

  total_feature = 0
  empty_feature = 0
  mass_dict = {}
  with open(input_file, 'r') as input_handle:
    with open(output_file, 'w') as output_handle:
      csv_reader = csv.DictReader(input_handle, delimiter='\t')
      csv_reader.fieldnames.append('before_consensus')
      csv_writer = csv.DictWriter(output_handle, csv_reader.fieldnames, delimiter='\t')
      csv_writer.writeheader()

      # build the sequence mass dictionary
      # all sequences with the same mass are grouped together
      # (mass is scaled by deepnovo_config.KNAPSACK_AA_RESOLUTION and rounded
      # to an integer, so "same mass" means same rounded integer)
      for row in csv_reader:
        total_feature += 1
        predicted_sequence = row['predicted_sequence']
        # skip empty sequences that DeepNovo couldn't find a suitable candidate with the given mass
        if predicted_sequence == '':
          empty_feature += 1
          continue
        # save the original predicted sequence before correcting it later
        row['before_consensus'] = predicted_sequence

        predicted_sequence = predicted_sequence.split(',')
        predicted_score = float(row['predicted_score'])
        sequence_mass_index = int(round(compute_peptide_mass(predicted_sequence)
                                        * deepnovo_config.KNAPSACK_AA_RESOLUTION))
        feature = {'row': row,
                   'predicted_sequence': predicted_sequence,
                   'predicted_score': predicted_score}
        mass_dict.setdefault(sequence_mass_index, []).append(feature)
      # check if all sequences have been assigned
      assigned_feature = sum(len(group) for group in mass_dict.values())
      assert total_feature - empty_feature == assigned_feature

      # for each group of sequences of the same mass,
      # vote the consensus sequence;
      # calculate Levenshtein distance between each sequence and the consensus;
      # if 1 <= distance <= 2, replace the sequence by the consensus;
      # (distance = 2 examples: AB-BA, Q-AG, N-GG)
      # write to output.
      for group in mass_dict.values():
        if len(group) == 1:
          consensus_sequence = group[0]['predicted_sequence']
        else:
          # vote the consensus sequence
          # the easy way is to find the sequence with the highest score and frequency
          # (more complicated ways: De Bruijn graph, alignment)
          # each occurrence of a sequence adds 100*exp(score) to its vote
          consensus_candidate = {}
          for feature in group:
            candidate_key = ','.join(feature['predicted_sequence'])
            predicted_score_prob = 100*math.exp(feature['predicted_score'])
            if candidate_key in consensus_candidate:
              consensus_candidate[candidate_key] += predicted_score_prob
            else:
              consensus_candidate[candidate_key] = predicted_score_prob
          # iterate the dict directly: dict.iterkeys() does not exist in Python 3
          consensus_sequence = max(consensus_candidate, key=consensus_candidate.get)
          consensus_sequence = consensus_sequence.split(',')

        # calculate distance, correct sequence by the consensus, write to output
        for feature in group:
          distance = compute_distance(feature['predicted_sequence'], consensus_sequence)
          if 1 <= distance <= 2:
            feature['row']['predicted_sequence'] = ','.join(consensus_sequence)
          csv_writer.writerow(feature['row'])

      print('total_feature = ', total_feature)
      print('empty_feature = ', empty_feature)
      print('assigned_feature = ', assigned_feature)
262 |           
263 | 
def filter_by_minlen(input_file, output_file, minlen):
  """Drop the rows whose predicted sequence has fewer than `minlen` amino acids.

     `input_file` is a tab-delimited file with a header. The length of a row
     is the number of comma-separated items in its 'predicted_sequence'.
     The header and the rows of length >= `minlen` are written to
     `output_file`.

     Usage:
       minlen = 5
       input_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus"
       output_file = input_file + ".minlen" + str(minlen)
       filter_by_minlen(input_file, output_file, minlen)
  """

  print("=" * 80) # section-separating line
  print("filter_by_minlen()")
  print('input_file = ', input_file)
  print('output_file = ', output_file)
  print('minlen = ', minlen)

  total_feature = 0
  minlen_feature = 0
  with open(input_file, 'r') as input_handle, open(output_file, 'w') as output_handle:
    csv_reader = csv.DictReader(input_handle, delimiter='\t')
    csv_writer = csv.DictWriter(output_handle, csv_reader.fieldnames, delimiter='\t')
    csv_writer.writeheader()
    for total_feature, row in enumerate(csv_reader, 1):
      amino_acid_list = re.split(',', row['predicted_sequence'])
      if len(amino_acid_list) >= minlen:
        csv_writer.writerow(row)
        minlen_feature += 1
  # every row is either kept or removed
  removed_feature = total_feature - minlen_feature
  print('total_feature = ', total_feature)
  print('minlen_feature = ', minlen_feature)
  print('removed_feature = ', removed_feature)
299 |           
300 | 
def database_lookup(input_fasta_file, input_denovo_file, output_file, split_char, col_sequence):
  """Label each de novo row as "db" or "denovo" by lookup in a fasta database.

  The first line of `input_denovo_file` is copied to `output_file` as is.
  Every following line is split by `split_char`, the last split item is
  dropped, and the item at `col_sequence` is taken as the predicted sequence.
  Commas are removed from it and 'C(Carbamidomethylation)' is replaced by
  'C'. The row is labeled "db" if the result is a substring of any protein
  sequence in `input_fasta_file`, otherwise "denovo". The row is written
  tab-joined with the label appended as the last column.

  Args:
    input_fasta_file: path of the protein database in fasta format.
    input_denovo_file: path of the de novo result file, with a header line.
    output_file: path of the labeled output file.
    split_char: regular expression given to re.split() for each line;
      presumably it should also match the newline so that the dropped last
      item is the empty string after it (e.g. '\\t|\\n').
    col_sequence: index of the predicted sequence among the split items.
  """

  print("=" * 80) # section-separating line
  print("database_lookup()")

  print('input_fasta_file = ', input_fasta_file)
  print('input_denovo_file = ', input_denovo_file)
  print('output_file = ', output_file)

  with open(input_fasta_file, 'r') as input_fasta_handle:
    record_list = list(SeqIO.parse(input_fasta_handle, "fasta"))
    print("Number of protein sequences: ", len(record_list))

  total_count = 0
  db_count = 0
  denovo_count = 0
  with open(input_denovo_file, 'r') as input_denovo_handle, open(output_file, 'w') as output_handle:
    # header
    output_handle.write(input_denovo_handle.readline())
    for line in input_denovo_handle:
      total_count += 1
      field_list = re.split(split_char, line)[:-1] # exclude the last empty ""
      peptide = field_list[col_sequence].replace(',', '')
      peptide = peptide.replace('C(Carbamidomethylation)', 'C')
      # stop scanning the database at the first protein that contains the peptide
      if any(peptide in record.seq for record in record_list):
        db_count += 1
        field_list.append("db")
      else:
        denovo_count += 1
        field_list.append("denovo")
      output_handle.write('\t'.join(field_list) + "\n")
  print('total_count = ', total_count)
  print('db_count = ', db_count)
  print('denovo_count = ', denovo_count)
344 | 
345 | # ~ input_fasta_file = "data/uniprot_sprot.human.fasta"
346 | # ~ input_denovo_file = "data.training/bassani.hla.2018_10_18.correct_mass_shift/unidentified_features.csv.deepnovo_denovo.top95"
347 | # ~ output_file = input_denovo_file + ".lookup"
348 | # ~ split_char = '\t|\n'
349 | # ~ col_sequence = 2
350 | # ~ database_lookup(input_fasta_file, input_denovo_file, output_file, split_char, col_sequence)
351 | 
352 | 
def select_top_k(input_file, output_file, top_k, split_char, col_score):
  """Write the header and the `top_k` highest-scoring lines of a file.

  Args:
    input_file: path of a text file whose first line is a header.
    output_file: path of the output file.
    top_k: number of data lines to keep.
    split_char: regular expression given to re.split() for each line.
    col_score: index of the score among the split items; an empty score
      is treated as -999.

  Lines are written unchanged, sorted by score from high to low; lines with
  equal scores keep their input order.
  """

  print("=" * 80) # section-separating line
  print("select_top_k()")

  print('input_file = ', input_file)
  print('output_file = ', output_file)
  print('top_k = ', top_k)

  with open(input_file, 'r') as input_handle, open(output_file, 'w') as output_handle:
    # header
    output_handle.write(input_handle.readline())
    scored_line_list = []
    for line in input_handle:
      score_text = re.split(split_char, line)[col_score]
      score = float(score_text) if score_text else -999
      scored_line_list.append((score, line))
    # sort on the score only, so that ties stay in input order
    scored_line_list.sort(key=lambda entry: entry[0], reverse=True)
    for _, line in scored_line_list[:top_k]:
      output_handle.write(line)
378 |           
379 | #~ top_k = 7673
380 | #~ split_char = '\t|\n'
381 | #~ col_score = deepnovo_config.pcol_score_max
382 | #~ input_file = "data.training/dia.pecan.plasma.2018_03_29/testing.unlabeled.csv.deepnovo_denovo"
383 | #~ output_file = input_file + ".topk"
384 | #~ select_top_k(input_file, output_file, top_k, split_char, col_score)
385 | #~ split_char = ',|\n'
386 | #~ col_score = 5
387 | #~ input_file = "data.training/dia.urine.2018_03_29/peaks.denovo.csv"
388 | 
389 | 
390 | # filter features of single-feature (DDA-like) scan or multi-feature scan (DIA)
def filter_multifeature(input_file):
  """Split a feature file by how many features share the scans of each feature.

  `input_file` is a comma-separated feature file with a header line. The
  feature id is read at column deepnovo_config.col_feature_id and the
  ';'-separated scan list at column deepnovo_config.col_scan_list.

  For each feature, the number of features that reference each of its scans
  is averaged over its scans. Features with an average >= 2 are written to
  input_file + '.2fea', all others to input_file + '.1fea'. Both output
  files start with the header line of the input.

  Args:
    input_file: path of the feature file.
  """

  print("".join(["="] * 80)) # section-separating line
  print("filter_multifeature()")

  print('input_file = ', input_file)
  output_file_1 = input_file + '.1fea'
  output_file_2 = input_file + '.2fea'
  print('output_file_1 = ', output_file_1)
  print('output_file_2 = ', output_file_2)

  # read feature and record feature_dict, scan_dict
  with open(input_file, 'r') as input_handle:
    # header
    header_line = input_handle.readline()
    col_feature_id = deepnovo_config.col_feature_id
    col_scan_list = deepnovo_config.col_scan_list
    feature_dict = {}
    scan_dict = {}
    # read feature and record feature_dict, scan_dict
    for line in input_handle:
      line_split = re.split(',|\n', line)
      feature_id = line_split[col_feature_id]
      scan_list = re.split(';', line_split[col_scan_list])
      feature_dict[feature_id] = {}
      feature_dict[feature_id]['line'] = line
      feature_dict[feature_id]['scan_list'] = scan_list
      for scan_id in scan_list:
        if scan_id in scan_dict:
          scan_dict[scan_id]['feature_list'].append(feature_id)
        else:
          scan_dict[scan_id] = {}
          scan_dict[scan_id]['feature_list'] = [feature_id]

  # dict.values()/items() instead of itervalues()/iteritems():
  # the iter* methods exist only in Python 2
  print('Total scan count = ', len(scan_dict))
  print('  Scan with single-feature = ',
        sum(1 for scan in scan_dict.values() if len(scan['feature_list']) == 1))
  print('  Scan with multi-feature = ',
        sum(1 for scan in scan_dict.values() if len(scan['feature_list']) >= 2))

  # write feature to separate files,
  # depending on its scan is single-feature (DDA-like) or multi-feature (DIA)
  single_feature_count = 0
  multi_feature_count = 0
  with open(output_file_1, 'w') as output_handle_1:
    with open(output_file_2, 'w') as output_handle_2:
      # header
      print(header_line, file=output_handle_1, end="")
      print(header_line, file=output_handle_2, end="")
      for feature_id, feature in feature_dict.items():
        # average feature count of scans
        # (the decision uses this average, not "at least 1 multi-feature scan")
        feature_count = sum(len(scan_dict[scan_id]['feature_list'])
                            for scan_id in feature['scan_list'])
        feature_count /= float(len(feature['scan_list']))
        if feature_count >= 2:
          output_handle = output_handle_2
          multi_feature_count += 1
        else:
          output_handle = output_handle_1
          single_feature_count += 1
        print(feature['line'], file=output_handle, end="")

  print('Total feature count = ', len(feature_dict))
  print('Feature with single-feature scans = ', single_feature_count)
  print('Feature with at least 1 multi-feature scans = ', multi_feature_count)
464 | 
465 | #~ input_file = "data.training/dia.urine.2018_03_29/testing_12.feature.csv"
466 | #~ filter_multifeature(input_file)
467 | 
468 | 


--------------------------------------------------------------------------------
/aa_workflow.py:
--------------------------------------------------------------------------------
  1 | from __future__ import absolute_import
  2 | from __future__ import division
  3 | from __future__ import print_function
  4 | 
  5 | import os
  6 | 
  7 | from deepnovo_preprocess import *
  8 | from deepnovo_postprocess import *
  9 | import aa_workflow_step_4_2
 10 | import aa_workflow_step_5
 11 | 
 12 | 
# Settings of the example workflow below; change them to your own data and model folders.
# NOTE(review): data_fasta_dir and patient_id are not used in the part of the
# workflow reviewed here -- presumably they are used by later steps; confirm.
data_fasta_dir = "data.fasta/"
patient_id = "Mel16"
# folder that holds the exported "export_<i>.csv" / "export_<i>.mgf" files and all derived feature files
data_training_dir = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/"
# number of exported csv/mgf pairs, indexed 0 .. num_fractions-1
num_fractions = 11
# passed to deepnovo_main.py as --train_dir
model_dir = "train.mel_16.class_1" # before training, create this empty folder at the same level as Python scripts.
 18 | 
 19 | 
 20 | # ================================================================================
 21 | # Workflow of neoantigen discovery by personalized de novo sequencing.
 22 | # ================================================================================
 23 | 
 24 | # Step-by-step instructions based on the following example dataset:
 25 | 
 26 | #   Patient Mel-16 (Bassani-Sternberg et al., Nature Communications, 2016)
 27 | #   HLA class 1: 12 raw files, 1 failed to run PEAKS
 28 | 
 29 | #     20141212_QEp7_MiBa_SA_HLA-I-p_MM16_1_A
 30 | #     20141212_QEp7_MiBa_SA_HLA-I-p_MM16_1_B
 31 | #     20141212_QEp7_MiBa_SA_HLA-I-p_MM16_2_A
 32 | #     20141212_QEp7_MiBa_SA_HLA-I-p_MM16_2_B
 33 | #     20141212_QEp7_MiBa_SA_HLA-I-p_MM16_3_A
 34 | #     20141212_QEp7_MiBa_SA_HLA-I-p_MM16_3_B
 35 | #     20141213_QEp7_MiBa_SA_HLA-I-p_MM16_1_A_1
 36 | #     20141213_QEp7_MiBa_SA_HLA-I-p_MM16_1_B_1, failed
 37 | #     20141213_QEp7_MiBa_SA_HLA-I-p_MM16_2_A_1
 38 | #     20141213_QEp7_MiBa_SA_HLA-I-p_MM16_2_B_1
 39 | #     20141213_QEp7_MiBa_SA_HLA-I-p_MM16_3_A_1
 40 | #     20141213_QEp7_MiBa_SA_HLA-I-p_MM16_3_B_1
 41 | 
 42 | 
 43 | 
 44 | 
 45 | # ================================================================================
 46 | # Step 1: Build the immunopeptidome of the patient.
 47 | # ================================================================================
 48 | 
 49 | # This step 1 took about ?? hours on a laptop with 4 CPU cores i7, 16 GB memory
 50 | 
 51 | # ================================================================================
 52 | # Step 1.1: Run PEAKS X DB search on the raw files with the following parameters:
 53 | # ================================================================================
 54 | 
 55 | #     Enzyme: None
 56 | #     Instrument: Orbi-Orbi
 57 | #     Fragment: HCD
 58 | #     Acquisition: DDA
 59 | 
 60 | #     Parent Mass Error Tolerance: 15.0 ppm
 61 | #     Fragment Mass Error Tolerance: 0.05 Da
 62 | #     Precursor Mass Search Type: monoisotopic
 63 | #     Enzyme: None
 64 | #     Digest Mode: Unspecific
 65 | #     Max Missed Cleavages: 100
 66 | #     Variable Modifications:
 67 | #       Oxidation (M): 15.99
 68 | #       Deamidation (NQ): 0.98
 69 | #     Max Variable PTM Per Peptide: 3
 70 | #     Database: uniprot_sprot.human
 71 | #     Taxon: All
 72 | #     Contaminant Database: contaminants_maxquant
 73 | #     Searched Entry: 20488
 74 | #     FDR Estimation: Enabled
 75 | #     Merge Options: no merge
 76 | #     Precursor Options: corrected
 77 | #     Charge Options: no correction
 78 | #     Filter Options: no filter
 79 | #     Process: true
 80 | #     Associate chimera: no
 81 | 
 82 | 
 83 | 
 84 | 
 85 | # ================================================================================
 86 | # Step 1.2: Set FDR 1.0%.
 87 | # ================================================================================
 88 | 
 89 | # The number of MS/MS spectra is "694565", the number of peptide-spectrum matches (PSMs) is "207332", the number of peptide sequences is "26594".
 90 | 
 91 | 
 92 | 
 93 | 
 94 | # ================================================================================
 95 | # Step 1.3: Right-click on the DB search node "??", select "Deep Denovo Export".
 96 | # ================================================================================
 97 | 
 98 | # We will get the following 11 pairs of csv and mgf files in the PEAKS project folder:
 99 | 
100 | #       export_0.csv, export_0.mgf
101 | #       export_1.csv, export_1.mgf
102 | #       export_2.csv, export_2.mgf
103 | #       export_3.csv, export_3.mgf
104 | #       export_4.csv, export_4.mgf
105 | #       export_5.csv, export_5.mgf
106 | #       export_6.csv, export_6.mgf
107 | #       export_7.csv, export_7.mgf
108 | #       export_8.csv, export_8.mgf
109 | #       export_9.csv, export_9.mgf
110 | #       export_10.csv, export_10.mgf
111 | 
112 | 
113 | 
114 | 
115 | # ================================================================================
116 | # Step 2: Train personalized DeepNovo model.
117 | # ================================================================================
118 | 
119 | # This step 2 took about 12 hours on a server with GPU Titan X, 32 GB memory
120 | 
121 | # Note that you will need to specify the paths to your own data and model folders when you run the Python scripts. The following scripts just show examples of my data and model folders.
122 | 
123 | # ================================================================================
124 | # Step 2.1: Prepare the training data.
125 | # ================================================================================
126 | 
127 | # Run merge_mgf_file() and merge_feature_file()
128 | # ======================= UNCOMMENT and RUN ======================================
129 | # ~ folder_path = data_training_dir
130 | # ~ fraction_list = range(0, num_fractions)
131 | # ~ merge_mgf_file(
132 |     # ~ input_file_list=[folder_path + "export_" + str(i) + ".mgf" for i in fraction_list],
133 |     # ~ fraction_list=fraction_list,
134 |     # ~ output_file=folder_path + "spectrum.mgf")
135 | # ~ merge_feature_file(
136 |     # ~ input_file_list=[folder_path + "export_" + str(i) + ".csv" for i in fraction_list],
137 |     # ~ fraction_list=fraction_list,
138 |     # ~ output_file=folder_path + "feature.csv")
139 | # ================================================================================
140 | # We will get two output files in the same folder: "spectrum.mgf" and "feature.csv".
141 | # Both functions also report the number of entries that have been processed: "counter = 694565".
142 | # That number should be the same as the total number of MS/MS spectra from the raw files.
143 | 
144 | # Run split_feature_unlabel()
145 | # ======================= UNCOMMENT and RUN ======================================
146 | # ~ input_feature_file = data_training_dir + "feature.csv"
147 | # ~ split_feature_unlabel(input_feature_file)
148 | # ================================================================================
149 | # It will split the "feature.csv" into 2 files: "feature.csv.labeled" and "feature.csv.unlabeled".
150 | # It also reports the number of labeled and unlabeled features: "num_labeled = 207332" and "num_unlabeled = 487233".
151 | # Note that "207332" is also the number of PSMs reported at FDR 1.0% in Step 1.
152 | 
153 | # Run calculate_mass_shift_ppm() and correct_mass_shift_ppm()
154 | # ======================= UNCOMMENT and RUN ======================================
155 | # ~ labeled_feature_file = data_training_dir + "feature.csv.labeled"
156 | # ~ ppm = calculate_mass_shift_ppm(labeled_feature_file)
157 | # ~ input_feature_file = data_training_dir + "feature.csv.labeled"
158 | # ~ correct_mass_shift_ppm(input_feature_file, ppm)
159 | # ~ input_feature_file = data_training_dir + "feature.csv"
160 | # ~ correct_mass_shift_ppm(input_feature_file, ppm)
161 | # ================================================================================
162 | # The mass shift is calculated from "feature.csv.labeled".
163 | # The mass shift ppm (part per million) is reported as: "mean_precursor_ppm = 7.07514819678".
164 | # Then mass is corrected for 2 files: "feature.csv.labeled.mass_corrected" and "feature.csv.mass_corrected".
165 | 
166 | # Run split_feature_training_noshare()
167 | # ======================= UNCOMMENT and RUN ======================================
168 | # ~ input_feature_file = data_training_dir + "feature.csv.labeled.mass_corrected"
169 | # ~ proportion = [0.90, 0.05, 0.05]
170 | # ~ split_feature_training_noshare(input_feature_file, proportion)
171 | # ================================================================================
172 | # It will split "feature.csv.labeled.mass_corrected" into train/valid/test sets with "proportion = [0.9, 0.05, 0.05]".
173 | # Those 3 sets do not share common peptides.
174 | # Their sizes are also reported.
175 | #   "num_total = 207332"
176 | #   "num_unique = 26656"
177 | #   "num_train = 185823"
178 | #   "num_valid = 10900"
179 | #   "num_test = 10609"
180 | 
181 | 
182 | 
183 | 
184 | # ================================================================================
185 | # Step 2.2: Training DeepNovo model.
186 | # ================================================================================
187 | 
188 | # Run DeepNovo training
189 | # The training will stop after 10 epochs. The model with the best performance on the valid set, "ckpt-16200", is saved in the model folder "train.mel_16.class_1".
190 | # ======================= UNCOMMENT and RUN ======================================
191 | # ~ command = ["LD_PRELOAD=\"/usr/lib/libtcmalloc.so\" /usr/bin/time -v python deepnovo_main.py --train"]
192 | # ~ command += ["--train_dir", model_dir]
193 | # ~ command += ["--train_spectrum", data_training_dir + "spectrum.mgf"]
194 | # ~ command += ["--train_feature", data_training_dir + "feature.csv.labeled.mass_corrected.train.noshare"]
195 | # ~ command += ["--valid_spectrum", data_training_dir + "spectrum.mgf"]
196 | # ~ command += ["--valid_feature", data_training_dir + "feature.csv.labeled.mass_corrected.valid.noshare"]
197 | # ~ command += ["--reset_step"]
198 | # ~ command = " ".join(command)
199 | # ~ print(command)
200 | # ~ os.system(command)
201 | # ================================================================================
202 | 
203 | # Run DeepNovo testing
204 | # ======================= UNCOMMENT and RUN ======================================
205 | # ~ command = ["LD_PRELOAD=\"/usr/lib/libtcmalloc.so\" /usr/bin/time -v python deepnovo_main.py --test_true_feeding"]
206 | # ~ command += ["--train_dir", model_dir]
207 | # ~ command += ["--test_spectrum", data_training_dir + "spectrum.mgf"]
208 | # ~ command += ["--test_feature", data_training_dir + "feature.csv.labeled.mass_corrected.test.noshare"]
209 | # ~ command = " ".join(command)
210 | # ~ print(command)
211 | # ~ os.system(command)
212 | # ~ command = ["LD_PRELOAD=\"/usr/lib/libtcmalloc.so\" /usr/bin/time -v python deepnovo_main.py --search_denovo"]
213 | # ~ command += ["--train_dir", model_dir]
214 | # ~ command += ["--denovo_spectrum", data_training_dir + "spectrum.mgf"]
215 | # ~ command += ["--denovo_feature", data_training_dir + "feature.csv.labeled.mass_corrected.test.noshare"]
216 | # ~ command = " ".join(command)
217 | # ~ print(command)
218 | # ~ os.system(command)
219 | # ~ command = ["LD_PRELOAD=\"/usr/lib/libtcmalloc.so\" /usr/bin/time -v python deepnovo_main.py --test"]
220 | # ~ command += ["--target_file", data_training_dir + "feature.csv.labeled.mass_corrected.test.noshare"]
221 | # ~ command += ["--predicted_file", data_training_dir + "feature.csv.labeled.mass_corrected.test.noshare.deepnovo_denovo"]
222 | # ~ command = " ".join(command)
223 | # ~ print(command)
224 | # ~ os.system(command)
225 | # ================================================================================
226 | # The testing accuracy at the amino acid (AA) and peptide levels will be reported as follows:
227 | #   "precision_AA_mass_db  = 0.8425"
228 | #   "precision_peptide_mass_db  = 0.6430"
229 | 
230 | 
231 | 
232 | 
233 | # ================================================================================
234 | # Step 3: Perform personalized de novo sequencing with DeepNovo.
235 | # ================================================================================
236 | 
237 | # This step 3 took about 5 hours on a server with GPU Titan X, 32 GB memory
238 | 
239 | # Run DeepNovo de novo sequencing on all features (labeled and unlabeled)
240 | # ======================= UNCOMMENT and RUN ======================================
241 | # ~ command = ["LD_PRELOAD=\"/usr/lib/libtcmalloc.so\" /usr/bin/time -v python deepnovo_main.py --search_denovo"]
242 | # ~ command += ["--train_dir", model_dir]
243 | # ~ command += ["--denovo_spectrum", data_training_dir + "spectrum.mgf"]
244 | # ~ command += ["--denovo_feature", data_training_dir + "feature.csv.mass_corrected"]
245 | # ~ command = " ".join(command)
246 | # ~ print(command)
247 | # ~ os.system(command)
248 | # ================================================================================
249 | # The de novo results will be written to the file "feature.csv.mass_corrected.deepnovo_denovo".
250 | # The tool will also report the number of features that have been processed:
251 | #   "Total spectra: 694565"
252 | #     "read: 690354"
253 | #     "skipped: 4211"
254 | #       "by mass: 4211"
255 | 
256 | 
257 | 
258 | 
259 | # ================================================================================
260 | # Step 4: Quality control.
261 | # ================================================================================
262 | 
263 | # ================================================================================
264 | # Step 4.1: Post-process de novo results to improve their accuracy. 
265 | # ================================================================================
266 | 
267 | # Run select_top_score()
268 | # This script selects a threshold of de novo confidence scores and uses it to filter de novo results.
269 | # The score threshold is calculated based on a 95% cutoff of the testing accuracy obtained at the end of Step 2 above.
270 | # ======================= UNCOMMENT and RUN ======================================
271 | # ~ accuracy_cutoff = 0.95
272 | # ~ accuracy_file = data_training_dir + "feature.csv.labeled.mass_corrected.test.noshare.deepnovo_denovo.accuracy"
273 | # ~ score_cutoff = find_score_cutoff(accuracy_file, accuracy_cutoff)
274 | # ~ input_file = data_training_dir + "feature.csv.mass_corrected.deepnovo_denovo"
275 | # ~ output_file = input_file + ".top95"
276 | # ~ select_top_score(input_file, output_file, score_cutoff)
277 | # ================================================================================
278 | # After this step we'll get the file "feature.csv.mass_corrected.deepnovo_denovo.top95".
279 | # The score cutoff and the number of selected features will also be reported:
280 | #   "score_cutoff =  -0.5"
281 | #   "total_feature =  690354"
282 | #   "select_feature =  233589"
283 | 
284 | # Run convert_I_to_L()
285 | # This script converts I (Isoleucine) to L (Leucine) in all de novo peptides, because de novo sequencing is not able to distinguish them.
286 | # ======================= UNCOMMENT and RUN ======================================
287 | # ~ input_file = data_training_dir + "feature.csv.mass_corrected.deepnovo_denovo.top95"
288 | # ~ output_file = input_file + ".I_to_L"
289 | # ~ convert_I_to_L(input_file, output_file)
290 | # ================================================================================
291 | 
292 | # Run correct_by_consensus()
293 | # This script corrects de novo sequencing errors by grouping predicted sequences of the same mass together and voting the consensus sequence.
294 | # ======================= UNCOMMENT and RUN ======================================
295 | # ~ input_file = data_training_dir + "feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L"
296 | # ~ output_file = input_file + ".consensus"
297 | # ~ correct_by_consensus(input_file, output_file)
298 | # ================================================================================
299 | 
300 | # Run filter_by_minlen()
301 | # This script filters out sequences of length less than 5 amino acids.
302 | # ======================= UNCOMMENT and RUN ======================================
303 | # ~ minlen = 5
304 | # ~ input_file = data_training_dir + "feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus"
305 | # ~ output_file = input_file + ".minlen" + str(minlen)
306 | # ~ filter_by_minlen(input_file, output_file, minlen)
307 | # ================================================================================
308 | # The numbers of features will be reported as:
309 | #   "total_feature =  233589"
310 | #   "minlen_feature =  223507"
311 | #   "removed_feature =  10082"
312 | 
313 | # Up to this step, we get the following file: 
314 | #   "feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus.minlen5"
315 | # We test its accuracy against the test set:
316 | # Run DeepNovo testing
317 | # ======================= UNCOMMENT and RUN ======================================
318 | # ~ command = ["LD_PRELOAD=\"/usr/lib/libtcmalloc.so\" /usr/bin/time -v python deepnovo_main.py --test"]
319 | # ~ command += ["--target_file", data_training_dir + "feature.csv.labeled.mass_corrected.test.noshare"]
320 | # ~ command += ["--predicted_file", data_training_dir + "feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus.minlen5"]
321 | # ~ command = " ".join(command)
322 | # ~ print(command)
323 | # ~ os.system(command)
324 | # ================================================================================
325 | # We get these results:
326 | #   "precision_AA_mass_db  = 0.9530"
327 | #   "precision_peptide_mass_db  = 0.8441"
328 | 
329 | # Repeat the same testing but now against all labeled features:
330 | # Run DeepNovo testing
331 | # ====================== UNCOMMENT and RUN =======================================
332 | # ~ command = ["LD_PRELOAD=\"/usr/lib/libtcmalloc.so\" /usr/bin/time -v python deepnovo_main.py --test"]
333 | # ~ command += ["--target_file", data_training_dir + "feature.csv.labeled.mass_corrected"]
334 | # ~ command += ["--predicted_file", data_training_dir + "feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus.minlen5"]
335 | # ~ command = " ".join(command)
336 | # ~ print(command)
337 | # ~ os.system(command)
338 | # ================================================================================
339 | # We get these results:
340 | #   "precision_AA_mass_db  = 0.9797"
341 | #   "precision_peptide_mass_db  = 0.9371"
342 | # Note that these accuracy results look better than those against the test set because most of the labeled features were used for training the model, whereas the test set was not.
343 | # The number of de novo only features is also reported as
344 | #   "predicted_only: 68721"
345 | # and they are written to the file 
346 | #   "feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus.minlen5.denovo_only"
347 | 
348 | 
349 | 
350 | 
351 | # ================================================================================
352 | # Step 4.2: Run second round of PEAKS X DB search against the list of database and de novo peptides. 
353 | # ================================================================================
354 | 
355 | # Before running PEAKS, we need to combine database and de novo peptides into a list.
356 | # This script will select unique de novo peptides, filter out those that belong to the human Swiss-Prot protein database, and combine the remaining de novo peptides and the database peptides identified from Step 1 into a fasta file.
357 | # ======================= UNCOMMENT and RUN ======================================
358 | # ~ aa_workflow_step_4_2.preprocess(
359 |     # ~ denovo_file=data_training_dir + "feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus.minlen5.denovo_only",
360 |     # ~ db_fasta_file=data_fasta_dir + "uniprot_sprot.human.plus_contaminants.fasta",
361 |     # ~ labeled_feature_file=data_training_dir + "feature.csv.labeled.mass_corrected",
362 |     # ~ peptide_list_fasta=data_training_dir + "aa_workflow.step_4.peptide_list.fasta")
363 | # ================================================================================
364 | # The numbers of de novo and database peptides are reported as follows:
365 | #   "Number of top-scoring denovo peptides: 17318"
366 | #   "num_db_peptides = 25274"
367 | #   "num_denovo_peptides = 6444" (not in database)
368 | 
369 | # Run PEAKS X DB search as follows:
370 | #   Select the DENOVO node result from Step 1.1, and select PEAKS DB search;
371 | #   Select option "No digestion" for "Digest mode";
372 | #   Select the fasta file "aa_workflow.step_4.peptide_list.fasta" as the only database, no contaminant;
373 | #   Leave other settings the same as in Step 1.1.
374 | # Set FDR 1.0% and export the "DB search psm.csv" file, rename it to "aa_workflow.step_4.psm.csv".
375 | 
376 | # Extract de novo peptides from the PSMs of PEAKS X DB search round 2.
377 | # ======================= UNCOMMENT and RUN ======================================
378 | # ~ aa_workflow_step_4_2.postprocess(
379 |     # ~ psm_file = data_training_dir + "aa_workflow.step_4.psm.csv",
380 |     # ~ output_denovo_peptide_file = data_training_dir + "aa_workflow.step_4.output_peptide_list")
381 | # ================================================================================
382 | # The number of de novo peptides is reported as follows:
383 | #   "num_denovo_peptides = 1259"
384 | 
385 | 
386 | 
387 | 
388 | # ================================================================================
389 | # Step 5: Neoantigen selection. 
390 | # ================================================================================
391 | # ~ aa_workflow_step_5.step_5(
392 |     # ~ psm_file=data_training_dir + "aa_workflow.step_4.psm.csv",
393 |     # ~ netmhc_file=data_training_dir + "aa_workflow.step_5.netmhcpan.csv",
394 |     # ~ immunogenicity_file=data_training_dir + "aa_workflow.step_5.immunogenicity.csv",
395 |     # ~ db_fasta_file=data_fasta_dir + "uniprot_sprot.human.plus_contaminants.fasta",
396 |     # ~ labeled_feature_file=data_training_dir + "feature.csv.labeled",
397 |     # ~ snp_file=data_training_dir + "aa_workflow.step_5.supp_data5_snp.csv",
398 |     # ~ snp_enst_fasta=data_training_dir + "aa_workflow.step_5.supp_data5_snp_enst.fasta",
399 |     # ~ snp_sample_id=patient_id,
400 |     # ~ output_neoantigen_criteria=data_training_dir + "aa_workflow.step_5.output_neoantigen_criteria.csv",
401 |     # ~ output_protein_mutation=data_training_dir + "aa_workflow.step_5.protein_mutation.csv")
402 | 
403 | 
404 | 
405 | 
406 | 
407 | 
408 | 
409 | 
410 | 
411 | 
412 | 
413 | 
414 | 
415 | 
416 | 


--------------------------------------------------------------------------------
/deepnovo_worker_db.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Hieu Tran. All Rights Reserved.
  2 | #
  3 | # DeepNovo is publicly available for non-commercial uses.
  4 | # ==============================================================================
  5 | 
  6 | """TODO(nh2tran): docstring."""
  7 | 
  8 | from __future__ import absolute_import
  9 | from __future__ import division
 10 | from __future__ import print_function
 11 | 
 12 | import sys
 13 | import time
 14 | import re
 15 | from random import shuffle
 16 | from itertools import combinations
 17 | 
 18 | from Bio import SeqIO
 19 | from pyteomics import parser
 20 | import numpy as np
 21 | import tensorflow as tf
 22 | 
 23 | import deepnovo_config
 24 | from deepnovo_cython_modules import get_candidate_intensity
 25 | 
 26 | 
 27 | class WorkerDB(object):
 28 |   """TODO(nh2tran): docstring.
 29 |      This class contains the database search module.
 30 |      We use "db" for "database".
 31 |      We use "pepmod" to refer to a modified version of a "peptide"
 32 |   """
 33 | 
 34 | 
 35 |   def __init__(self, db_fasta_file):
 36 |     """TODO(nh2tran): docstring."""
 37 | 
 38 |     print("".join(["="] * 80)) # section-separating line
 39 |     print("WorkerDB: __init__()")
 40 | 
 41 |     # search_db and search_hybrid could use different fasta files for their
 42 |     #   worker_db objects. So it's better to have fasta files as input.
 43 |     self.db_fasta_file = db_fasta_file
 44 | 
 45 |     # we currently use deepnovo_config to store both const & settings
 46 |     # the settings should be shown in __init__() to keep track carefully
 47 |     # input info to build a db
 48 |     self.cleavage_rule = deepnovo_config.cleavage_rule
 49 |     self.num_missed_cleavage = deepnovo_config.num_missed_cleavage
 50 |     self.fixed_mod_list = deepnovo_config.fixed_mod_list
 51 |     self.var_mod_list = deepnovo_config.var_mod_list
 52 |     self.num_mod = deepnovo_config.num_mod
 53 |     self.precursor_mass_tolerance = deepnovo_config.precursor_mass_tolerance
 54 |     self.precursor_mass_ppm = deepnovo_config.precursor_mass_ppm
 55 |     self.decoy = deepnovo_config.FLAGS.decoy
 56 |     print("db_fasta_file = {0:s}".format(self.db_fasta_file))
 57 |     print("cleavage_rule = {0:s}".format(self.cleavage_rule))
 58 |     print("num_missed_cleavage = {0:d}".format(self.num_missed_cleavage))
 59 |     print("fixed_mod_list = {0}".format(self.fixed_mod_list))
 60 |     print("var_mod_list = {0}".format(self.var_mod_list))
 61 |     print("num_mod = {0}".format(self.num_mod))
 62 |     print("precursor_mass_tolerance = {0:.4f}".format(self.precursor_mass_tolerance))
 63 |     print("precursor_mass_ppm = {0:.6f}".format(self.precursor_mass_ppm))
 64 | 
 65 |     # data structure to store a db
 66 |     # all attributes will be built/loaded by build_db()
 67 |     self.peptide_count = None
 68 |     self.peptide_list = None
 69 |     self.peptide_mass_array = None
 70 |     self.pepmod_maxmass_array = None
 71 | 
 72 |     self.test_time = 0.0
 73 | 
 74 | 
 75 |   def build_db(self):
 76 |     """TODO(nh2tran): docstring."""
 77 | 
 78 |     print("".join(["="] * 80)) # section-separating line
 79 |     print("WorkerDB: build_db()")
 80 | 
 81 |     # parse the input fasta file into a list of sequences
 82 |     # more about SeqIO and SeqRecord: http://biopython.org/wiki/SeqRecord
 83 |     with open(self.db_fasta_file, "r") as handle:
 84 |       record_iterator = SeqIO.parse(handle, "fasta")
 85 |       record_list = list(record_iterator)
 86 |       print("Number of protein sequences: {0:d}".format(len(record_list)))
 87 | 
 88 |     # cleave protein sequences into a list of unique peptides
 89 |     # more about pyteomics.parser.cleave and cleavage rules:
 90 |     # https://pythonhosted.org/pyteomics/api/parser.html
 91 | 
 92 |     # create a peptide to protein accession id map.
 93 |     peptide_2_protein_id = {}
 94 |     for record in record_list:
 95 |       protein_sequence = str(record.seq)
 96 |       protein_id = str(record.id)
 97 |       cleaved_peptide_set = parser.cleave(
 98 |         sequence=protein_sequence,
 99 |         rule=parser.expasy_rules[self.cleavage_rule],
100 |         missed_cleavages=self.num_missed_cleavage)
101 |       for peptide in cleaved_peptide_set:
102 |         if any(x in peptide for x in ['X', 'B', 'U', 'Z']):
103 |           # skip peptides with undetermined amino acid ['X', 'B', 'U', 'Z']
104 |           continue
105 |         if peptide not in peptide_2_protein_id:
106 |           peptide_2_protein_id[peptide] = {protein_id}
107 |         else:
108 |           peptide_2_protein_id[peptide].add(protein_id)
109 | 
110 |     peptide_list = [list(peptide) for peptide in peptide_2_protein_id.keys()]
111 |     peptide_list = [[x + 'mod' if x in self.fixed_mod_list else x for x in peptide] for peptide in peptide_list ]
112 | 
113 |     peptide_count = len(peptide_list)
114 |     print("Number of peptides: {0:d}".format(peptide_count))
115 | 
116 |     # for each peptide, find the mass and the max modification mass
117 |     peptide_mass_array = np.zeros(peptide_count)
118 |     pepmod_maxmass_array = np.zeros(peptide_count)
119 |     for index, peptide in enumerate(peptide_list):
120 |       peptide_mass_array[index] = self._compute_peptide_mass(peptide)
121 |       pepmod = [x + 'mod' if x in self.var_mod_list else x for x in peptide]
122 |       pepmod_maxmass_array[index] = self._compute_peptide_mass(pepmod)
123 | 
124 |     self.peptide_count = peptide_count
125 |     self.peptide_list = peptide_list
126 |     self.peptide_mass_array = peptide_mass_array
127 |     self.pepmod_maxmass_array = pepmod_maxmass_array
128 |     self.peptide_2_protein_id = peptide_2_protein_id
129 | 
130 | 
131 |   def search_db(self, model, worker_io, predicted_denovo_list=None):
132 |     """TODO(nh2tran): docstring."""
133 | 
134 |     print("".join(["="] * 80)) # section-separating line
135 |     print("WorkerDB: search_db()")
136 | 
137 |     # move load/build db here?
138 | 
139 |     # if provided, convert predicted_denovo_list to dictionary for easy lookup
140 |     denovo_peptide_dict = None
141 |     if predicted_denovo_list is not None:
142 |       print("WorkerDB: search_db() - read denovo peptides")
143 |       denovo_peptide_dict = {}
144 |       for predicted in predicted_denovo_list:
145 |         feature_id = predicted["feature_id"]
146 |         sequence = predicted["sequence"]
147 |         denovo_peptide_dict[feature_id] = sequence
148 | 
149 |     print("WorkerDB: search_db() - open tensorflow session")
150 |     session = tf.Session()
151 |     model.restore_model(session)
152 | 
153 |     worker_io.open_input()
154 |     worker_io.get_location()
155 |     worker_io.split_feature_index()
156 |     worker_io.open_output()
157 | 
158 |     print("".join(["="] * 80)) # section-separating line
159 |     print("WorkerDB: search_db() - search loop")
160 | 
161 |     for index, feature_index_batch in enumerate(worker_io.feature_index_batch_list):
162 |       print("Read {0:d}/{1:d} batches".format(index + 1,
163 |                                               worker_io.feature_index_batch_count))
164 |       spectrum_batch = worker_io.get_spectrum(feature_index_batch)
165 |       predicted_batch = self._search_db_batch(spectrum_batch,
166 |                                               model,
167 |                                               session,
168 |                                               denovo_peptide_dict)
169 |       worker_io.write_prediction(predicted_batch)
170 | 
171 |     print("Total spectra: {0:d}".format(worker_io.feature_count["total"]))
172 |     print("  read: {0:d}".format(worker_io.feature_count["read"]))
173 |     print("  skipped: {0:d}".format(worker_io.feature_count["skipped"]))
174 |     print("    by mass: {0:d}".format(worker_io.feature_count["skipped_mass"]))
175 | 
176 |     worker_io.close_input()
177 |     worker_io.close_output()
178 |     session.close()
179 | 
180 | 
181 |   def _compute_peptide_mass(self, peptide):
182 |     """TODO(nh2tran): docstring.
183 |     """
184 | 
185 |     #~ print("".join(["="] * 80)) # section-separating line ===
186 |     #~ print("WorkerDB: _compute_peptide_mass()")
187 | 
188 |     peptide_mass = (deepnovo_config.mass_N_terminus
189 |                     + sum(deepnovo_config.mass_AA[aa] for aa in peptide)
190 |                     + deepnovo_config.mass_C_terminus)
191 | 
192 |     return peptide_mass
193 | 
194 | 
195 |   def _expand_peptide_modification(self, peptide):
196 |     """TODO(nh2tran): docstring.
197 |        May also use parser.isoforms
198 |     """
199 | 
200 |     #~ print("".join(["="] * 80)) # section-separating line
201 |     #~ print("WorkerDB: _expand_peptide_modification()")
202 | 
203 |     # all possible positions for modification
204 |     position_list = [position for position, aa in enumerate(peptide)
205 |                      if aa in self.var_mod_list]
206 |     position_count = len(position_list)
207 |     # max number of modifications allowed
208 |     num_mod = min(position_count, self.num_mod)
209 |     # find all combinations upto num_mod
210 |     position_combination_list = []
211 |     for x in xrange(1, num_mod+1):
212 |       position_combination_list += combinations(position_list, x)
213 |     # find all pepmod
214 |     pepmod_list = [peptide]
215 |     for position_combination in position_combination_list:
216 |       pepmod = peptide[:]
217 |       for position in position_combination:
218 |         pepmod[position] += 'mod'
219 |       pepmod_list.append(pepmod)
220 |     
221 |     return pepmod_list
222 | 
223 | 
224 |   def _filter_by_mass(self, precursor_mass):
225 |     """TODO(nh2tran): docstring.
226 |     """
227 | 
228 |     #~ print("".join(["="] * 80)) # section-separating line
229 |     #~ print("WorkerDB: _filter_by_mass()")
230 | 
231 |     # use precursor_mass_ppm instead of absolute precursor_mass_tolerance
232 |     #~ precursor_mass_tolerance = self.precursor_mass_tolerance
233 |     precursor_mass_tolerance = self.precursor_mass_ppm * precursor_mass
234 | 
235 |     # 1st filter by the peptide mass and the max pepmod mass
236 |     filter1_index = np.flatnonzero(np.logical_and(
237 |         np.less_equal(self.peptide_mass_array,
238 |                       precursor_mass + precursor_mass_tolerance),
239 |         np.greater_equal(self.pepmod_maxmass_array,
240 |                          precursor_mass - precursor_mass_tolerance)))
241 | 
242 |     # find all possible modifications
243 |     pepmod_list = []
244 |     for index in filter1_index:
245 |       peptide = self.peptide_list[index]
246 |       pepmod_list += self._expand_peptide_modification(peptide)
247 |     pepmod_mass_array = np.array([self._compute_peptide_mass(pepmod)
248 |                                   for pepmod in pepmod_list])
249 | 
250 |     # 2nd filter by exact pepmod mass
251 |     filter2_index = np.flatnonzero(np.logical_and(
252 |         np.less_equal(pepmod_mass_array,
253 |                       precursor_mass + precursor_mass_tolerance),
254 |         np.greater_equal(pepmod_mass_array,
255 |                          precursor_mass - precursor_mass_tolerance)))
256 | 
257 |     candidate_list = [pepmod_list[x] for x in filter2_index]
258 | 
259 |     return candidate_list
260 | 
261 | 
262 |   def _score_spectrum(self,
263 |                       precursor_mass,
264 |                       spectrum_original,
265 |                       state0_c,
266 |                       state0_h,
267 |                       candidate_list,
268 |                       model,
269 |                       model_output_logprob,
270 |                       model_lstm_state,
271 |                       session,
272 |                       direction):
273 |     """TODO(nh2tran): docstring."""
274 | 
275 |     #~ print("".join(["="] * 80)) # section-separating line
276 |     #~ print("WorkerDB: _score()")
277 | 
278 |     # convert symbols into id
279 |     candidate_list = [[deepnovo_config.vocab[x] for x in candidate] 
280 |                       for candidate in candidate_list]
281 | 
282 |     # we shall group candidates into minibatches
283 |     # === candidate_len ===
284 |     # s
285 |     # i
286 |     # z
287 |     # e
288 |     # =====================
289 |     minibatch_size = len(candidate_list) # number of candidates
290 |     candidate_len = len(candidate_list[0]) # length of each candidate
291 | 
292 |     # candidates share the same state0, so repeat into [minibatch_size, 512]
293 |     # the states will also be updated after every iteration
294 |     state0_c = state0_c.reshape((1, -1)) # reshape to [1, 512]
295 |     state0_h = state0_h.reshape((1, -1))
296 |     minibatch_state_c = np.repeat(state0_c, minibatch_size, axis=0)
297 |     minibatch_state_h = np.repeat(state0_h, minibatch_size, axis=0)
298 | 
299 |     # mass of each candidate, will be accumulated everytime an AA is appended
300 |     minibatch_prefix_mass = np.zeros(minibatch_size)
301 | 
302 |     # output is a list of candidate_len arrays of shape [minibatch_size, 26]
303 |     # each row is log of probability distribution over 26 classes/symbols
304 |     output_logprob_list = []
305 | 
306 |     # recurrent iterations
307 |     for position in range(candidate_len):
308 | 
309 |       # gather minibatch data
310 |       minibatch_AA_id = np.zeros(minibatch_size)
311 |       for index, candidate in enumerate(candidate_list):
312 |         AA = candidate[position]
313 |         minibatch_AA_id[index] = AA
314 |         minibatch_prefix_mass[index] += deepnovo_config.mass_ID[AA]
315 | 
316 |       # this is the most time-consuming ~70-75%
317 |       minibatch_intensity = [get_candidate_intensity(spectrum_original,
318 |                                                      precursor_mass,
319 |                                                      prefix_mass,
320 |                                                      direction)
321 |                              for prefix_mass in np.nditer(minibatch_prefix_mass)]
322 | 
323 |       # final shape [minibatch_size, 26, 8, 10]
324 |       minibatch_intensity = np.array(minibatch_intensity)
325 | 
326 |       # model feed
327 |       input_feed = {}
328 |       input_feed[model.input_dict["AAid"][1].name] = minibatch_AA_id
329 |       input_feed[model.input_dict["intensity"].name] = minibatch_intensity
330 |       input_feed[model.input_dict["lstm_state"][0].name] = minibatch_state_c
331 |       input_feed[model.input_dict["lstm_state"][1].name] = minibatch_state_h
332 |       # and run
333 |       output_feed = [model_output_logprob, model_lstm_state]
334 |       output_logprob, (minibatch_state_c, minibatch_state_h) = session.run(
335 |           fetches=output_feed,
336 |           feed_dict=input_feed)
337 | 
338 |       output_logprob_list.append(output_logprob)
339 | 
340 |     return output_logprob_list
341 | 
342 | 
343 |   def _search_db_batch(self,
344 |                        spectrum_batch,
345 |                        model,
346 |                        session,
347 |                        denovo_peptide_dict):
348 |     """TODO(nh2tran): docstring.
349 |        Inputs:
350 |          spectrum_batch: a list of spectrum, each is a dictionary
351 |            spectrum["feature_id"]
352 |            spectrum["precursor_mass"]
353 |            spectrum["spectrum_holder"]
354 |            spectrum["spectrum_original_forward"]
355 |            spectrum["spectrum_original_backward"]
356 |        Outputs:
357 |          predicted_batch: a list of predicted, each is a dictionary
358 |            predicted["feature_id"]
359 |            predicted["sequence"]
360 |            predicted["score"]
361 |            predicted["position_score"]
362 |     """
363 | 
364 |     #~ print("".join(["="] * 80)) # section-separating line
365 |     #~ print("WorkerDB: _search_db_batch()")
366 | 
367 |     # initialize the lstm using the spectrum
368 |     # for faster speed, we initialize the whole spectrum_batch instead of 1-by-1
369 |     input_feed = {}
370 |     spectrum_holder = np.array([spectrum["spectrum_holder"]
371 |                                 for spectrum in spectrum_batch])
372 |     input_feed[model.input_dict["spectrum"].name] = spectrum_holder
373 |     output_feed = [model.output_forward["lstm_state0"],
374 |                    model.output_backward["lstm_state0"]]
375 |     ((state0_c_forward, state0_h_forward),
376 |      (state0_c_backward, state0_h_backward)) = session.run(fetches=output_feed,
377 |                                                            feed_dict=input_feed)
378 | 
379 |     predicted_batch = []
380 |     # we search spectrum by spectrum
381 |     # a faster way is to process them in parallel, but hard to debug
382 |     #~ test_id = "F12:7420"
383 |     for spectrum_index, spectrum in enumerate(spectrum_batch):
384 |       #~ if spectrum["feature_id"] != test_id:
385 |         #~ continue
386 | 
387 |       predicted = {"feature_id": spectrum["feature_id"],
388 |                    "sequence": [],
389 |                    "score": -float("inf"),
390 |                    "position_score": [],
391 |                    "precursor_mz": spectrum["precursor_mz"],
392 |                    "precursor_charge": spectrum["precursor_charge"],
393 |                    "protein_access_id": "",
394 |                    "scan_list_middle": spectrum["scan_list_middle"]}
395 | 
396 |       # filter by precursor mass
397 |       # example: [['M', 'D', 'K', 'F', 'Nmod', 'K', 'K']]
398 |       precursor_mass = spectrum["precursor_mass"]
399 |       candidate_list = self._filter_by_mass(precursor_mass)
400 | 
401 |       # add denovo peptide if provided
402 |       feature_id = spectrum["feature_id"]
403 |       if denovo_peptide_dict is not None and feature_id in denovo_peptide_dict:
404 |         sequence = denovo_peptide_dict[feature_id]
405 |         # TODO(nh2tran): change the precursor_mass_tolerance of denovo
406 |         sequence_mass = self._compute_peptide_mass(sequence)
407 |         precursor_mass_tolerance = precursor_mass * self.precursor_mass_ppm
408 |         if abs(precursor_mass - sequence_mass) <= precursor_mass_tolerance:
409 |           candidate_list.append(sequence)
410 | 
411 |       # if no candidate found, return empty sequence for this spectrum.
412 |       if not candidate_list:
413 |         predicted_batch.append(predicted)
414 |         continue
415 | 
416 |       # if decoy is activated, randomly shuffle amino acids to form decoy db.
417 |       if self.decoy:
418 |         for x in candidate_list:
419 |           shuffle(x) # this function works in place and returns None.
420 | 
421 |       # add special GO/EOS and reverse
422 |       # example: [['_GO', 'M', 'D', 'K', 'F', 'Nmod', 'K', 'K', '_EOS']]
423 |       candidate_forward_list = [[deepnovo_config._GO] + x + [deepnovo_config._EOS]
424 |                                 for x in candidate_list]
425 |       candidate_backward_list = [x[::-1] for x in candidate_forward_list]
426 | 
427 |       # add PAD to all candidates to the same max length
428 |       # [['_GO', 'M', 'D', 'K', 'F', 'Nmod', 'K', 'K', '_EOS', '_PAD', '_PAD']]
429 |       # due to the same precursor mass, candidates have very similar lengths
430 |       candidate_len_list = [len(x) for x in candidate_list]
431 |       candidate_maxlen = max(candidate_len_list)
432 |       for index, length in enumerate(candidate_len_list):
433 |         if length < candidate_maxlen:
434 |           pad_size = candidate_maxlen - length
435 |           candidate_forward_list[index] += [deepnovo_config._PAD] * pad_size
436 |           candidate_backward_list[index] += [deepnovo_config._PAD] * pad_size
437 |       
438 |       # score the spectrum against its candidates
439 |       #   using the forward model
440 |       logprob_forward_list = self._score_spectrum(
441 |           spectrum["precursor_mass"],
442 |           spectrum["spectrum_original_forward"],
443 |           state0_c_forward[spectrum_index],
444 |           state0_h_forward[spectrum_index],
445 |           candidate_forward_list,
446 |           model,
447 |           model.output_forward["logprob"],
448 |           model.output_forward["lstm_state"],
449 |           session,
450 |           direction=0)
451 |       #   and using the backward model
452 |       logprob_backward_list = self._score_spectrum(
453 |           spectrum["precursor_mass"],
454 |           spectrum["spectrum_original_backward"],
455 |           state0_c_backward[spectrum_index],
456 |           state0_h_backward[spectrum_index],
457 |           candidate_backward_list,
458 |           model,
459 |           model.output_backward["logprob"],
460 |           model.output_backward["lstm_state"],
461 |           session,
462 |           direction=1)
463 | 
464 |       # note that the candidates are grouped into minibatches
465 |       # === candidate_len ===
466 |       # s
467 |       # i
468 |       # z
469 |       # e
470 |       # =====================
471 |       # logprob_forward_list is a list of candidate_maxlen arrays of shape
472 |       #   [minibatch_size, 26]
473 |       # each row is log of probability distribution over 26 classes/symbols
474 | 
475 |       # find the best scoring candidate
476 |       #~ test_handle = open("test_file", 'w')
477 |       for index, candidate in enumerate(candidate_list):
478 | 
479 |         # only calculate score on the actual length, not on GO/EOS/PAD
480 |         candidate_len = candidate_len_list[index]
481 | 
482 |         # align forward and backward logprob
483 |         logprob_forward = [logprob_forward_list[position][index]
484 |                            for position in range(candidate_len)]
485 |         logprob_backward = [logprob_backward_list[position][index]
486 |                             for position in range(candidate_len)]
487 |         logprob_backward = logprob_backward[::-1]
488 | 
489 |         # score is the sum of logprob(AA) of the candidate in both directions
490 |         #   averaged by the candidate length
491 |         position_score = []
492 |         for position in range(candidate_len):
493 |           AA = candidate[position]
494 |           AA_id = deepnovo_config.vocab[AA]
495 |           position_score.append(logprob_forward[position][AA_id]
496 |                                 + logprob_backward[position][AA_id])
497 |         score = sum(position_score) / candidate_len
498 |         if score > predicted["score"]:
499 |           predicted["sequence"] = candidate
500 |           predicted["score"] = score
501 |           predicted["position_score"] = position_score
502 |           protein_access_id = self.peptide_2_protein_id.get(
503 |               ''.join(candidate).replace('mod', ''),
504 |               'DENOVO')
505 |           if isinstance(protein_access_id, set):
506 |             protein_access_id = ','.join(list(protein_access_id))
507 |           predicted["protein_access_id"] = protein_access_id
508 | 
509 |         #~ if spectrum["feature_id"] == test_id:
510 |           #~ print_candidate = ",".join(candidate)
511 |           #~ print_score = "{0:.2f}".format(score)
512 |           #~ print_row = "\t".join([print_candidate, print_score])
513 |           #~ print(print_row, file=test_handle, end="\n")
514 |       #~ test_handle.close()
515 |       #~ print(abc)
516 |       predicted_batch.append(predicted)
517 | 
518 |     return predicted_batch
519 | 
520 | 
521 | 


--------------------------------------------------------------------------------
/aa_workflow_step_5.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | 
  3 | 
  4 | import sys
  5 | import csv
  6 | import re
  7 | from Bio import SeqIO
  8 | from Bio.SeqIO import FastaIO
  9 | import Levenshtein
 10 | import multiprocessing
 11 | num_processes = 8
 12 | import time
 13 | 
 14 | 
 15 | WEAK_BINDING = 2.0 # NetMHC weak binding rank
 16 | STRONG_BINDING = 0.5 # NetMHC strong binding rank
 17 | 
 18 | AA_3_to_1 = {
 19 |   'Ala':'A',
 20 |   'Arg':'R',
 21 |   'Asn':'N',
 22 |   'Asp':'D',
 23 |   'Cys':'C',
 24 |   'Glu':'E',
 25 |   'Gln':'Q',
 26 |   'Gly':'G',
 27 |   'His':'H',
 28 |   'Ile':'I',
 29 |   'Leu':'L',
 30 |   'Lys':'K',
 31 |   'Met':'M',
 32 |   'Phe':'F',
 33 |   'Pro':'P',
 34 |   'Ser':'S',
 35 |   'Thr':'T',
 36 |   'Trp':'W',
 37 |   'Tyr':'Y',
 38 |   'Val':'V'}
 39 | 
 40 | CODON_AA = { # dictionary {codon: aa}
 41 |   'TTT':'F',
 42 |   'TTC':'F',
 43 |   'TTA':'L',
 44 |   'TTG':'L',
 45 |   'TCT':'S',
 46 |   'TCC':'S',
 47 |   'TCA':'S',
 48 |   'TCG':'S',
 49 |   'TAT':'Y',
 50 |   'TAC':'Y',
 51 |   'TAA':'X',
 52 |   'TAG':'X',
 53 |   'TGT':'C',
 54 |   'TGC':'C',
 55 |   'TGA':'X',
 56 |   'TGG':'W',
 57 |   'CTT':'L',
 58 |   'CTC':'L',
 59 |   'CTA':'L',
 60 |   'CTG':'L',
 61 |   'CCT':'P',
 62 |   'CCC':'P',
 63 |   'CCA':'P',
 64 |   'CCG':'P',
 65 |   'CAT':'H',
 66 |   'CAC':'H',
 67 |   'CAA':'Q',
 68 |   'CAG':'Q',
 69 |   'CGT':'R',
 70 |   'CGC':'R',
 71 |   'CGA':'R',
 72 |   'CGG':'R',
 73 |   'ATT':'I',
 74 |   'ATC':'I',
 75 |   'ATA':'I',
 76 |   'ATG':'M',
 77 |   'ACT':'T',
 78 |   'ACC':'T',
 79 |   'ACA':'T',
 80 |   'ACG':'T',
 81 |   'AAT':'N',
 82 |   'AAC':'N',
 83 |   'AAA':'K',
 84 |   'AAG':'K',
 85 |   'AGT':'S',
 86 |   'AGC':'S',
 87 |   'AGA':'R',
 88 |   'AGG':'R',
 89 |   'GTT':'V',
 90 |   'GTC':'V',
 91 |   'GTA':'V',
 92 |   'GTG':'V',
 93 |   'GCT':'A',
 94 |   'GCC':'A',
 95 |   'GCA':'A',
 96 |   'GCG':'A',
 97 |   'GAT':'D',
 98 |   'GAC':'D',
 99 |   'GAA':'E',
100 |   'GAG':'E',
101 |   'GGT':'G',
102 |   'GGC':'G',
103 |   'GGA':'G',
104 |   'GGG':'G'}
105 | 
# Reverse lookup of CODON_AA: each symbol mapped to all codons that encode it.
# .items() (not the Python-2-only .iteritems()) keeps this module importable
# on both Python 2 and Python 3.
AA_CODON = {} # dictionary {aa: list of codons}
for codon, aa in CODON_AA.items():
  AA_CODON.setdefault(aa, []).append(codon)

# For every ordered pair of symbols, the smallest number of differing
# nucleotides between any codon of the first and any codon of the second.
AA_PAIRWISE_DISTANCE = {} # dictionary {(aa1, aa2): min_distance}
for aa1 in AA_CODON:
  for aa2 in AA_CODON:
    if (aa1, aa2) in AA_PAIRWISE_DISTANCE:
      continue # already stored as the mirror of (aa2, aa1)
    min_distance = 3
    for codon1 in AA_CODON[aa1]:
      for codon2 in AA_CODON[aa2]:
        distance = Levenshtein.hamming(codon1, codon2)
        assert distance <= 3, "Error: codon distance > 3"
        min_distance = min(min_distance, distance)
    AA_PAIRWISE_DISTANCE[(aa1, aa2)] = min_distance
    AA_PAIRWISE_DISTANCE[(aa2, aa1)] = min_distance

# a mutation pair (aa1, aa2) is missense if their codons are different by 1 nucleotide
AA_PAIR_MISSENSE = [(aa1, aa2) for (aa1, aa2), min_distance in AA_PAIRWISE_DISTANCE.items()
                    if min_distance == 1]
# for now, remove N-D, Q-E because not sure mutations or modifications
for ambiguous_pair in (('N', 'D'), ('D', 'N'), ('Q', 'E'), ('E', 'Q')):
  AA_PAIR_MISSENSE.remove(ambiguous_pair)
134 | 
135 | 
def drop_mod_peaks(peptide):
  """Return `peptide` with its modification annotations removed.

  The substrings "M(+15.99)", "N(+.98)" and "Q(+.98)" are replaced by the bare
  letters M, N and Q; everything else is left untouched.
  """
  for annotated, plain in (("M(+15.99)", "M"),
                           ("N(+.98)", "N"),
                           ("Q(+.98)", "Q")):
    peptide = peptide.replace(annotated, plain)
  return peptide
141 | 
142 | 
def read_denovo_psm(psm_file):
  """Aggregate the de novo PSMs of a csv file per peptide.

  Only rows whose 'Accession' column equals 'DENOVO' are used. Modification
  annotations are dropped from the 'Peptide' column before grouping.

  Args:
    psm_file: path of a comma-separated file with the columns 'Accession',
      'Peptide', '-10lgP' and 'Area' (an empty 'Area' counts as 0).

  Returns:
    A dictionary {peptide: {'num_psm': , 'total_score': , 'total_abundance': }}.
  """

  print("read_denovo_psm()")
  print("psm_file:", psm_file)

  denovo_peptide_psm = {}
  with open(psm_file, 'r') as input_handle:
    for row in csv.DictReader(input_handle, delimiter=','):
      if drop_mod_peaks(row['Accession']) != 'DENOVO':
        continue
      peptide = drop_mod_peaks(row['Peptide'])
      score = float(row['-10lgP'])
      area = row['Area']
      abundance = float(area) if area else 0
      entry = denovo_peptide_psm.get(peptide)
      if entry is None:
        # first PSM seen for this peptide
        denovo_peptide_psm[peptide] = {'num_psm': 1,
                                       'total_score': score,
                                       'total_abundance': abundance}
      else:
        entry['num_psm'] += 1
        entry['total_score'] += score
        entry['total_abundance'] += abundance

  print("Number of denovo peptides:", len(denovo_peptide_psm))
  num_psm_list = [entry['num_psm'] for entry in denovo_peptide_psm.values()]
  for min_psm in (1, 2, 3):
    num_peptide = len([x for x in num_psm_list if x >= min_psm])
    print("Number of denovo peptides with >= {0:d} psm: ".format(min_psm), num_peptide)
  print()

  return denovo_peptide_psm
176 | 
177 | 
def read_netmhc(netmhc_file):
  """Read NetMHC predictions and keep the best value per peptide.

  For each peptide the minimum over the 'nM1'..'nM6' columns and the minimum
  over the 'Rank1'..'Rank6' columns present in the file header are stored.
  Only the first row of a peptide is used; later duplicates trigger a warning.

  Args:
    netmhc_file: path of a comma-separated file with a 'Peptide' column.

  Returns:
    A dictionary {peptide: {'best_nM': , 'best_rank': ,
    'is_weak_binding': , 'is_strong_binding': }}, the two flags being 0 or 1.
  """

  print("read_netmhc()")
  print("netmhc_file:", netmhc_file)

  nM_column_list = ['nM1', 'nM2', 'nM3', 'nM4', 'nM5', 'nM6']
  rank_column_list = ['Rank1', 'Rank2', 'Rank3', 'Rank4', 'Rank5', 'Rank6']

  peptide_netmhc = {}
  with open(netmhc_file, 'r') as input_handle:
    csv_reader = csv.DictReader(input_handle, delimiter=',')
    for row in csv_reader:
      peptide = row['Peptide']
      if peptide in peptide_netmhc:
        print("Warning: duplicate peptide found in peptide_netmhc:", peptide)
        continue
      header = csv_reader.fieldnames
      nM_values = [float(row[column]) for column in nM_column_list if column in header]
      best_nM = min(nM_values)
      rank_values = [float(row[column]) for column in rank_column_list if column in header]
      best_rank = min(rank_values)
      peptide_netmhc[peptide] = {
        'best_nM': best_nM,
        'best_rank': best_rank,
        'is_weak_binding': int(best_rank <= WEAK_BINDING),
        'is_strong_binding': int(best_rank <= STRONG_BINDING)}

  prediction_list = peptide_netmhc.values()
  print("Number of peptides:", len(peptide_netmhc))
  print("Number of peptides with weak binding: ", sum([x['is_weak_binding'] for x in prediction_list]))
  print("Number of peptides with strong binding: ", sum([x['is_strong_binding'] for x in prediction_list]))
  print()

  return peptide_netmhc
209 | 
210 | 
def read_immunogenicity(immunogenicity_file):
  """Read the immunogenicity score of each peptide from a csv file.

  Only the first row of a peptide is used; later duplicates trigger a warning.

  Args:
    immunogenicity_file: path of a comma-separated file with the columns
      'peptide' and 'score'.

  Returns:
    A dictionary {peptide: {'immunogenicity': score}} with float scores.
  """

  print("read_immunogenicity()")
  print("immunogenicity_file:", immunogenicity_file)

  peptide_immunogenicity = {}
  with open(immunogenicity_file, 'r') as input_handle:
    for row in csv.DictReader(input_handle, delimiter=','):
      peptide = row['peptide']
      if peptide in peptide_immunogenicity:
        print("Warning: duplicate peptide found in peptide_immunogenicity:", peptide)
        continue
      peptide_immunogenicity[peptide] = {'immunogenicity': float(row['score'])}

  print("Number of peptides:", len(peptide_immunogenicity))
  print()

  return peptide_immunogenicity
233 | 
234 | 
def read_fasta(fasta_file,
               get_uniprot_id=False,
               get_enst_id=False,
               get_gene_name=False):
  """Parse a fasta file into a list of protein dictionaries.

  Args:
    fasta_file: path of the fasta file.
    get_uniprot_id: if True, 'uniprot_id' is the 2nd '|'-separated field of
      the record name.
    get_enst_id: if True, 'enst_id' is the record name itself.
    get_gene_name: if True, 'gene_name' is taken from the description token
      containing 'GN=', provided there is exactly one such token.

  Returns:
    A list of dictionaries with the keys 'name', 'uniprot_id', 'enst_id',
    'gene_name' and 'seq'; ids that were not requested are empty strings.
  """

  print("read_fasta()")
  print("fasta_file:", fasta_file)
  print("get_uniprot_id:", get_uniprot_id)
  print("get_enst_id:", get_enst_id)
  print("get_gene_name:", get_gene_name)

  # all records are loaded before the file is closed
  with open(fasta_file, 'r') as file_handle:
    record_list = list(SeqIO.parse(file_handle, "fasta"))

  protein_list = []
  for record in record_list:
    name = str(record.name)
    uniprot_id = name.split('|')[1] if get_uniprot_id else ''
    enst_id = name if get_enst_id else ''
    gene_name = ''
    if get_gene_name:
      description_list = str(record.description).strip().split(' ')
      gene_token_list = [token for token in description_list if 'GN=' in token]
      if len(gene_token_list) == 1:
        gene_name = gene_token_list[0].split('=')[1]
    protein_list.append({'name': name,
                         'uniprot_id': uniprot_id,
                         'enst_id': enst_id,
                         'gene_name': gene_name,
                         'seq': str(record.seq)})

  print("Number of protein sequences in the fasta file: ", len(protein_list))
  print()

  return protein_list
274 | 
275 | 
def read_db_peptide(labeled_feature_file):
  """Collect the distinct peptides of the 'seq' column of a feature csv file.

  Modification annotations are dropped from each peptide before it is stored.

  Args:
    labeled_feature_file: path of a comma-separated file with a 'seq' column.

  Returns:
    The set of peptide strings.
  """

  print("read_db_peptide()")
  print("labeled_feature_file:", labeled_feature_file)

  with open(labeled_feature_file, 'r') as input_handle:
    db_peptide_set = {drop_mod_peaks(row['seq'])
                      for row in csv.DictReader(input_handle, delimiter=',')}
  print("Number of db peptides identified at step 1: ", len(db_peptide_set))
  print()

  return db_peptide_set
291 | 
292 | 
def _find_overlapping(text, pattern):
  """Return the start index of every occurrence of pattern in text, overlaps included."""
  index_list = []
  index = text.find(pattern)
  while index != -1:
    index_list.append(index)
    index = text.find(pattern, index + 1)
  return index_list


def hamming1_align(peptide_and_protein_list):
  """Find all positions where a peptide aligns to a protein with 1 mismatch.

  The function takes a single tuple so that it can be used with
  multiprocessing.Pool.map().

  Args:
    peptide_and_protein_list: a tuple (peptide, protein_list) where peptide is
      a string and protein_list is a list of dictionaries with a 'seq' key.

  Returns:
    A tuple (peptide, match_list); match_list holds one dictionary
    {'protein': protein, 'match_index': index} for each 0-based position at
    which the peptide and the protein differ in exactly one residue.
  """

  peptide, protein_list = peptide_and_protein_list

  # I and L are considered the same in this alignment
  query = peptide.replace('I', 'L')
  query_length = len(query)
  # floor division so that the slice index stays an integer on Python 3
  half_length = query_length // 2
  query_left = query[:half_length]
  query_right = query[half_length:]

  match_list = []
  for protein in protein_list:
    subject = protein['seq'].replace('I', 'L')
    last_start = len(subject) - query_length

    # First, find candidate locations by pigeonhole principle:
    # if hamming distance is 1, the left or right half must be exact match
    # Then, calculate hamming distance at candidate locations and return those equal to 1
    # The halves are searched as plain text and overlapping occurrences are
    # kept, otherwise repetitive sequences lose valid alignments.
    left_index = _find_overlapping(subject, query_left)
    right_index = [x - half_length for x in _find_overlapping(subject, query_right)]
    for index in left_index + right_index:
      if index < 0 or index > last_start:
        continue # the peptide would not fit inside the protein here
      window = subject[index : (index + query_length)]
      # window and query have the same length, so zip compares every residue
      num_mismatch = sum(1 for a, b in zip(query, window) if a != b)
      if num_mismatch == 1:
        match_list.append({'protein': protein, 'match_index': index})

  return peptide, match_list
321 | 
322 | 
def find_mutation(peptide_list, protein_list):
  """Align peptides to proteins with 1 mismatch and annotate each mismatch.

  The alignment is done by hamming1_align() in a pool of num_processes worker
  processes. Each hit is then annotated in place with the keys 'wildtype',
  'mutation_pos' (1-based), 'mutation_wt', 'mutation_aa', 'is_missense' and
  'is_missense_notflanking' (1 only if the mutation is missense and is not
  at the first or last position of the peptide).

  Args:
    peptide_list: the peptide strings to search.
    protein_list: a list of protein dictionaries with 'name' and 'seq' keys.

  Returns:
    A tuple (peptide_mutation, protein_mutation):
      peptide_mutation: {peptide: {'num_hits': , 'num_missense': ,
        'num_missense_notflanking': , 'match_list': }}.
      protein_mutation: {protein name: list of {'peptide': , 'match_index': }}
        restricted to the missense, not flanking hits.
  """

  print("find_mutation()")

  print("Align peptides against protein sequences with 1 mismatch ...")
  print("Number of peptides: ", len(peptide_list))
  print("Number of protein sequences:", len(protein_list))
  print("I and L are considered the same in this alignment")
  start_time = time.time()
  search_list = [(peptide, protein_list) for peptide in peptide_list]
  pool = multiprocessing.Pool(processes=num_processes)
  try:
    result_list = pool.map(hamming1_align, search_list)
  finally:
    # always release the worker processes, also when the search fails
    pool.terminate()
    pool.join()
  print(time.time() - start_time, "seconds")
  print()

  peptide_mutation = {}
  protein_mutation = {}
  for peptide, match_list in result_list:
    peptide_length = len(peptide)
    peptide_ItoL = peptide.replace('I', 'L')
    for match in match_list:
      protein = match['protein']
      match_index = match['match_index']

      wildtype = protein['seq'][match_index : (match_index + peptide_length)]
      wildtype_ItoL = wildtype.replace('I', 'L')
      mutation_index = [x for x in range(len(peptide_ItoL)) if peptide_ItoL[x] != wildtype_ItoL[x]]
      assert len(mutation_index) == 1, "Error: not 1 mutation found"
      mutation_index = mutation_index[0]
      mutation_wildtype = wildtype[mutation_index]
      mutation_aa = peptide[mutation_index]
      match['wildtype'] = wildtype
      match['mutation_pos'] = mutation_index + 1
      match['mutation_wt'] = mutation_wildtype
      match['mutation_aa'] = mutation_aa
      match['is_missense'] = int((mutation_aa, mutation_wildtype) in AA_PAIR_MISSENSE)
      notflanking = int(match['mutation_pos'] != 1 and match['mutation_pos'] != len(peptide))
      match['is_missense_notflanking'] = match['is_missense'] * notflanking

      if match['is_missense_notflanking']:
        protein_mutation_entry = {'peptide': peptide, 'match_index': match['match_index']}
        protein_mutation.setdefault(protein['name'], []).append(protein_mutation_entry)

    num_hits = len(match_list)
    num_missense = len([x for x in match_list if x['is_missense'] == 1])
    num_missense_notflanking = len([x for x in match_list if x['is_missense_notflanking'] == 1])
    peptide_mutation[peptide] = {'num_hits': num_hits,
                                 'num_missense': num_missense,
                                 'num_missense_notflanking': num_missense_notflanking,
                                 'match_list': match_list}

  print("Number of denovo peptides with >= 1 hits:",
        len([x for x in peptide_mutation.values() if x['num_hits'] >= 1]))
  print("Number of denovo peptides with >= 1 missense hits:",
        len([x for x in peptide_mutation.values() if x['num_missense'] >= 1]))
  print("Number of denovo peptides with >= 1 missense, not flanking hits:",
        len([x for x in peptide_mutation.values() if x['num_missense_notflanking'] >= 1]))
  print()

  return peptide_mutation, protein_mutation
387 |         
388 | 
def read_missense_snp(snp_file, snp_enst_fasta, snp_sample_id):
  """Read the missense SNPs of a sample and confirm them against a fasta file.

  A SNP is confirmed when its transcript is present in snp_enst_fasta and
  the reference amino acid is found at the stated (1-based) location of that
  transcript's protein sequence. A leading 'X' is removed from a sequence
  before the check.

  Args:
    snp_file: path of a comma-separated file with the columns 'Effect',
      'Sample ID', 'ENSEMBL Transcript ID' and 'Aa change' (e.g. Pro575Leu).
    snp_enst_fasta: path of the fasta file of the transcripts' proteins,
      whose record names are the Ensembl Transcript IDs.
    snp_sample_id: only rows with this 'Sample ID' are used.

  Returns:
    A tuple (snp_confirmed_list, protein_confirmed_list):
      snp_confirmed_list: list of {'enst_id': , 'aa_loc': , 'aa_ref': ,
        'aa_alt': } with one-letter amino acids.
      protein_confirmed_list: the protein dictionaries (as returned by
        read_fasta) that carry at least one confirmed SNP.
  """

  print("read_missense_snp()")
  print("snp_file:", snp_file)
  print("snp_enst_fasta:", snp_enst_fasta)
  print("snp_sample_id:", snp_sample_id)

  # read missense SNP
  snp_list = []
  with open(snp_file, 'r') as input_handle:
    csv_reader = csv.DictReader(input_handle, delimiter=',')
    for row in csv_reader:
      mutation_type = row['Effect']
      if mutation_type == 'missense_variant' and snp_sample_id == row['Sample ID']:
        enst_id = row['ENSEMBL Transcript ID']
        mutation_change = row['Aa change']
        snp_list.append({'enst_id': enst_id, 'mutation_change': mutation_change})
  print("Number of missense SNPs:", len(snp_list))
  print()

  # read SNP Ensembl Transcript fasta
  protein_list = read_fasta(snp_enst_fasta, get_enst_id=True)
  # clean letter 'X' from the 1st position of some enst protein sequences
  # (startswith is also safe for an empty sequence)
  for protein in protein_list:
    if protein['seq'].startswith('X'):
      protein['seq'] = protein['seq'][1:]
  # convert protein_list to a dictionary with key as Ensembl Transcript ID
  protein_dict = {}
  for protein in protein_list:
    enst_id = protein['enst_id']
    assert enst_id not in protein_dict, "Error: duplicate enst_id"
    protein_dict[enst_id] = protein

  # cross-check snp_list and snp_enst_fasta for enst_id, location, and identity of mutated amino acid
  # because some transcripts were removed or updated, so their SNPs are no longer correct
  snp_confirmed_list = []
  enst_id_confirmed_set = set()
  for snp in snp_list:
    # example: Pro575Leu; note that the location is 1-based, not 0-based
    mutation_change = snp['mutation_change']
    aa_ref = AA_3_to_1[mutation_change[:3]]
    aa_loc = int(mutation_change[3:-3])
    aa_alt = AA_3_to_1[mutation_change[-3:]]
    enst_id = snp['enst_id']
    if enst_id not in protein_dict:
      continue
    seq = protein_dict[enst_id]['seq']
    # the lower bound stops a location <= 0 from indexing from the end of seq
    if 1 <= aa_loc <= len(seq) and aa_ref == seq[aa_loc-1]:
      snp_confirmed_list.append({'enst_id': enst_id,
                                 'aa_loc': aa_loc,
                                 'aa_ref': aa_ref,
                                 'aa_alt': aa_alt})
      enst_id_confirmed_set.add(enst_id)
  protein_confirmed_list = [protein_dict[enst_id] for enst_id in enst_id_confirmed_set]

  print("len(snp_list):", len(snp_list))
  print("len(snp_confirmed_list):", len(snp_confirmed_list))
  print("len(protein_dict):", len(protein_dict))
  print("len(protein_confirmed_list):", len(protein_confirmed_list))
  print()

  return snp_confirmed_list, protein_confirmed_list
452 | 
453 | 
def match_peptide_snp(peptide_list, snp_file, snp_enst_fasta, snp_sample_id):
  """Find the peptides whose 1-mismatch mutation coincides with a known SNP.

  The peptides are aligned (1 mismatch) to the proteins that carry confirmed
  missense SNPs. A hit matches a SNP when transcript, 1-based location,
  reference amino acid and alternative amino acid (I and L considered the
  same) all agree.

  Args:
    peptide_list: the peptide strings to search.
    snp_file, snp_enst_fasta, snp_sample_id: passed to read_missense_snp().

  Returns:
    A dictionary {peptide: {'snp_list': list}}; each list item is a copy of
    the matching SNP dictionary extended with the 'wildtype' sequence of
    the hit. The list is empty for peptides without a matching SNP.
  """

  print('match_peptide_snp()')

  snp_list, protein_list = read_missense_snp(snp_file, snp_enst_fasta, snp_sample_id)
  peptide_mutation, _ = find_mutation(peptide_list, protein_list)
  peptide_snp = {}
  # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only)
  for peptide, mutation in peptide_mutation.items():
    peptide_snp[peptide] = {'snp_list': []}
    if mutation['num_hits'] > 0:
      for match in mutation['match_list']:
        enst_id = match['protein']['enst_id']
        match_index = match['match_index']
        for snp in snp_list:
          if (enst_id == snp['enst_id']
              and match_index + match['mutation_pos'] == snp['aa_loc']
              and match['mutation_wt'] == snp['aa_ref']
              and match['mutation_aa'].replace('I', 'L') == snp['aa_alt'].replace('I', 'L')):
            # copy the SNP: one SNP can match several hits with different
            # wildtypes, and updating the shared dictionary would overwrite
            # the wildtype already reported for the earlier hits
            match_snp = dict(snp)
            match_snp['wildtype'] = match['wildtype']
            peptide_snp[peptide]['snp_list'].append(match_snp)

  num_peptide_snp = len([x for x in peptide_snp.values() if x['snp_list']])
  print('Number of peptide mutations match to SNPs:', num_peptide_snp)
  for peptide in peptide_snp:
    if peptide_snp[peptide]['snp_list']:
      print(peptide, peptide_snp[peptide]['snp_list'])
  print()

  return peptide_snp
484 | 
485 | 
def step_5(psm_file, netmhc_file, immunogenicity_file, db_fasta_file, labeled_feature_file,
           snp_file, snp_enst_fasta, snp_sample_id,
           output_neoantigen_criteria, output_protein_mutation):
  """Collect the neoantigen selection criteria of the de novo peptides.

  Args:
    psm_file: csv file of PSMs, read by read_denovo_psm().
    netmhc_file: csv file of NetMHC predictions; skipped if empty/None.
    immunogenicity_file: csv file of immunogenicity scores; skipped if
      empty/None.
    db_fasta_file: reference fasta file the peptides are aligned against.
    labeled_feature_file: csv file of the identified db peptides, used to
      check whether the wildtype of a mutation was identified.
    snp_file, snp_enst_fasta, snp_sample_id: inputs of match_peptide_snp();
      the SNP matching is skipped if snp_file is empty/None.
    output_neoantigen_criteria: path of the csv file written with one row per
      de novo peptide.
    output_protein_mutation: path of the csv file written with one row per
      protein that has missense, not flanking mutations.

  Returns:
    None; the results are written to the two output files and a summary is
    printed.
  """

  print("".join(["="] * 80)) # section-separating line
  print("step_5()")

  denovo_psm = read_denovo_psm(psm_file)
  if netmhc_file:
    denovo_netmhc = read_netmhc(netmhc_file)
  else:
    denovo_netmhc = None
  if immunogenicity_file:
    denovo_immunogenicity = read_immunogenicity(immunogenicity_file)
  else:
    denovo_immunogenicity = None
  # a real list (not a Python 3 key view) because it is reused several times
  denovo_peptide_list = list(denovo_psm.keys())

  print("Find denovo mutations with respect to the reference fasta:")
  protein_list = read_fasta(db_fasta_file)
  denovo_mutation, protein_mutation = find_mutation(denovo_peptide_list, protein_list)

  print("Write protein with missense and not flanking mutations:")
  print("output_protein_mutation:", output_protein_mutation)
  print()
  with open(output_protein_mutation, 'w') as output_handle:
    fieldnames = ['protein_name', 'num_peptide', 'peptide_list']
    csv_writer = csv.DictWriter(output_handle, fieldnames=fieldnames, delimiter=',')
    csv_writer.writeheader()
    # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only)
    for protein_name, peptide_list in protein_mutation.items():
      row = {'protein_name': protein_name,
             'num_peptide': len(peptide_list),
             'peptide_list': peptide_list}
      csv_writer.writerow(row)

  print("Find wildtypes in identified db peptides")
  db_peptide_set = read_db_peptide(labeled_feature_file)
  for peptide in denovo_mutation:
    num_missense_db = 0
    num_missense_notflanking_db = 0
    for match in denovo_mutation[peptide]['match_list']:
      wildtype_in_db = int(match['wildtype'] in db_peptide_set)
      match['is_missense_db'] = match['is_missense'] * wildtype_in_db
      match['is_missense_notflanking_db'] = match['is_missense_notflanking'] * wildtype_in_db
      num_missense_db += match['is_missense_db']
      num_missense_notflanking_db += match['is_missense_notflanking_db']
    denovo_mutation[peptide]['num_missense_db'] = num_missense_db
    denovo_mutation[peptide]['num_missense_notflanking_db'] = num_missense_notflanking_db
  print("Number of denovo peptides with >= 1 missense_db hits:",
        len([x for x in denovo_mutation.values() if x['num_missense_db'] >= 1]))
  print("Number of denovo peptides with >= 1 missense_notflanking_db hits:",
        len([x for x in denovo_mutation.values() if x['num_missense_notflanking_db'] >= 1]))
  print()

  if snp_file:
    print("Find denovo mutations match to SNPs:")
    denovo_snp = match_peptide_snp(denovo_peptide_list, snp_file, snp_enst_fasta, snp_sample_id)
  else:
    denovo_snp = None

  print("Write neoantigen criteria:")
  print("output_neoantigen_criteria:", output_neoantigen_criteria)
  print()
  with open(output_neoantigen_criteria, 'w') as output_handle:
    fieldnames = ['peptide',
                  'num_psm',
                  'total_score',
                  'total_abundance',
                  'best_nM',
                  'best_rank',
                  'is_weak_binding',
                  'is_strong_binding',
                  'immunogenicity',
                  'num_hits',
                  'num_missense',
                  'num_missense_notflanking',
                  'num_missense_db',
                  'num_missense_notflanking_db',
                  'match_list',
                  'snp_list']
    csv_writer = csv.DictWriter(output_handle, fieldnames=fieldnames, delimiter=',')
    csv_writer.writeheader()
    for peptide in denovo_peptide_list:
      row = {'peptide': peptide}
      row.update(denovo_psm[peptide])
      if denovo_netmhc is not None and peptide in denovo_netmhc:
        row.update(denovo_netmhc[peptide])
      if denovo_immunogenicity is not None and peptide in denovo_immunogenicity:
        row.update(denovo_immunogenicity[peptide])
      row.update(denovo_mutation[peptide])
      if denovo_snp is not None:
        row.update(denovo_snp[peptide])
      # write only the protein name, not the whole protein dictionary
      for match in row['match_list']:
        match['protein'] = match['protein']['name']
      csv_writer.writerow(row)

  print("Selection criteria: >= 1 missense, not flanking hits AND >= 2 psm")
  num_selection = len([peptide for peptide in denovo_peptide_list
                       if denovo_mutation[peptide]['num_missense_notflanking'] >= 1
                       and denovo_psm[peptide]['num_psm'] >= 2])
  print("num_selection :", num_selection)
587 | 
588 | 
589 | 
590 | 
591 | 
592 | 
593 | 
594 | 
595 | 
596 | 


--------------------------------------------------------------------------------
/deepnovo_worker_io.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Hieu Tran. All Rights Reserved.
  2 | #
  3 | # DeepNovo is publicly available for non-commercial uses.
  4 | # ==============================================================================
  5 | 
  6 | """TODO(nh2tran): docstring."""
  7 | 
  8 | from __future__ import absolute_import
  9 | from __future__ import division
 10 | from __future__ import print_function
 11 | 
 12 | import re
 13 | import os
 14 | import numpy as np
 15 | import pickle
 16 | 
 17 | import deepnovo_config
 18 | from deepnovo_cython_modules import process_spectrum
 19 | 
 20 | 
 21 | class WorkerIO(object):
 22 |   """TODO(nh2tran): docstring.
 23 |   """
 24 | 
 25 | 
 26 |   def __init__(self, input_spectrum_file, input_feature_file, output_file=None):
 27 |     """TODO(nh2tran): docstring.
 28 |        The input_file could be input_file or input_file_train/valid/test.
 29 |        The output_file is None for train/valid/test cases.
 30 |        During training we use two separate WorkerIO objects for train and valid.
 31 |     """
 32 | 
 33 |     print("".join(["="] * 80)) # section-separating line
 34 |     print("WorkerIO: __init__()")
 35 | 
 36 |     # we currently use deepnovo_config to store both const & settings
 37 |     # the settings should be shown in __init__() to keep track carefully
 38 |     self.MZ_MAX = deepnovo_config.MZ_MAX
 39 |     self.MZ_SIZE = deepnovo_config.MZ_SIZE
 40 |     self.batch_size = deepnovo_config.batch_size
 41 |     self.header_seq = deepnovo_config.FLAGS.header_seq
 42 |     self.neighbor_size = deepnovo_config.neighbor_size
 43 |     print("neighbor_size = {0:d}".format(self.neighbor_size))
 44 |     self.dia_window = deepnovo_config.dia_window
 45 | 
 46 |     self.input_spectrum_file = input_spectrum_file
 47 |     self.input_feature_file = input_feature_file
 48 |     self.output_file = output_file
 49 |     print("input_spectrum_file = {0:s}".format(self.input_spectrum_file))
 50 |     print("input_feature_file = {0:s}".format(self.input_feature_file))
 51 |     print("output_file = {0:s}".format(self.output_file))
 52 |     # keep the file handles open throughout the process to read/write batches
 53 |     self.input_spectrum_handle = None
 54 |     self.input_feature_handle = None
 55 |     self.output_handle = None
 56 | 
 57 |     # split data into batches
 58 |     self.feature_index_list = []
 59 |     self.feature_index_batch_list = []
 60 |     self.feature_index_batch_count = 0
 61 | 
 62 |     ### store file location of each feature for random access
 63 |     self.feature_location_list = []
 64 | 
 65 |     # store the file location of all spectra for random access
 66 |     self.spectrum_location_dict = {}
 67 |     self.spectrum_rtinseconds_dict = {}
 68 | 
 69 |     # record the status of spectra that have been read
 70 |     self.feature_count = {"total": 0,
 71 |                           "read": 0,
 72 |                           "skipped": 0,
 73 |                           "skipped_mass": 0}
 74 |     self.spectrum_count = 0
 75 | 
 76 | 
 77 |   def close_input(self):
 78 |     """TODO(nh2tran): docstring."""
 79 | 
 80 |     print("".join(["="] * 80)) # section-separating line
 81 |     print("WorkerIO: close_input()")
 82 | 
 83 |     self.input_spectrum_handle.close()
 84 |     self.input_feature_handle.close()
 85 | 
 86 | 
 87 |   def close_output(self):
 88 |     """TODO(nh2tran): docstring."""
 89 | 
 90 |     print("".join(["="] * 80)) # section-separating line
 91 |     print("WorkerIO: close_output()")
 92 | 
 93 |     self.output_handle.close()
 94 | 
 95 | 
 96 |   def get_spectrum(self, feature_index_batch):
 97 |     """TODO(nh2tran): docstring."""
 98 | 
 99 |     #~ print("".join(["="] * 80)) # section-separating line
100 |     #~ print("WorkerIO: get_spectrum()")
101 | 
102 |     spectrum_list = []
103 |     for feature_index in feature_index_batch:
104 |       # parse a feature
105 |       feature_location = self.feature_location_list[feature_index]
106 |       feature_id, feature_area, precursor_mz, precursor_charge, rt_mean, raw_sequence, scan_list, ms1_list = self._parse_feature(feature_location)
107 |       # skip if precursor_mass > MZ_MAX
108 |       precursor_mass = precursor_mz * precursor_charge - deepnovo_config.mass_H * precursor_charge
109 |       if precursor_mass > self.MZ_MAX:
110 |         self.feature_count["skipped"] += 1
111 |         self.feature_count["skipped_mass"] += 1
112 |         continue
113 |       self.feature_count["read"] += 1
114 |       # parse and process spectrum
115 |       (spectrum_holder,
116 |        spectrum_original_forward,
117 |        spectrum_original_backward,
118 |        scan_list_middle,
119 |        scan_list_original,
120 |        ms1_profile) = self._parse_spectrum(precursor_mz, precursor_mass, rt_mean, scan_list, ms1_list)
121 |       # update dataset
122 |       spectrum = {"feature_id": feature_id,#str(feature_index),#scan,
123 |                   "feature_area": feature_area,
124 |                   "raw_sequence": raw_sequence,
125 |                   "precursor_mass": precursor_mass,
126 |                   "spectrum_holder": spectrum_holder,
127 |                   "spectrum_original_forward": spectrum_original_forward,
128 |                   "spectrum_original_backward": spectrum_original_backward,
129 |                   "precursor_mz": precursor_mz,
130 |                   "precursor_charge": precursor_charge,
131 |                   "scan_list_middle": scan_list_middle,
132 |                   "scan_list_original": scan_list_original,
133 |                   "ms1_profile": ms1_profile}
134 |       spectrum_list.append(spectrum)
135 | 
136 |     return spectrum_list
137 | 
138 | 
139 |   def get_location(self):
140 |     """TODO(nh2tran): docstring."""
141 | 
142 |     print("".join(["="] * 80)) # section-separating line
143 |     print("WorkerIO: get_location()")
144 | 
145 |     ### store file location of each spectrum for random access {scan:location}
146 |     ### since mgf file can be rather big, cache the locations for each spectrum mgf file.
147 |     spectrum_location_file = self.input_spectrum_file + '.locations.pkl'
148 |     if os.path.exists(spectrum_location_file):
149 |       print("WorkerIO: read cached spectrum locations")
150 |       with open(spectrum_location_file, 'rb') as fr:
151 |         data = pickle.load(fr)
152 |         self.spectrum_location_dict, self.spectrum_rtinseconds_dict, self.spectrum_count = data
153 |     else:
154 |       print("WorkerIO: build spectrum location from scratch")
155 |       spectrum_location_dict = {}
156 |       spectrum_rtinseconds_dict = {}
157 |       line = True
158 |       while line:
159 |         current_location = self.input_spectrum_handle.tell()
160 |         line = self.input_spectrum_handle.readline()
161 |         if "BEGIN IONS" in line:
162 |           spectrum_location = current_location
163 |         elif "SCANS=" in line:
164 |           scan = re.split('=|\r|\n', line)[1]
165 |           spectrum_location_dict[scan] = spectrum_location
166 |         elif "RTINSECONDS=" in line:
167 |           rtinseconds = float(re.split('=|\r|\n', line)[1])
168 |           spectrum_rtinseconds_dict[scan] = rtinseconds
169 |       self.spectrum_location_dict = spectrum_location_dict
170 |       self.spectrum_rtinseconds_dict = spectrum_rtinseconds_dict
171 |       self.spectrum_count = len(spectrum_location_dict)
172 |       with open(spectrum_location_file, 'wb') as fw:
173 |         pickle.dump((self.spectrum_location_dict, self.spectrum_rtinseconds_dict, self.spectrum_count), fw)
174 | 
175 |     ### store file location of each feature for random access
176 |     feature_location_list = []
177 |     # skip header line
178 |     _ = self.input_feature_handle.readline()
179 |     line = True
180 |     while line:
181 |       feature_location = self.input_feature_handle.tell()
182 |       feature_location_list.append(feature_location)
183 |       line = self.input_feature_handle.readline()
184 |     feature_location_list = feature_location_list[:-1]
185 |     self.feature_location_list = feature_location_list
186 |     self.feature_count["total"] = len(feature_location_list)
187 |     self.feature_index_list = range(self.feature_count["total"])
188 | 
189 |     print("spectrum_count = {0:d}".format(self.spectrum_count))
190 |     print("feature_count[total] = {0:d}".format(self.feature_count["total"]))
191 | 
192 | 
193 |   def open_input(self):
194 |     """TODO(nh2tran): docstring."""
195 | 
196 |     print("".join(["="] * 80)) # section-separating line
197 |     print("WorkerIO: open_input()")
198 | 
199 |     self.input_spectrum_handle = open(self.input_spectrum_file, 'r')
200 |     self.input_feature_handle = open(self.input_feature_file, 'r')
201 | 
202 | 
203 |   def open_output(self):
204 |     """TODO(nh2tran): docstring."""
205 | 
206 |     print("".join(["="] * 80)) # section-separating line
207 |     print("WorkerIO: open_output()")
208 | 
209 |     self.output_handle = open(self.output_file, 'w')
210 |     self._print_prediction_header()
211 | 
212 | 
213 |   def split_feature_index(self):
214 |     """TODO(nh2tran): docstring."""
215 | 
216 |     print("".join(["="] * 80)) # section-separating line
217 |     print("WorkerIO: split_index()")
218 | 
219 |     index_batch_list = [self.feature_index_list[i:(i+self.batch_size)]
220 |                             for i in range(0,
221 |                                            self.feature_count["total"],
222 |                                            self.batch_size)]
223 | 
224 |     self.feature_index_batch_list = index_batch_list
225 |     self.feature_index_batch_count = len(self.feature_index_batch_list)
226 | 
227 | 
228 |   def write_prediction(self, predicted_batch):
229 |     """TODO(nh2tran): docstring."""
230 | 
231 |     #~ print("".join(["="] * 80)) # section-separating line
232 |     #~ print("WorkerIO: write_prediction()")
233 | 
234 |     for predicted in predicted_batch:
235 |       feature_id = predicted["feature_id"]
236 |       feature_area = str(predicted["feature_area"])
237 |       precursor_mz = str(predicted["precursor_mz"])
238 |       precursor_charge = str(predicted["precursor_charge"])
239 |       scan_list_middle = ";".join(predicted["scan_list_middle"])
240 |       scan_list_original = ";".join(predicted["scan_list_original"])
241 |       if predicted["sequence"]:
242 |         predicted_sequence = ';'.join([','.join(x) for x in predicted["sequence"]])
243 |         predicted_score = ';'.join(['{0:.2f}'.format(x) for x in predicted["score"]])
244 |         predicted_score_max = '{0:.2f}'.format(np.max(predicted["score"]))
245 |         predicted_position_score = ';'.join([
246 |             ','.join(['{0:.2f}'.format(y) for y in x])
247 |             for x in predicted["position_score"]])
248 |         if "protein_access_id" in predicted:
249 |           # predicted_batch is returned from search_db
250 |           protein_access_id = predicted['protein_access_id']
251 |         else:
252 |           # predicted_batch is returned from search_denovo
253 |           protein_access_id = 'DENOVO'
254 |       else: # if no peptide found, write empty sequence to the output file
255 |         predicted_sequence = ""
256 |         predicted_score = ""
257 |         predicted_score_max = ""
258 |         predicted_position_score = ""
259 |         protein_access_id = ""
260 |       predicted_row = "\t".join([feature_id,
261 |                                  feature_area,
262 |                                  predicted_sequence,
263 |                                  predicted_score,
264 |                                  predicted_position_score,
265 |                                  precursor_mz,
266 |                                  precursor_charge,
267 |                                  protein_access_id,
268 |                                  scan_list_middle,
269 |                                  scan_list_original,
270 |                                  predicted_score_max])
271 |       print(predicted_row, file=self.output_handle, end="\n")
272 | 
273 | 
274 |   def _parse_spectrum(self, precursor_mz, precursor_mass, rt_mean, scan_list, ms1_list):
275 |     """TODO(nh2tran): docstring."""
276 | 
277 |     #~ print("".join(["="] * 80)) # section-separating line
278 |     #~ print("WorkerIO: _parse_spectrum()")
279 | 
280 |     spectrum_holder_list = []
281 |     spectrum_original_forward_list = []
282 |     spectrum_original_backward_list = []
283 | 
284 |     ### select best neighbors from the scan_list by their distance to rt_mean
285 |     # probably move this selection to get_location(), run once rather than repeating
286 |     neighbor_count = len(scan_list)
287 |     best_scan_index = None
288 |     best_distance = float('inf')
289 |     for scan_index, scan in enumerate(scan_list):
290 |       distance = abs(self.spectrum_rtinseconds_dict[scan] - rt_mean)
291 |       if distance < best_distance:
292 |         best_distance = distance
293 |         best_scan_index = scan_index
294 |     neighbor_center = best_scan_index
295 |     neighbor_left_count = neighbor_center
296 |     neighbor_right_count = neighbor_count - neighbor_left_count - 1
297 |     neighbor_size_half = self.neighbor_size // 2
298 |     neighbor_left_count = min(neighbor_left_count, neighbor_size_half)
299 |     neighbor_right_count = min(neighbor_right_count, neighbor_size_half)
300 | 
301 |     ### padding zero arrays to the left if not enough neighbor spectra
302 |     if neighbor_left_count < neighbor_size_half:
303 |       for x in range(neighbor_size_half - neighbor_left_count):
304 |         spectrum_holder_list.append(np.zeros(
305 |             shape=(1, self.MZ_SIZE),
306 |             dtype=np.float32))
307 |         spectrum_original_forward_list.append(np.zeros(
308 |             shape=(1, self.MZ_SIZE),
309 |             dtype=np.float32))
310 |         spectrum_original_backward_list.append(np.zeros(
311 |             shape=(1, self.MZ_SIZE),
312 |             dtype=np.float32))
313 | 
314 |     ### parse and add neighbor spectra
315 |     scan_list_middle = []
316 |     ms1_intensity_list_middle = []
317 |     for index in range(neighbor_center - neighbor_left_count, neighbor_center + neighbor_right_count + 1):
318 |       scan = scan_list[index]
319 |       scan_list_middle.append(scan)
320 |       ms1_entry = ms1_list[index]
321 |       ms1_intensity = float(re.split(':', ms1_entry)[1])
322 |       ms1_intensity_list_middle.append(ms1_intensity)
323 |     ms1_intensity_max = max(ms1_intensity_list_middle)
324 |     assert ms1_intensity_max > 0.0, "Error: Zero ms1_intensity_max"
325 |     ms1_intensity_list_middle = [x/ms1_intensity_max for x in ms1_intensity_list_middle]
326 |     for scan, ms1_intensity in zip(scan_list_middle, ms1_intensity_list_middle):
327 |       spectrum_location = self.spectrum_location_dict[scan]
328 |       self.input_spectrum_handle.seek(spectrum_location)
329 |       # parse header lines
330 |       line = self.input_spectrum_handle.readline()
331 |       assert "BEGIN IONS" in line, "Error: wrong input BEGIN IONS"
332 |       line = self.input_spectrum_handle.readline()
333 |       assert "TITLE=" in line, "Error: wrong input TITLE="
334 |       line = self.input_spectrum_handle.readline()
335 |       assert "PEPMASS=" in line, "Error: wrong input PEPMASS="
336 |       line = self.input_spectrum_handle.readline()
337 |       assert "CHARGE=" in line, "Error: wrong input CHARGE="
338 |       line = self.input_spectrum_handle.readline()
339 |       assert "SCANS=" in line, "Error: wrong input SCANS="
340 |       line = self.input_spectrum_handle.readline()
341 |       assert "RTINSECONDS=" in line, "Error: wrong input RTINSECONDS="
342 |       # parse fragment ions
343 |       mz_list, intensity_list = self._parse_spectrum_ion()
344 |       # pre-process spectrum
345 |       (spectrum_holder,
346 |        spectrum_original_forward,
347 |        spectrum_original_backward) = process_spectrum(mz_list,
348 |                                                       intensity_list,
349 |                                                       precursor_mass)
350 |       # normalize by each individual spectrum
351 |       #~ spectrum_holder /= np.max(spectrum_holder)
352 |       #~ spectrum_original_forward /= np.max(spectrum_original_forward)
353 |       #~ spectrum_original_backward /= np.max(spectrum_original_backward)
354 |       # weight by ms1 profile
355 |       #~ spectrum_holder *= ms1_intensity
356 |       #~ spectrum_original_forward *= ms1_intensity
357 |       #~ spectrum_original_backward *= ms1_intensity
358 |       # add spectrum to the neighbor list
359 |       spectrum_holder_list.append(spectrum_holder)
360 |       spectrum_original_forward_list.append(spectrum_original_forward)
361 |       spectrum_original_backward_list.append(spectrum_original_backward)
362 |     ### padding zero arrays to the right if not enough neighbor spectra
363 |     if neighbor_right_count < neighbor_size_half:
364 |       for x in range(neighbor_size_half - neighbor_right_count):
365 |         spectrum_holder_list.append(np.zeros(
366 |             shape=(1, self.MZ_SIZE),
367 |             dtype=np.float32))
368 |         spectrum_original_forward_list.append(np.zeros(
369 |             shape=(1, self.MZ_SIZE),
370 |             dtype=np.float32))
371 |         spectrum_original_backward_list.append(np.zeros(
372 |             shape=(1, self.MZ_SIZE),
373 |             dtype=np.float32))
374 | 
375 |     spectrum_holder = np.vstack(spectrum_holder_list)
376 |     spectrum_original_forward = np.vstack(spectrum_original_forward_list)
377 |     spectrum_original_backward = np.vstack(spectrum_original_backward_list)
378 |     assert spectrum_holder.shape == (self.neighbor_size,
379 |                                      self.MZ_SIZE), "Error:shape"
380 |     # spectrum-CNN normalization: by feature
381 |     spectrum_holder /= np.max(spectrum_holder)
382 | 
383 |     # ms1_profile 
384 |     for x in range(neighbor_size_half - neighbor_left_count):
385 |       ms1_intensity_list_middle = [0.0] + ms1_intensity_list_middle
386 |     for x in range(neighbor_size_half - neighbor_right_count):
387 |       ms1_intensity_list_middle = ms1_intensity_list_middle + [0.0]
388 |     assert len(ms1_intensity_list_middle) == self.neighbor_size, "Error: ms1 profile"
389 |     ms1_profile = np.array(ms1_intensity_list_middle)
390 | 
391 |     return spectrum_holder, spectrum_original_forward, spectrum_original_backward, scan_list_middle, scan_list, ms1_profile
392 | 
393 | 
394 |   def _parse_feature(self, feature_location):
395 |     """TODO(nh2tran): docstring."""
396 | 
397 |     #~ print("".join(["="] * 80)) # section-separating line
398 |     #~ print("WorkerIO: _parse_feature()")
399 | 
400 |     self.input_feature_handle.seek(feature_location)
401 |     line = self.input_feature_handle.readline()
402 |     line = re.split(',|\r|\n', line)
403 |     feature_id = line[deepnovo_config.col_feature_id]
404 |     feature_area_str = line[deepnovo_config.col_feature_area]
405 |     feature_area = float(feature_area_str) if feature_area_str else 1.0
406 |     precursor_mz = float(line[deepnovo_config.col_precursor_mz])
407 |     precursor_charge = float(line[deepnovo_config.col_precursor_charge])
408 |     rt_mean = float(line[deepnovo_config.col_rt_mean])
409 |     raw_sequence = line[deepnovo_config.col_raw_sequence]
410 |     scan_list = re.split(';', line[deepnovo_config.col_scan_list])
411 |     ms1_list = re.split(';', line[deepnovo_config.col_ms1_list])
412 |     assert len(scan_list) == len(ms1_list), "Error: scan_list and ms1_list not matched."
413 | 
414 |     return feature_id, feature_area, precursor_mz, precursor_charge, rt_mean, raw_sequence, scan_list, ms1_list
415 | 
416 | 
417 |   def _parse_spectrum_ion(self):
418 |     """TODO(nh2tran): docstring."""
419 | 
420 |     #~ print("".join(["="] * 80)) # section-separating line
421 |     #~ print("WorkerIO: _parse_spectrum_ion()")
422 | 
423 |     # ion
424 |     mz_list = []
425 |     intensity_list = []
426 |     line = self.input_spectrum_handle.readline()
427 |     while not "END IONS" in line:
428 |       mz, intensity = re.split(' |\n', line)[:2]
429 |       mz_float = float(mz)
430 |       intensity_float = float(intensity)
431 |       # skip an ion if its mass > MZ_MAX
432 |       if mz_float > self.MZ_MAX:
433 |         line = self.input_spectrum_handle.readline()
434 |         continue
435 |       mz_list.append(mz_float)
436 |       intensity_list.append(intensity_float)
437 |       line = self.input_spectrum_handle.readline()
438 | 
439 |     return mz_list, intensity_list
440 | 
441 | 
442 |   def _print_prediction_header(self):
443 |     """TODO(nh2tran): docstring."""
444 | 
445 |     print("".join(["="] * 80)) # section-separating line
446 |     print("WorkerIO: _print_prediction_header()")
447 | 
448 |     header_list = ["feature_id",
449 |                    "feature_area",
450 |                    "predicted_sequence",
451 |                    "predicted_score",
452 |                    "predicted_position_score",
453 |                    "precursor_mz",
454 |                    "precursor_charge",
455 |                    "protein_access_id",
456 |                    "scan_list_middle",
457 |                    "scan_list_original",
458 |                    "predicted_score_max"]
459 |     header_row = "\t".join(header_list)
460 |     print(header_row, file=self.output_handle, end="\n")
461 | 
462 | class WorkerI(object):
463 |   """
464 |   This is a helper class designed for multi-process get_spectrum
465 |   """
466 |   def __init__(self, worker_io):
467 |     self.MZ_MAX = worker_io.MZ_MAX
468 |     self.MZ_SIZE = worker_io.MZ_SIZE
469 |     self.batch_size = worker_io.batch_size
470 |     self.header_seq = worker_io.header_seq
471 |     self.neighbor_size = worker_io.neighbor_size
472 | 
473 |     self.dia_window = worker_io.dia_window
474 | 
475 |     self.input_spectrum_file = worker_io.input_spectrum_file
476 |     self.input_feature_file = worker_io.input_feature_file
477 |     self.output_file = worker_io.output_file
478 | 
479 |     # split data into batches
480 |     self.feature_index_list = worker_io.feature_index_list
481 |     self.feature_index_batch_list = worker_io.feature_index_batch_list
482 |     self.feature_index_batch_count = worker_io.feature_index_batch_count
483 | 
484 |     ### store file location of each feature for random access
485 |     self.feature_location_list = worker_io.feature_location_list
486 | 
487 |     # store the file location of all spectra for random access
488 |     self.spectrum_location_dict = worker_io.spectrum_location_dict
489 |     self.spectrum_rtinseconds_dict = worker_io.spectrum_rtinseconds_dict
490 | 
491 |     # record the status of spectra that have been read
492 |     self.feature_count = worker_io.feature_count
493 |     self.spectrum_count = worker_io.spectrum_count
494 | 
495 |   def get_spectrum(self, feature_index_batch, input_feature_file_handle, input_spectrum_file_handle):
496 |     """TODO(nh2tran): docstring."""
497 | 
498 |     #~ print("".join(["="] * 80)) # section-separating line
499 |     #~ print("WorkerIO: get_spectrum()")
500 | 
501 |     spectrum_list = []
502 |     for feature_index in feature_index_batch:
503 |       # parse a feature
504 |       feature_location = self.feature_location_list[feature_index]
505 |       feature_id, feature_area, precursor_mz, precursor_charge, rt_mean, raw_sequence, scan_list, ms1_list = self._parse_feature(feature_location, input_feature_file_handle)
506 |       # skip if precursor_mass > MZ_MAX
507 |       precursor_mass = precursor_mz * precursor_charge - deepnovo_config.mass_H * precursor_charge
508 |       if precursor_mass > self.MZ_MAX:
509 |         self.feature_count["skipped"] += 1
510 |         self.feature_count["skipped_mass"] += 1
511 |         continue
512 |       self.feature_count["read"] += 1
513 | 
514 |       # parse and process spectrum
515 |       (spectrum_holder,
516 |        spectrum_original_forward,
517 |        spectrum_original_backward,
518 |        scan_list_middle,
519 |        scan_list_original,
520 |        ms1_profile) = self._parse_spectrum(precursor_mz, precursor_mass, rt_mean, scan_list, ms1_list, input_spectrum_file_handle)
521 |       # update dataset
522 |       spectrum = {"feature_id": feature_id,#str(feature_index),#scan,
523 |                   "feature_area": feature_area,
524 |                   "raw_sequence": raw_sequence,
525 |                   "precursor_mass": precursor_mass,
526 |                   "spectrum_holder": spectrum_holder,
527 |                   "spectrum_original_forward": spectrum_original_forward,
528 |                   "spectrum_original_backward": spectrum_original_backward,
529 |                   "precursor_mz": precursor_mz,
530 |                   "precursor_charge": precursor_charge,
531 |                   "scan_list_middle": scan_list_middle,
532 |                   "scan_list_original": scan_list_original,
533 |                   "ms1_profile": ms1_profile}
534 |       spectrum_list.append(spectrum)
535 | 
536 |     return spectrum_list
537 | 
538 |   def _parse_feature(self, feature_location, input_file_handle):
539 |     """TODO(nh2tran): docstring."""
540 | 
541 |     #~ print("".join(["="] * 80)) # section-separating line
542 |     #~ print("WorkerIO: _parse_feature()")
543 | 
544 |     input_file_handle.seek(feature_location)
545 |     line = input_file_handle.readline()
546 |     line = re.split(',|\r|\n', line)
547 |     feature_id = line[deepnovo_config.col_feature_id]
548 |     feature_area = 0#float(line[deepnovo_config.col_feature_area])
549 |     precursor_mz = float(line[deepnovo_config.col_precursor_mz])
550 |     precursor_charge = float(line[deepnovo_config.col_precursor_charge])
551 |     rt_mean = float(line[deepnovo_config.col_rt_mean])
552 |     raw_sequence = line[deepnovo_config.col_raw_sequence]
553 |     scan_list = re.split(';', line[deepnovo_config.col_scan_list])
554 |     ms1_list = re.split(';', line[deepnovo_config.col_ms1_list])
555 |     assert len(scan_list) == len(ms1_list), "Error: scan_list and ms1_list not matched."
556 | 
557 |     return feature_id, feature_area, precursor_mz, precursor_charge, rt_mean, raw_sequence, scan_list, ms1_list
558 | 
559 |   def _parse_spectrum(self, precursor_mz, precursor_mass, rt_mean, scan_list, ms1_list, input_file_handle):
560 |     """TODO(nh2tran): docstring."""
561 | 
562 |     #~ print("".join(["="] * 80)) # section-separating line
563 |     #~ print("WorkerIO: _parse_spectrum()")
564 | 
565 |     spectrum_holder_list = []
566 |     spectrum_original_forward_list = []
567 |     spectrum_original_backward_list = []
568 | 
569 |     ### select best neighbors from the scan_list by their distance to rt_mean
570 |     # probably move this selection to get_location(), run once rather than repeating
571 |     neighbor_count = len(scan_list)
572 |     best_scan_index = None
573 |     best_distance = float('inf')
574 |     for scan_index, scan in enumerate(scan_list):
575 |       distance = abs(self.spectrum_rtinseconds_dict[scan] - rt_mean)
576 |       if distance < best_distance:
577 |         best_distance = distance
578 |         best_scan_index = scan_index
579 |     neighbor_center = best_scan_index
580 |     neighbor_left_count = neighbor_center
581 |     neighbor_right_count = neighbor_count - neighbor_left_count - 1
582 |     neighbor_size_half = self.neighbor_size // 2
583 |     neighbor_left_count = min(neighbor_left_count, neighbor_size_half)
584 |     neighbor_right_count = min(neighbor_right_count, neighbor_size_half)
585 | 
586 |     ### padding zero arrays to the left if not enough neighbor spectra
587 |     if neighbor_left_count < neighbor_size_half:
588 |       for x in range(neighbor_size_half - neighbor_left_count):
589 |         spectrum_holder_list.append(np.zeros(
590 |             shape=(1, self.MZ_SIZE),
591 |             dtype=np.float32))
592 |         spectrum_original_forward_list.append(np.zeros(
593 |             shape=(1, self.MZ_SIZE),
594 |             dtype=np.float32))
595 |         spectrum_original_backward_list.append(np.zeros(
596 |             shape=(1, self.MZ_SIZE),
597 |             dtype=np.float32))
598 | 
599 |     ### parse and add neighbor spectra
600 |     scan_list_middle = []
601 |     ms1_intensity_list_middle = []
602 |     for index in range(neighbor_center - neighbor_left_count, neighbor_center + neighbor_right_count + 1):
603 |       scan = scan_list[index]
604 |       scan_list_middle.append(scan)
605 |       ms1_entry = ms1_list[index]
606 |       ms1_intensity = float(re.split(':', ms1_entry)[1])
607 |       ms1_intensity_list_middle.append(ms1_intensity)
608 |     ms1_intensity_max = max(ms1_intensity_list_middle)
609 |     assert ms1_intensity_max > 0.0, "Error: Zero ms1_intensity_max"
610 |     ms1_intensity_list_middle = [x/ms1_intensity_max for x in ms1_intensity_list_middle]
611 |     for scan, ms1_intensity in zip(scan_list_middle, ms1_intensity_list_middle):
612 |       spectrum_location = self.spectrum_location_dict[scan]
613 |       input_file_handle.seek(spectrum_location)
614 |       # parse header lines
615 |       line = input_file_handle.readline()
616 |       assert "BEGIN IONS" in line, "Error: wrong input BEGIN IONS"
617 |       line = input_file_handle.readline()
618 |       assert "TITLE=" in line, "Error: wrong input TITLE="
619 |       line = input_file_handle.readline()
620 |       assert "PEPMASS=" in line, "Error: wrong input PEPMASS="
621 |       line = input_file_handle.readline()
622 |       assert "CHARGE=" in line, "Error: wrong input CHARGE="
623 |       line = input_file_handle.readline()
624 |       assert "SCANS=" in line, "Error: wrong input SCANS="
625 |       line = input_file_handle.readline()
626 |       assert "RTINSECONDS=" in line, "Error: wrong input RTINSECONDS="
627 |       # parse fragment ions
628 |       mz_list, intensity_list = self._parse_spectrum_ion(input_file_handle)
629 |       # pre-process spectrum
630 |       (spectrum_holder,
631 |        spectrum_original_forward,
632 |        spectrum_original_backward) = process_spectrum(mz_list,
633 |                                                       intensity_list,
634 |                                                       precursor_mass)
635 |       # normalize by each individual spectrum
636 |       #~ spectrum_holder /= np.max(spectrum_holder)
637 |       #~ spectrum_original_forward /= np.max(spectrum_original_forward)
638 |       #~ spectrum_original_backward /= np.max(spectrum_original_backward)
639 |       # weight by ms1 profile
640 |       #~ spectrum_holder *= ms1_intensity
641 |       #~ spectrum_original_forward *= ms1_intensity
642 |       #~ spectrum_original_backward *= ms1_intensity
643 |       # add spectrum to the neighbor list
644 |       spectrum_holder_list.append(spectrum_holder)
645 |       spectrum_original_forward_list.append(spectrum_original_forward)
646 |       spectrum_original_backward_list.append(spectrum_original_backward)
647 |     ### padding zero arrays to the right if not enough neighbor spectra
648 |     if neighbor_right_count < neighbor_size_half:
649 |       for x in range(neighbor_size_half - neighbor_right_count):
650 |         spectrum_holder_list.append(np.zeros(
651 |             shape=(1, self.MZ_SIZE),
652 |             dtype=np.float32))
653 |         spectrum_original_forward_list.append(np.zeros(
654 |             shape=(1, self.MZ_SIZE),
655 |             dtype=np.float32))
656 |         spectrum_original_backward_list.append(np.zeros(
657 |             shape=(1, self.MZ_SIZE),
658 |             dtype=np.float32))
659 | 
660 |     spectrum_holder = np.vstack(spectrum_holder_list)
661 |     spectrum_original_forward = np.vstack(spectrum_original_forward_list)
662 |     spectrum_original_backward = np.vstack(spectrum_original_backward_list)
663 |     assert spectrum_holder.shape == (self.neighbor_size,
664 |                                      self.MZ_SIZE), "Error:shape"
665 |     # spectrum-CNN normalization: by feature
666 |     spectrum_holder /= np.max(spectrum_holder)
667 | 
668 |     # ms1_profile
669 |     for x in range(neighbor_size_half - neighbor_left_count):
670 |       ms1_intensity_list_middle = [0.0] + ms1_intensity_list_middle
671 |     for x in range(neighbor_size_half - neighbor_right_count):
672 |       ms1_intensity_list_middle = ms1_intensity_list_middle + [0.0]
673 |     assert len(ms1_intensity_list_middle) == self.neighbor_size, "Error: ms1 profile"
674 |     ms1_profile = np.array(ms1_intensity_list_middle)
675 | 
676 |     return spectrum_holder, spectrum_original_forward, spectrum_original_backward, scan_list_middle, scan_list, ms1_profile
677 | 
678 |   def _parse_spectrum_ion(self, input_file_handle):
679 |     """TODO(nh2tran): docstring."""
680 | 
681 |     #~ print("".join(["="] * 80)) # section-separating line
682 |     #~ print("WorkerIO: _parse_spectrum_ion()")
683 | 
684 |     # ion
685 |     mz_list = []
686 |     intensity_list = []
687 |     line = input_file_handle.readline()
688 |     while not "END IONS" in line:
689 |       mz, intensity = re.split(' |\n', line)[:2]
690 |       mz_float = float(mz)
691 |       intensity_float = float(intensity)
692 |       # skip an ion if its mass > MZ_MAX
693 |       if mz_float > self.MZ_MAX:
694 |         line = input_file_handle.readline()
695 |         continue
696 |       mz_list.append(mz_float)
697 |       intensity_list.append(intensity_float)
698 |       line = input_file_handle.readline()
699 | 
700 |     return mz_list, intensity_list
701 | 


--------------------------------------------------------------------------------