├── .gitattributes ├── LICENSE ├── README.md ├── deepnovo_cython_setup.py ├── deepnovo_main.py ├── aa_workflow_step_4_2.py ├── deepnovo_cython_modules.pyx ├── deepnovo_preprocess.py ├── deepnovo_config.py ├── plot.py ├── deepnovo_worker_test.py ├── deepnovo_postprocess.py ├── aa_workflow.py ├── deepnovo_worker_db.py ├── aa_workflow_step_5.py └── deepnovo_worker_io.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | DeepNovoAA is publicly available for non-commercial uses. 2 | Copyright (C) 2020. Authors. All rights reserved. 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepNovo-AA 2 | 3 | ## General information 4 | 5 | - Publication: Personalized deep learning of individual immunopeptidomes to identify neoantigens for cancer vaccines. Nature Machine Intelligence, 2020. (https://www.nature.com/articles/s42256-020-00260-4) 6 | 7 | - To run the workflow on an example dataset, follow step-by-step instructions and Python scripts in the file `aa_workflow.py`. 8 | -------------------------------------------------------------------------------- /deepnovo_cython_setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Hieu Tran. All Rights Reserved. 2 | # 3 | # DeepNovo is publicly available for non-commercial uses. 
def main(_):
  """Run the DeepNovo mode selected by the command-line flags.

  The flags in `deepnovo_config.FLAGS` are checked in a fixed order
  (knapsack_build, train, test_true_feeding, decode, search_denovo,
  search_db, search_hybrid, test) and only the first one that is set is
  executed.

  Args:
    _: unused positional argument; the entry point hands control to this
      function through `tf.app.run()`.

  Raises:
    SystemExit: with status 1 when none of the flags above is set.
  """

  # This module does not import `sys` at file level, so the original
  # `sys.exit()` in the error branch raised NameError instead of exiting.
  # A function-scope import keeps the fix local to this block.
  import sys

  print("main()")

  flags = deepnovo_config.FLAGS

  if flags.knapsack_build:
    deepnovo_main_modules.knapsack_build()
  elif flags.train:
    deepnovo_main_modules.train()
  elif flags.test_true_feeding:
    deepnovo_main_modules.test_true_feeding()
  elif flags.decode:
    deepnovo_main_modules.decode()
  elif flags.search_denovo:
    model = deepnovo_model.ModelInference()
    model.build_model()
    worker_io = deepnovo_worker_io.WorkerIO(
        input_spectrum_file=deepnovo_config.denovo_input_spectrum_file,
        input_feature_file=deepnovo_config.denovo_input_feature_file,
        output_file=deepnovo_config.denovo_output_file)
    worker_denovo = deepnovo_worker_denovo.WorkerDenovo()
    worker_denovo.search_denovo(model, worker_io)
  elif flags.search_db:
    model = deepnovo_model.ModelInference()
    model.build_model()
    worker_io = deepnovo_worker_io.WorkerIO(
        input_spectrum_file=deepnovo_config.db_input_spectrum_file,
        input_feature_file=deepnovo_config.db_input_feature_file,
        output_file=deepnovo_config.db_output_file)
    worker_db = deepnovo_worker_db.WorkerDB(
        db_fasta_file=deepnovo_config.db_fasta_file)
    worker_db.build_db()
    worker_db.search_db(model, worker_io)
  elif flags.search_hybrid:
    model = deepnovo_model.ModelInference()
    model.build_model()
    # Step 1: de novo search; its predictions seed the database search.
    worker_io = deepnovo_worker_io.WorkerIO(
        input_spectrum_file=deepnovo_config.hybrid_input_spectrum_file,
        input_feature_file=deepnovo_config.hybrid_input_feature_file,
        output_file=deepnovo_config.hybrid_denovo_file)
    worker_denovo = deepnovo_worker_denovo.WorkerDenovo()
    predicted_denovo_list = worker_denovo.search_denovo(model, worker_io)
    # Step 2: database search on the same inputs, writing to a separate
    # output file and using predicted_denovo_list from step 1.
    worker_io = deepnovo_worker_io.WorkerIO(
        input_spectrum_file=deepnovo_config.hybrid_input_spectrum_file,
        input_feature_file=deepnovo_config.hybrid_input_feature_file,
        output_file=deepnovo_config.hybrid_output_file)
    worker_db = deepnovo_worker_db.WorkerDB(
        db_fasta_file=deepnovo_config.hybrid_fasta_file)
    worker_db.build_db()
    worker_db.search_db(model, worker_io, predicted_denovo_list)
  elif flags.test:
    worker_test = deepnovo_worker_test.WorkerTest()
    worker_test.test_accuracy()
  else:
    print("ERROR: wrong option!")
    # Non-zero status so calling scripts can detect the failure.
    sys.exit(1)
31 | 32 | Usage: 33 | denovo_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus.minlen5.denovo_only" 34 | db_fasta_file = "data.fasta/uniprot_sprot.human.plus_contaminants.fasta" 35 | labeled_feature_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.labeled" 36 | peptide_list_fasta = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/aa_workflow.step_4.peptide_list.fasta" 37 | """ 38 | 39 | print("".join(["="] * 80)) # section-separating line 40 | print("preprocess()") 41 | 42 | print("denovo_file =", denovo_file) 43 | print("db_fasta_file =", db_fasta_file) 44 | print("labeled_feature_file =", labeled_feature_file) 45 | print("peptide_list_fasta =", peptide_list_fasta) 46 | 47 | denovo_peptide_set = set() 48 | with open(denovo_file, 'r') as fr: 49 | reader = csv.reader(fr, delimiter='\t') 50 | names = next(reader) 51 | seq_index = names.index('predicted_sequence') 52 | for line in reader: 53 | if not line[seq_index]: 54 | continue 55 | peptide = line[seq_index] 56 | peptide = drop_mod(peptide) 57 | peptide = ''.join(peptide.split(',')) 58 | if peptide in denovo_peptide_set: 59 | continue 60 | else: 61 | denovo_peptide_set.add(peptide) 62 | print("Number of top-scoring denovo peptides: {}".format(len(denovo_peptide_set))) 63 | 64 | with open(db_fasta_file, 'r') as input_fasta_handle: 65 | record_list = list(SeqIO.parse(input_fasta_handle, "fasta")) 66 | print("Number of protein sequences: ", len(record_list)) 67 | human_protein_list = [str(record.seq) for record in record_list] 68 | 69 | # remove denovo peptides that exist in the database fasta file 70 | to_L_protein_list = [change_I_to_L(protein) for protein in human_protein_list] 71 | pure_denovo_seq_set = set() 72 | for i, peptide in enumerate(denovo_peptide_set): 73 | peptide_string = change_I_to_L(peptide) 74 | indb = False 75 | for protein in to_L_protein_list: 76 | if peptide_string in protein: 77 | 
def postprocess(psm_file, output_denovo_peptide_file):
  """Collect the de novo peptides reported by PEAKS X DB search round 2.

  Reads the comma-separated PSM table `psm_file` and keeps the rows whose
  'Accession' column equals 'DENOVO'. The distinct 'Peptide' values of those
  rows, after `drop_mod_peaks` has stripped the modification tags, are
  written one per line to `output_denovo_peptide_file`. Summary counts are
  printed to stdout; nothing is returned.

  Usage:
    psm_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/aa_workflow.step_4.psm.csv"
    output_denovo_peptide_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/aa_workflow.step_4.output_peptide_list"
  """

  print("=" * 80)  # section-separating line
  print("postprocess()")

  print("psm_file =", psm_file)
  print("output_denovo_peptide_file =", output_denovo_peptide_file)

  unique_peptides = set()
  psm_count = 0  # number of PSM rows labelled DENOVO, duplicates included
  with open(psm_file, 'r') as psm_handle:
    for record in csv.DictReader(psm_handle, delimiter=','):
      stripped_peptide = drop_mod_peaks(record['Peptide'])
      if drop_mod_peaks(record['Accession']) != 'DENOVO':
        continue
      psm_count += 1
      unique_peptides.add(stripped_peptide)

  with open(output_denovo_peptide_file, 'w') as peptide_handle:
    peptide_handle.writelines(entry + '\n' for entry in unique_peptides)

  print("num_denovo_peptides =", len(unique_peptides))
  print("num_denovo_psm =", psm_count)
@cython.boundscheck(False) # turn off bounds-checking
@cython.wraparound(False) # turn off negative index wrapping
def get_location(peptide_mass, prefix_mass, direction):
  """Locate the spectrum windows of all candidate fragment ions.

  For every entry of the module-level `mass_ID_np`, the entry's mass is added
  to `prefix_mass` and the masses of 8 ion types are derived, in this row
  order: b, b-H2O, b-NH3, doubly-charged b, y, y-H2O, y-NH3, doubly-charged y.
  Each mass is converted to a bin index (rounded mass * SPECTRUM_RESOLUTION)
  and a window of WINDOW_SIZE bins is centred on it.

  Args:
    peptide_mass: total peptide mass; the complementary ion series is
      computed as `peptide_mass` minus the extended prefix mass.
    prefix_mass: mass accumulated so far in the sequencing direction.
    direction: 0 if the prefix extends the b-ion series, 1 if it extends the
      y-ion series.

  Returns:
    A tuple (ion_id_rows, aa_id_cols, location_sub50, location_plus50):
      location_sub50: int32 array of window start bins, one row per ion type
        and one column per entry of `mass_ID_np`.
      location_plus50: window end bins, `location_sub50 + WINDOW_SIZE`.
      ion_id_rows, aa_id_cols: row/column indices (from `np.nonzero`) of the
        windows lying fully inside [0, MZ_SIZE].

  Raises:
    ValueError: if `direction` is neither 0 nor 1.
  """
  if direction == 0:
    candidate_b_mass = prefix_mass + mass_ID_np
    candidate_y_mass = peptide_mass - candidate_b_mass
  elif direction == 1:
    candidate_y_mass = prefix_mass + mass_ID_np
    candidate_b_mass = peptide_mass - candidate_y_mass
  else:
    # Previously fell through and failed further down with an
    # UnboundLocalError on candidate_b_mass; fail early with a clear message.
    raise ValueError("direction must be 0 or 1, got {!r}".format(direction))

  # b-ions
  candidate_b_H2O = candidate_b_mass - mass_H2O
  candidate_b_NH3 = candidate_b_mass - mass_NH3
  candidate_b_plus2_charge1 = ((candidate_b_mass + 2 * mass_H) / 2
                               - mass_H)

  # y-ions
  candidate_y_H2O = candidate_y_mass - mass_H2O
  candidate_y_NH3 = candidate_y_mass - mass_NH3
  candidate_y_plus2_charge1 = ((candidate_y_mass + 2 * mass_H) / 2
                               - mass_H)

  # ion_8: four b-type rows followed by four y-type rows
  b_ions = [candidate_b_mass,
            candidate_b_H2O,
            candidate_b_NH3,
            candidate_b_plus2_charge1]
  y_ions = [candidate_y_mass,
            candidate_y_H2O,
            candidate_y_NH3,
            candidate_y_plus2_charge1]
  ion_mass_list = b_ions + y_ions
  ion_mass = np.array(ion_mass_list, dtype=np.float32)

  # ion locations: centre bin, then shift left by half a window to get the
  # window start
  location_sub50 = np.rint(ion_mass * SPECTRUM_RESOLUTION).astype(np.int32)
  location_sub50 -= (WINDOW_SIZE // 2)
  location_plus50 = location_sub50 + WINDOW_SIZE
  # keep only windows that fit entirely inside the spectrum array
  ion_id_rows, aa_id_cols = np.nonzero(np.logical_and(
      location_sub50 >= 0,
      location_plus50 <= MZ_SIZE))
  return ion_id_rows, aa_id_cols, location_sub50, location_plus50
def process_spectrum(spectrum_mz_list, spectrum_intensity_list, peptide_mass):
  """Bin a peak list into dense intensity vectors and add terminal markers.

  Each peak is placed at bin rint((m/z - mass_H) * SPECTRUM_RESOLUTION),
  i.e. it is treated as a singly-charged ion. Three float32 arrays of shape
  (1, MZ_SIZE) are built:

    spectrum_holder: the binned peaks, plus the complement peaks
      (peptide_mass - neutral mass), plus markers for the N-terminus, the
      C-terminus, peptide_mass_C and peptide_mass_N.
    spectrum_original_forward: the binned peaks only (copied before the
      complement peaks are added), plus markers at peptide_mass and
      peptide_mass_C.
    spectrum_original_backward: the binned peaks only, plus markers at
      peptide_mass and peptide_mass_N.

  Every marker is set to the largest intensity in the input spectrum.
  Intensities are not normalized here.

  Args:
    spectrum_mz_list: peak m/z values; converted to a float32 array.
    spectrum_intensity_list: peak intensities, same order as the m/z values.
    peptide_mass: mass used for the complement peaks and the markers.

  Returns:
    (spectrum_holder, spectrum_original_forward, spectrum_original_backward)

  NOTE(review): no range check on the computed bin indices is visible in
  this function; presumably callers only pass peaks and a peptide_mass that
  map inside [0, MZ_SIZE) -- confirm against the caller.
  """

  # neutral mass, location, assuming ion charge z=1
  charge = 1.0
  spectrum_mz = np.array(spectrum_mz_list, dtype=np.float32)
  neutral_mass = spectrum_mz - charge*deepnovo_config.mass_H
  neutral_mass_location = np.rint(neutral_mass * deepnovo_config.SPECTRUM_RESOLUTION).astype(np.int32)
  cdef int [:] neutral_mass_location_view = neutral_mass_location

  # intensity
  spectrum_intensity = np.array(spectrum_intensity_list, dtype=np.float32)
  # max intensity value, assigned below to the special marker locations
  spectrum_intensity_max = np.max(spectrum_intensity)
  # no normalization for each individual spectrum, we'll do it for multi-spectra
  norm_intensity = spectrum_intensity
  cdef float [:] norm_intensity_view = norm_intensity

  # fill spectrum holders
  spectrum_holder = np.zeros(shape=(1, deepnovo_config.MZ_SIZE), dtype=np.float32)
  cdef float [:,:] spectrum_holder_view = spectrum_holder
  # different peaks may fall into the same bin; the largest intensity is kept
  cdef int index
  for index in range(neutral_mass_location.size):
    spectrum_holder_view[0, neutral_mass_location_view[index]] = max(spectrum_holder_view[0, neutral_mass_location_view[index]],
                                                                      norm_intensity_view[index])
  # snapshots taken before the complement peaks are added, so the
  # forward/backward arrays contain only the observed peaks
  spectrum_original_forward = np.copy(spectrum_holder)
  spectrum_original_backward = np.copy(spectrum_holder)

  # add complement: mirror each peak at (peptide_mass - neutral_mass);
  # unlike the loop above, intensities are accumulated (+=) here, and only
  # locations > 0 are used
  complement_mass = peptide_mass - neutral_mass
  complement_mass_location = np.rint(complement_mass * deepnovo_config.SPECTRUM_RESOLUTION).astype(np.int32)
  cdef int [:] complement_mass_location_view = complement_mass_location
  for index in np.nonzero(complement_mass_location > 0)[0]:
    spectrum_holder_view[0, complement_mass_location_view[index]] += norm_intensity_view[index]

  # peptide_mass marker (forward and backward arrays only)
  spectrum_original_forward[0, int(round(peptide_mass * deepnovo_config.SPECTRUM_RESOLUTION))] = spectrum_intensity_max
  spectrum_original_backward[0, int(round(peptide_mass * deepnovo_config.SPECTRUM_RESOLUTION))] = spectrum_intensity_max

  # N-terminal, b-ion, peptide_mass_C
  # append N-terminal
  mass_N = deepnovo_config.mass_N_terminus - deepnovo_config.mass_H
  spectrum_holder[0, int(round(mass_N * deepnovo_config.SPECTRUM_RESOLUTION))] = spectrum_intensity_max
  # append peptide_mass_C
  mass_C = deepnovo_config.mass_C_terminus + deepnovo_config.mass_H
  peptide_mass_C = peptide_mass - mass_C
  spectrum_holder[0, int(round(peptide_mass_C * deepnovo_config.SPECTRUM_RESOLUTION))] = spectrum_intensity_max
  spectrum_original_forward[0, int(round(peptide_mass_C * deepnovo_config.SPECTRUM_RESOLUTION))] = spectrum_intensity_max

  # C-terminal, y-ion, peptide_mass_N
  # append C-terminal
  mass_C = deepnovo_config.mass_C_terminus + deepnovo_config.mass_H
  spectrum_holder[0, int(round(mass_C * deepnovo_config.SPECTRUM_RESOLUTION))] = spectrum_intensity_max
  # append peptide_mass_N
  mass_N = deepnovo_config.mass_N_terminus - deepnovo_config.mass_H
  peptide_mass_N = peptide_mass - mass_N
  spectrum_holder[0, int(round(peptide_mass_N * deepnovo_config.SPECTRUM_RESOLUTION))] = spectrum_intensity_max
  spectrum_original_backward[0, int(round(peptide_mass_N * deepnovo_config.SPECTRUM_RESOLUTION))] = spectrum_intensity_max

  return spectrum_holder, spectrum_original_forward, spectrum_original_backward
def split_feature_training_noshare(input_feature_file, proportion):
  """Randomly split a feature file into train/valid/test files for training.
  train/valid/test do NOT SHARE PEPTIDES.

  The first time a peptide (column 'seq') is seen it is assigned to one of
  the three splits by a random draw with probabilities `proportion`; every
  later row with the same peptide goes to the same split. The outputs are
  written next to the input as `<input>.train.noshare`,
  `<input>.valid.noshare` and `<input>.test.noshare`, each with the input's
  header line. Summary counts are printed to stdout.

  Args:
    input_feature_file: path of a CSV feature file with a 'seq' column.
    proportion: three probabilities [train, valid, test], passed as `p` to
      `np.random.choice`.

  Usage:
    input_feature_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.labeled.mass_corrected"
    proportion = [0.90, 0.05, 0.05]
    split_feature_training_noshare(input_feature_file, proportion)
  """

  print("split_feature_training_noshare()")

  print("input_feature_file = ", input_feature_file)
  print("proportion = ", proportion)

  output_file_train = input_feature_file + ".train" + ".noshare"
  output_file_valid = input_feature_file + ".valid" + ".noshare"
  output_file_test = input_feature_file + ".test" + ".noshare"
  print("output_file_train =", output_file_train)
  print("output_file_valid =", output_file_valid)
  print("output_file_test =", output_file_test)

  num_total = 0
  num_unique = 0
  # rows written per split, indexed like `proportion`: 0=train, 1=valid, 2=test
  num_rows = [0, 0, 0]
  # peptide -> split index; a dict lookup replaces the linear scans over
  # three growing lists that made the original quadratic in unique peptides
  peptide_split = {}

  # `with` guarantees all four handles are flushed and closed, including when
  # a row raises; the original never closed any of them.
  with open(input_feature_file) as input_handle, \
       open(output_file_train, mode='w') as train_handle, \
       open(output_file_valid, mode='w') as valid_handle, \
       open(output_file_test, mode='w') as test_handle:
    # read and write header line
    csv_reader = csv.DictReader(input_handle)
    csv_writers = [csv.DictWriter(handle, csv_reader.fieldnames)
                   for handle in (train_handle, valid_handle, test_handle)]
    for csv_writer in csv_writers:
      csv_writer.writeheader()

    # iterate over feature rows
    # if the peptide already exists, use the corresponding writer
    # if not, use random numbers 0/1/2 to assign writers train/valid/test
    for row in csv_reader:
      num_total += 1
      peptide = row['seq']
      split = peptide_split.get(peptide)
      if split is None:
        num_unique += 1
        # one draw per new peptide, same call as before, so the assignment
        # under a fixed numpy seed is unchanged
        random_num = np.random.choice(a=3, size=1, p=proportion)
        split = int(random_num[0])
        peptide_split[peptide] = split
      num_rows[split] += 1
      csv_writers[split].writerow(row)

  print("num_total =", num_total)
  print("num_unique =", num_unique)
  print("num_train =", num_rows[0])
  print("num_valid =", num_rows[1])
  print("num_test =", num_rows[2])
def correct_mass_shift_ppm(input_feature_file, ppm):
  """Correct precursor_mz given ppm: corrected_mz = precursor_mz / (1 + ppm / 1e6).

  Copies `input_feature_file` to `<input_feature_file>.mass_corrected`,
  replacing the 'm/z' column of every row by the corrected value. All other
  columns are copied unchanged.

  Args:
    input_feature_file: path of a CSV feature file with an 'm/z' column.
    ppm: mass shift in parts per million.

  Usage:
    input_feature_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv"
    correct_mass_shift_ppm(input_feature_file, ppm)
  """

  print("correct_mass_shift_ppm()")

  print("input_feature_file = ", input_feature_file)
  print("ppm =", ppm)

  output_feature_file = input_feature_file + ".mass_corrected"
  print("output_feature_file =", output_feature_file)

  # Loop-invariant divisor, computed once.
  shift_factor = 1 + ppm / 1e6

  # `with` closes (and flushes) both files even if a row cannot be parsed;
  # the original left both handles open.
  with open(input_feature_file) as input_handle, \
       open(output_feature_file, mode='w') as output_handle:
    csv_reader = csv.DictReader(input_handle)
    csv_writer = csv.DictWriter(output_handle, csv_reader.fieldnames)
    csv_writer.writeheader()
    for row in csv_reader:
      precursor_mz = float(row['m/z'])
      row['m/z'] = precursor_mz / shift_factor
      csv_writer.writerow(row)
294 | 295 | Usage: 296 | input_feature_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv" 297 | split_feature_unlabel(input_feature_file) 298 | """ 299 | 300 | print(''.join(['='] * 80)) # section-separating line 301 | print("split_feature_unlabel()") 302 | print("input_feature_file =", input_feature_file) 303 | 304 | output_file_labeled = input_feature_file + ".labeled" 305 | output_file_unlabeled = input_feature_file + ".unlabeled" 306 | print("output_file_labeled =", output_file_labeled) 307 | print("output_file_unlabeled =", output_file_unlabeled) 308 | 309 | num_labeled = 0 310 | num_unlabeled = 0 311 | 312 | # read and write header line 313 | csv_reader = csv.DictReader(open(input_feature_file)) 314 | csv_writer_labeled = csv.DictWriter(open(output_file_labeled, mode='w'), csv_reader.fieldnames) 315 | csv_writer_unlabeled = csv.DictWriter(open(output_file_unlabeled, mode='w'), csv_reader.fieldnames) 316 | csv_writer_labeled.writeheader() 317 | csv_writer_unlabeled.writeheader() 318 | 319 | # iterate over feature rows 320 | # unlabeled features have empty peptide sequence 321 | for row in csv_reader: 322 | peptide = row['seq'] 323 | if peptide == '': 324 | csv_writer = csv_writer_unlabeled 325 | num_unlabeled += 1 326 | else: 327 | csv_writer = csv_writer_labeled 328 | num_labeled += 1 329 | csv_writer.writerow(row) 330 | 331 | print("num_labeled =", num_labeled) 332 | print("num_unlabeled =", num_unlabeled) 333 | 334 | 335 | # merge multiple mgf files into one, adding fraction ID to scan ID 336 | def merge_mgf_file(input_file_list, fraction_list, output_file): 337 | """Merge multiple mgf files into one, adding fraction ID to scan ID. 
338 | 339 | Usage: 340 | folder_path = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/" 341 | fraction_list = range(0, 10+1) 342 | merge_mgf_file( 343 | input_file_list=[folder_path + "export_" + str(i) + ".mgf" for i in fraction_list], 344 | fraction_list=fraction_list, 345 | output_file=folder_path + "spectrum.mgf") 346 | """ 347 | 348 | print("merge_mgf_file()") 349 | 350 | # iterate over mgf files and their lines 351 | counter = 0 352 | with open(output_file, mode="w") as output_handle: 353 | for input_file, fraction in zip(input_file_list, fraction_list): 354 | print("input_file = ", os.path.join(input_file)) 355 | with open(input_file, mode="r") as input_handle: 356 | for line in input_handle: 357 | if "SCANS=" in line: # a spectrum found 358 | counter += 1 359 | scan = re.split('=|\n|\r', line)[1] 360 | # re-number scan id 361 | output_handle.write("SCANS=F{0}:{1}\n".format(fraction, scan)) 362 | else: 363 | output_handle.write(line) 364 | print("output_file = {0:s}".format(output_file)) 365 | print("counter = {0:d}".format(counter)) 366 | 367 | 368 | # merge multiple feature files into one, adding fraction ID to feature & scan ID 369 | def merge_feature_file(input_file_list, fraction_list, output_file): 370 | """Merge multiple feature files into one, adding fraction ID to feature & scan ID. 
371 | 372 | Usage: 373 | folder_path = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/" 374 | fraction_list = range(0, 10+1) 375 | merge_feature_file( 376 | input_file_list=[folder_path + "export_" + str(i) + ".csv" for i in fraction_list], 377 | fraction_list=fraction_list, 378 | output_file=folder_path + "feature.csv") 379 | """ 380 | 381 | print("merge_feature_file()") 382 | 383 | # read and write header line 384 | csv_reader = csv.DictReader(open(input_file_list[0])) 385 | csv_writer = csv.DictWriter(open(output_file, mode='w'), csv_reader.fieldnames) 386 | csv_writer.writeheader() 387 | 388 | # iterate over feature files and their rows 389 | counter = 0 390 | for input_file, fraction in zip(input_file_list, fraction_list): 391 | print("input_file = ", os.path.join(input_file)) 392 | csv_reader = csv.DictReader(open(input_file)) 393 | for row in csv_reader: 394 | counter += 1 395 | # add fraction to feature id 396 | feature_id = row['spec_group_id'] 397 | feature_id = "F" + str(fraction) + ":" + feature_id 398 | row['spec_group_id'] = feature_id 399 | # add fraction to scan id 400 | scan_list = re.split(';', row['scans']) 401 | scan_list = ["F" + str(fraction) + ":" + x for x in scan_list] 402 | row['scans'] = ";".join(scan_list) 403 | # join the line back together and write to output 404 | csv_writer.writerow(row) 405 | print("output_file = {0:s}".format(output_file)) 406 | print("counter = {0:d}".format(counter)) 407 | 408 | 409 | -------------------------------------------------------------------------------- /deepnovo_config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Hieu Tran. All Rights Reserved. 2 | # 3 | # DeepNovo is publicly available for non-commercial uses. 
# ==============================================================================

"""Global configuration for DeepNovo.

Importing this module registers the command-line flags, then defines the
vocabulary, the amino-acid mass table, spectrum resolution constants, network
hyper-parameters and input/output file names as module-level variables. Most
values are echoed to stdout with print() at import time.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf


# ==============================================================================
# FLAGS (options) for this app
# ==============================================================================


tf.app.flags.DEFINE_string("train_dir", # flag_name
                           "train", # default_value
                           "Training directory.") # docstring

tf.app.flags.DEFINE_boolean("reset_step",
                            False, # default_value
                            "Set to true to reset the global step after loading a pretrained model.")

tf.app.flags.DEFINE_integer("direction",
                            2,
                            "Set to 0/1/2 for Forward/Backward/Bi-directional.")

tf.app.flags.DEFINE_boolean("use_intensity",
                            True,
                            "Set to True to use intensity-model.")

tf.app.flags.DEFINE_boolean("shared",
                            False,
                            "Set to True to use shared weights.")

tf.app.flags.DEFINE_boolean("use_lstm",
                            True,
                            "Set to True to use lstm-model.")

tf.app.flags.DEFINE_boolean("lstm_kmer",
                            False,
                            "Set to True to use lstm model on k-mers instead of full sequence.")

tf.app.flags.DEFINE_boolean("knapsack_build",
                            False,
                            "Set to True to build knapsack matrix.")

tf.app.flags.DEFINE_boolean("train",
                            False,
                            "Set to True for training.")

tf.app.flags.DEFINE_boolean("test_true_feeding",
                            False,
                            "Set to True for testing.")

tf.app.flags.DEFINE_boolean("decode",
                            False,
                            "Set to True for decoding.")

tf.app.flags.DEFINE_boolean("beam_search",
                            False,
                            "Set to True for beam search.")

tf.app.flags.DEFINE_integer("beam_size",
                            5,
                            "Number of optimal paths to search during decoding.")

tf.app.flags.DEFINE_boolean("search_db",
                            False,
                            "Set to True to do a database search.")

tf.app.flags.DEFINE_boolean("search_denovo",
                            False,
                            "Set to True to do a denovo search.")

tf.app.flags.DEFINE_boolean("search_hybrid",
                            False,
                            "Set to True to do a hybrid, db+denovo, search.")

tf.app.flags.DEFINE_boolean("test",
                            False,
                            "Set to True to test the prediction accuracy.")

tf.app.flags.DEFINE_boolean("header_seq",
                            True,
                            "Set to False if peptide sequence is not provided.")

tf.app.flags.DEFINE_boolean("decoy",
                            False,
                            "Set to True to search decoy database.")

tf.app.flags.DEFINE_integer("multiprocessor",
                            1,
                            "Use multi processors to read data during training.")


# I/O arguments
# NOTE: each default below is just the flag's own name, i.e. a placeholder
# rather than a usable path.
tf.app.flags.DEFINE_string("train_spectrum",
                           "train_spectrum",
                           "Spectrum mgf file to train a new model.")
tf.app.flags.DEFINE_string("train_feature",
                           "train_feature",
                           "Feature csv file to train a new model.")
tf.app.flags.DEFINE_string("valid_spectrum",
                           "valid_spectrum",
                           "Spectrum mgf file for validation during training.")
tf.app.flags.DEFINE_string("valid_feature",
                           "valid_feature",
                           "Feature csv file for validation during training.")
tf.app.flags.DEFINE_string("test_spectrum",
                           "test_spectrum",
                           "Spectrum mgf file for testing.")
tf.app.flags.DEFINE_string("test_feature",
                           "test_feature",
                           "Feature csv file for testing.")
tf.app.flags.DEFINE_string("denovo_spectrum",
                           "denovo_spectrum",
                           "Spectrum mgf file to perform de novo sequencing.")
tf.app.flags.DEFINE_string("denovo_feature",
                           "denovo_feature",
                           "Feature csv file to perform de novo sequencing.")
tf.app.flags.DEFINE_string("target_file",
                           "target_file",
                           "Target file to calculate the prediction accuracy.")
tf.app.flags.DEFINE_string("predicted_file",
                           "predicted_file",
                           "Predicted file to calculate the prediction accuracy.")


FLAGS = tf.app.flags.FLAGS


# ==============================================================================
# GLOBAL VARIABLES for VOCABULARY
# ==============================================================================


# Special vocabulary symbols - we always put them at the start.
_PAD = "_PAD"
_GO = "_GO"
_EOS = "_EOS"
_START_VOCAB = [_PAD, _GO, _EOS]

# IDs of the special symbols; they equal the symbols' positions in _START_VOCAB
PAD_ID = 0
GO_ID = 1
EOS_ID = 2

# amino-acid symbols, including the modified residues used as separate tokens
vocab_reverse = ['A',
                 'R',
                 'N',
                 'N(Deamidation)',
                 'D',
                 'C',
                 #'C(Carbamidomethylation)',
                 'E',
                 'Q',
                 'Q(Deamidation)',
                 'G',
                 'H',
                 'I',
                 'L',
                 'K',
                 'M',
                 'M(Oxidation)',
                 'F',
                 'P',
                 'S',
                 'T',
                 'W',
                 'Y',
                 'V',
                ]

# vocab_reverse maps ID -> symbol (list index); vocab maps symbol -> ID
vocab_reverse = _START_VOCAB + vocab_reverse
print("vocab_reverse ", vocab_reverse)

vocab = dict([(x, y) for (y, x) in enumerate(vocab_reverse)])
print("vocab ", vocab)

vocab_size = len(vocab_reverse)
print("vocab_size ", vocab_size)


# ==============================================================================
# GLOBAL VARIABLES for THEORETICAL MASS
# ==============================================================================


# NOTE(review): values are presumably monoisotopic masses in Da -- confirm
mass_H = 1.0078
mass_H2O = 18.0106
mass_NH3 = 17.0265
mass_N_terminus = 1.0078
mass_C_terminus = 17.0027
mass_CO = 27.9949

# mass of every vocabulary symbol; must contain a key for each entry of
# vocab_reverse because mass_ID below indexes it by symbol
mass_AA = {'_PAD': 0.0,
           '_GO': mass_N_terminus-mass_H,
           '_EOS': mass_C_terminus+mass_H,
           'A': 71.03711, # 0
           'R': 156.10111, # 1
           'N': 114.04293, # 2
           'N(Deamidation)': 115.02695,
           'D': 115.02694, # 3
           'C': 103.00919, # 4
           #'C(Carbamidomethylation)': 161.01919, # C(+58.01) # orbi
           #'C(Carbamidomethylation)': 160.03065, # C(+57.02)
           'E': 129.04259, # 5
           'Q': 128.05858, # 6
           'Q(Deamidation)': 129.0426,
           'G': 57.02146, # 7
           'H': 137.05891, # 8
           'I': 113.08406, # 9
           'L': 113.08406, # 10
           'K': 128.09496, # 11
           'M': 131.04049, # 12
           'M(Oxidation)': 147.0354,
           'F': 147.06841, # 13
           'P': 97.05276, # 14
           'S': 87.03203, # 15
           'T': 101.04768, # 16
           'W': 186.07931, # 17
           'Y': 163.06333, # 18
           'V': 99.06841, # 19
          }

# masses ordered by vocabulary ID, as a list and as a float32 array
mass_ID = [mass_AA[vocab_reverse[x]] for x in range(vocab_size)]
mass_ID_np = np.array(mass_ID, dtype=np.float32)

mass_AA_min = mass_AA["G"] # 57.02146


# ==============================================================================
# GLOBAL VARIABLES for PRECISION, RESOLUTION, temp-Limits of MASS & LEN
# ==============================================================================


# if change, need to re-compile cython_speedup << NO NEED
# ~ SPECTRUM_RESOLUTION = 10 # bins for 1.0 Da = precision 0.1 Da
# ~ SPECTRUM_RESOLUTION = 20 # bins for 1.0 Da = precision 0.05 Da
# ~ SPECTRUM_RESOLUTION = 40 # bins for 1.0 Da = precision 0.025 Da
SPECTRUM_RESOLUTION = 50 # bins for 1.0 Da = precision 0.02 Da
# ~ SPECTRUM_RESOLUTION = 100 # bins for 1.0 Da = precision 0.01 Da
print("SPECTRUM_RESOLUTION ", SPECTRUM_RESOLUTION)

# if change, need to re-compile cython_speedup << NO NEED
WINDOW_SIZE = 10 # 10 bins
print("WINDOW_SIZE ", WINDOW_SIZE)

# skip peptide mass > MZ_MAX
MZ_MAX = 3000.0
# number of bins needed to cover [0, MZ_MAX] at SPECTRUM_RESOLUTION bins per Da
MZ_SIZE = int(MZ_MAX * SPECTRUM_RESOLUTION) # 30k

KNAPSACK_AA_RESOLUTION = 10000 # 0.0001 Da
mass_AA_min_round = int(round(mass_AA_min * KNAPSACK_AA_RESOLUTION)) # 57.02146
KNAPSACK_MASS_PRECISION_TOLERANCE = 100 # 0.01 Da
num_position = 0

PRECURSOR_MASS_PRECISION_TOLERANCE = 0.01

# ONLY for accuracy evaluation
# ~ PRECURSOR_MASS_PRECISION_INPUT_FILTER = 0.01
# ~ PRECURSOR_MASS_PRECISION_INPUT_FILTER = 1000
AA_MATCH_PRECISION = 0.1

# during training or test_true_feeding:
# skip peptide length > MAX_LEN
# assign peptides to buckets of the same length for efficient padding
# NOTE(review): MAX_LEN and _buckets are only defined when one of these flags
# is set. The two print() calls are assumed to belong inside the `if` (they
# would raise NameError otherwise) -- confirm against the original file.
if FLAGS.train or FLAGS.test_true_feeding:
    MAX_LEN = 30
    _buckets = [12, 22, 32]
    print("MAX_LEN ", MAX_LEN)
    print("_buckets ", _buckets)


# ==============================================================================
# HYPER-PARAMETERS of the NEURAL NETWORKS
# ==============================================================================


num_ion = 8 # 2
print("num_ion ", num_ion)

l2_weight = 0.0
print("l2_weight ", l2_weight)

embedding_size = 512
print("embedding_size ", embedding_size)

num_layers = 1
num_units = 512
print("num_layers ", num_layers)
print("num_units ", num_units)

# NOTE(review): presumably dropout keep-probabilities for the conv and dense
# layers -- confirm in the model code
keep_conv = 0.75
keep_dense = 0.5
print("keep_conv ", keep_conv)
print("keep_dense ", keep_dense)

max_gradient_norm = 5.0
print("max_gradient_norm ", max_gradient_norm)

# DIA model parameters
neighbor_size = 5 # allow up to ? spectra, including the main spectrum
dia_window = 20.0 # the window size of MS2 scan in Dalton
focal_loss = True

batch_size = 32
print("batch_size ", batch_size)

epoch_stop = 10
print("epoch_stop ", epoch_stop)

train_stack_size = 1000
valid_stack_size = 5000
test_stack_size = 5000 # for test_true_feeding
#decode_stack_size = 1000 # for beam_search, deprecated
print("train_stack_size ", train_stack_size)
print("valid_stack_size ", valid_stack_size)
print("test_stack_size ", test_stack_size)
#print("decode_stack_size ", decode_stack_size)

steps_per_checkpoint = 100
print("steps_per_checkpoint ", steps_per_checkpoint)


# ==============================================================================
# INPUT/OUTPUT FILES
# ==============================================================================


# pre-built knapsack matrix
knapsack_file = "knapsack.npy"

# training/testing/decoding files
# (taken from the command-line flags; the commented lines are example paths)
# ~ input_spectrum_file_train = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/spectrum.mgf"
# ~ input_feature_file_train = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.labeled.mass_corrected.train.noshare"
# ~ input_spectrum_file_valid = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/spectrum.mgf"
# ~ input_feature_file_valid = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.labeled.mass_corrected.valid.noshare"
# ~ input_spectrum_file_test = "data.training/aa.hla.bassani.nature_2016.mel_15.class_1/spectrum.mgf"
# ~ input_feature_file_test = "data.training/aa.hla.bassani.nature_2016.mel_15.class_1/feature.csv.labeled.mass_corrected.test.noshare"
input_spectrum_file_train = FLAGS.train_spectrum
input_feature_file_train = FLAGS.train_feature
input_spectrum_file_valid = FLAGS.valid_spectrum
input_feature_file_valid = FLAGS.valid_feature
input_spectrum_file_test = FLAGS.test_spectrum
input_feature_file_test = FLAGS.test_feature

# denovo files
# ~ denovo_input_spectrum_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/spectrum.mgf"
# ~ denovo_input_feature_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.mass_corrected"
denovo_input_spectrum_file = FLAGS.denovo_spectrum
denovo_input_feature_file = FLAGS.denovo_feature
# de novo results are written next to the input feature file
denovo_output_file = denovo_input_feature_file + ".deepnovo_denovo"

# test accuracy
predicted_format = "deepnovo"
# ~ target_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.labeled.mass_corrected"
# ~ predicted_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus.minlen5"
target_file = FLAGS.target_file
predicted_file = FLAGS.predicted_file
# all accuracy outputs are derived from the predicted file name
accuracy_file = predicted_file + ".accuracy"
denovo_only_file = predicted_file + ".denovo_only"
scan2fea_file = predicted_file + ".scan2fea"
multifea_file = predicted_file + ".multifea"

# feature file column format
col_feature_id = 0
col_precursor_mz = 1
col_precursor_charge = 2
col_rt_mean = 3
col_raw_sequence = 4
col_scan_list = 5
col_ms1_list = 6
col_feature_area = 7
col_num = 8
# predicted file column format
pcol_feature_id = 0
pcol_feature_area = 1
pcol_sequence = 2
pcol_score = 3
pcol_position_score = 4
pcol_precursor_mz = 5
pcol_precursor_charge = 6
pcol_protein_id = 7
pcol_scan_list_middle = 8
pcol_scan_list_original = 9
pcol_score_max = 10


# ==============================================================================
# DB SEARCH PARAMETERS
# ==============================================================================


data_format = "mgf"
cleavage_rule = "trypsin"
num_missed_cleavage = 2
fixed_mod_list = ['C']
var_mod_list = ['N', 'Q', 'M']
num_mod = 3
precursor_mass_tolerance = 0.01 # Da
precursor_mass_ppm = 15.0/1000000 # ppm (20 better) # instead of absolute 0.01 Da
topk_output = 1

# db files
# ~ db_fasta_file = "data/uniprot_sprot.human.db_decoy.fasta"
# ~ db_input_spectrum_file = "data.training/dia.pecan.hela.2018_03_29/testing.spectrum.mgf"
# ~ db_input_feature_file = "data.training/dia.abrf.2018_03_27/testing.feature.csv.2k"
# ~ db_output_file = db_input_feature_file + ".deepnovo_db"
# ~ if FLAGS.decoy:
# ~   db_output_file += ".decoy"

# hybrid files
# ~ hybrid_fasta_file = "data/uniprot_sprot.human.db_decoy.fasta"
# ~ hybrid_input_spectrum_file = "data.training/dia.abrf.2018_03_27/prediction.spectrum.mgf"
# ~ hybrid_input_feature_file = "data.training/dia.abrf.2018_03_27/prediction.feature.csv.part1"
# ~ hybrid_denovo_file = hybrid_input_feature_file + ".deepnovo_hybrid_denovo"
# ~ hybrid_output_file = hybrid_input_feature_file + ".deepnovo_hybrid"
# ~ if FLAGS.decoy:
# ~   hybrid_output_file += ".decoy"


# ------------------------------------------------------------------------------
# End of deepnovo_config.py. The statements below are the module header of the
# next file in this dump (plot.py), which shares the same source line.
# ------------------------------------------------------------------------------
import re
import math
import numpy as np
import pandas as pd
import csv
import matplotlib
# select the non-interactive Agg backend before pyplot is imported
matplotlib.use('Agg')
import matplotlib.pyplot as pyplot
from matplotlib_venn import venn2
from matplotlib_venn import venn3
matplotlib.rcParams.update({'font.size': 11})
from scipy import stats
def read_netmhcpan_csv(input_file, num_allele):
    """Read the best binding affinity and rank per peptide from a NetMHCpan csv.

    Args:
      input_file: path of a comma-separated file with columns 'nM1'..'nM<k>'
        and 'Rank1'..'Rank<k>', k = num_allele.
      num_allele: number of alleles, i.e. of nM/Rank column pairs to scan.

    Returns:
      (best_nM_list, best_rank_list): for every row, the minimum nM and the
      minimum rank over the alleles, as floats, in file order.
    """

    allele_ids = [str(x) for x in range(1, num_allele + 1)]
    best_nM_list = []
    best_rank_list = []
    with open(input_file, 'r') as input_handle:
        for row in csv.DictReader(input_handle, delimiter=','):
            best_nM_list.append(min(float(row['nM' + x]) for x in allele_ids))
            best_rank_list.append(min(float(row['Rank' + x]) for x in allele_ids))
    return best_nM_list, best_rank_list


def draw_figure2_boxplot_netmhcpan():
    """Draw the box plot of NetMHCpan binding ranks: de novo vs database vs IEDB.

    Reads three hard-coded csv files from the working directory, saves the plot
    to "figure2.boxplot_rank.png", then prints the log-median rank of each group
    and the Mann-Whitney U test between the de novo and IEDB ranks. Only the
    ranks are plotted; the nM values are read but not used.
    """

    num_allele = 4
    denovo_path = "deepnovo.aa.figure_2g.netmhcpan_denovo.csv"
    db_path = "deepnovo.aa.figure_2g.netmhcpan_db.csv"
    iedb_path = "deepnovo.aa.figure_2g.netmhcpan_iedb.csv"
    _, denovo_rank_list = read_netmhcpan_csv(denovo_path, num_allele)
    _, db_rank_list = read_netmhcpan_csv(db_path, num_allele)
    _, iedb_rank_list = read_netmhcpan_csv(iedb_path, num_allele)

    colors = ['red', 'dodgerblue', 'lightgrey']
    rank_list = [denovo_rank_list, db_rank_list, iedb_rank_list]
    print([len(x) for x in rank_list])
    _, ax = pyplot.subplots()
    rank_plot = pyplot.boxplot(rank_list, labels=['De novo', 'Database', 'IEDB'], patch_artist=True)
    for patch, color in zip(rank_plot['boxes'], colors):
        patch.set_facecolor(color)
    ax.set_yscale('log')
    ax.set_ylabel('Binding affinity rank (%, log-scale)')
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    # 2% and 0.5% threshold
    pyplot.plot([0, 6], [2, 2], color='black', linestyle='--', linewidth=1)
    pyplot.plot([0, 6], [0.5, 0.5], color='black', linestyle='--', linewidth=1)
    pyplot.savefig("figure2.boxplot_rank.png")

    print("np.log(np.median(denovo_rank_list)) =", np.log(np.median(denovo_rank_list)))
    print("np.log(np.median(db_rank_list)) =", np.log(np.median(db_rank_list)))
    print("np.log(np.median(iedb_rank_list))", np.log(np.median(iedb_rank_list)))
    mannwhitneyu, pvalue = stats.mannwhitneyu(denovo_rank_list, iedb_rank_list)
    print("mannwhitneyu =", mannwhitneyu)
    print("pvalue =", pvalue)

# ~ draw_figure2_boxplot_netmhcpan()


def read_immuno_csv(input_file):
    """Return the 'score' column of a comma-separated file as a list of floats."""

    with open(input_file, 'r') as input_handle:
        return [float(row['score']) for row in csv.DictReader(input_handle, delimiter=',')]


def draw_figure2_boxplot_immuno():
    """Draw the box plot of immunogenicity scores: de novo vs database.

    Reads two hard-coded csv files from the working directory, saves the plot to
    "figure2.boxplot_immuno.png", then prints the median of each group and the
    Mann-Whitney U test between them.
    """

    denovo_path = "deepnovo.aa.figure_S5.mel_16.immuno_denovo.csv"
    db_path = "deepnovo.aa.figure_S5.mel_16.immuno_db.csv"
    denovo_score_list = read_immuno_csv(denovo_path)
    db_score_list = read_immuno_csv(db_path)

    colors = ['red', 'dodgerblue']
    score_list = [denovo_score_list, db_score_list]
    print([len(x) for x in score_list])
    _, ax = pyplot.subplots()
    score_plot = pyplot.boxplot(score_list, labels=['De novo', 'Database'], patch_artist=True)
    for patch, color in zip(score_plot['boxes'], colors):
        patch.set_facecolor(color)
    ax.set_ylabel('Immunogenicity')
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    # zero-score reference line
    pyplot.plot([0, 6], [0., 0.], color='black', linestyle='--', linewidth=1)
    pyplot.savefig("figure2.boxplot_immuno.png")

    print("np.median(denovo_score_list) =", np.median(denovo_score_list))
    print("np.median(db_score_list) =", np.median(db_score_list))
    mannwhitneyu, pvalue = stats.mannwhitneyu(denovo_score_list, db_score_list)
    print("mannwhitneyu =", mannwhitneyu)
    print("pvalue =", pvalue)

# ~ draw_figure2_boxplot_immuno()


def draw_figure2_venn():
    """Draw the 3-set Venn diagram of de novo, database and IEDB peptides.

    The peptide sets are read from three sheets of a hard-coded Excel workbook;
    the diagram is saved to "venn3.svg".
    """

    file_path = "temp.manuscript/deepnovo.aa.figure_2.step6.xlsx"
    denovo_set = set(pd.read_excel(file_path, sheet_name='denovo_peptide')['denovo_peptide'].values)
    db_set = set(pd.read_excel(file_path, sheet_name='db_peptide')['db_peptide'].values)
    iedb_set = set(pd.read_excel(file_path, sheet_name='iedb_peptide')['iedb_peptide'].values)
    set_labels = ('De novo', 'Database', 'IEDB')
    venn_plot = venn3(subsets=[denovo_set, db_set, iedb_set], set_labels=set_labels)
    # colour the three "exclusive" regions: de novo only / db only / IEDB only
    venn_plot.get_patch_by_id('100').set_color('red')
    venn_plot.get_patch_by_id('100').set_alpha(0.75)
    venn_plot.get_patch_by_id('010').set_color('skyblue')
    venn_plot.get_patch_by_id('010').set_alpha(1.0)
    venn_plot.get_patch_by_id('001').set_color('grey')
    venn_plot.get_patch_by_id('001').set_alpha(0.5)
    pyplot.savefig("venn3.svg")

# ~ draw_figure2_venn()


def plot_spectrum_array(spectrum_array, figure_name):
    """Plot every row of a 2-D spectrum array as its own subplot and save it.

    Args:
      spectrum_array: 2-D array; row i is drawn in the i-th of
        spectrum_array.shape[0] vertically stacked subplots, y-axis fixed to
        [0, 1].
      figure_name: path of the output figure.
    """
    print("plot_spectrum_array()")

    # The module imports matplotlib.pyplot under the name `pyplot`; the former
    # `plt` references were undefined and raised NameError on every call.
    figure = pyplot.figure(1)
    spectrum_count = spectrum_array.shape[0]
    for index in range(spectrum_count):
        pyplot.subplot(spectrum_count, 1, index+1)
        pyplot.plot(spectrum_array[index,:])
        pyplot.ylim((0.0,1.0))
    pyplot.show()
    figure.savefig(figure_name)
    pyplot.close()

#~ plot_spectrum_array(np.load("spectrum_original_forward.npy"), "spectrum_original_forward.pdf")


def read_feature_id(input_file, split_char):
    """Return the set of feature IDs found in the first column of a file.

    Args:
      input_file: path of a delimited text file whose first line is a header.
      split_char: regular expression used to split each line, e.g. ',|\\r|\\n'.

    Returns:
      A set with the first field of every line after the header.
    """

    feature_set = set()
    with open(input_file, 'r') as handle:
        handle.readline()  # skip the header line
        for line in handle:
            feature_set.add(re.split(split_char, line)[0])
    return feature_set
read_feature_id("data.training/dia.urine.2018_03_29/peaks.denovo.csv.top.feature_id", '\t|\r|\n') 196 | #~ deepnovo_set = read_feature_id("data.training/dia.urine.2018_03_29/testing.unlabeled.csv.deepnovo_denovo.minlen_5.top", '\t|\r|\n') 197 | #~ set_labels = ("PEAKS DB", "PEAKS Denovo", "DeepNovo-DIA") 198 | #~ set_labels = ("", "", "") 199 | #~ venn_plot = venn3(subsets=[peaks_set, peaks_set2, deepnovo_set], set_labels=set_labels) 200 | #~ for text in venn_plot.set_labels: 201 | #~ text.set_fontsize(16) 202 | #~ pyplot.savefig("figure2.venn2.peaks_18_20_deepnovo.png") 203 | 204 | 205 | # figure2.bar.aa/peptide.png 206 | def draw_figure2_bar(y_value, y_label, figure_file): 207 | 208 | fig, ax = pyplot.subplots() 209 | x_value = range(1, len(y_value)+1) 210 | bar_10k, bar_5k, bar_2k = pyplot.bar(x_value, y_value, width=0.4, align='center') 211 | bar_10k.set_facecolor('g') 212 | bar_10k.set_alpha(0.5) 213 | bar_5k.set_facecolor('lightskyblue') 214 | bar_2k.set_facecolor('blue') 215 | bar_2k.set_alpha(0.7) 216 | for index, value in zip(x_value, y_value): 217 | ax.text(index-0.1, value + 3, str(value), color='black') 218 | ax.set_xticks(x_value) 219 | ax.set_xticklabels(['Top 10k', 'Top 5k', 'Top 2k']) 220 | ax.set_xlim([0, 4]) 221 | ax.set_ylim([0, 100]) 222 | ax.set_ylabel(y_label) 223 | ax.spines['right'].set_visible(False) 224 | ax.spines['top'].set_visible(False) 225 | ax.xaxis.set_ticks_position('bottom') 226 | ax.yaxis.set_ticks_position('left') 227 | pyplot.savefig(figure_file) 228 | 229 | #~ denovo_only = [41.7, 22.1, 5.5] 230 | #~ draw_figure2_bar(denovo_only, 'Denovo only peptides on top of database (%)', 231 | #~ 'figure2.bar.denovo_only.png') 232 | #~ aa_accuracy = [76.2, 83.6, 94.2] 233 | #~ draw_figure2_bar(aa_accuracy, 'Amino acid accuracy (%)', 'figure2.bar.aa.png') 234 | #~ peptide_accuracy = [41.4, 53.0, 79.9] 235 | #~ draw_figure2_bar(peptide_accuracy, 'Peptide accuracy (%)', 'figure2.bar.peptide.png') 236 | 237 | 238 | def 
read_feature_accuracy(input_file, split_char): 239 | 240 | feature_list = [] 241 | with open(input_file, 'r') as handle: 242 | header_line = handle.readline() 243 | for line in handle: 244 | line = re.split(split_char, line) 245 | feature = {} 246 | feature["feature_id"] = line[0] 247 | feature["feature_area_log10"] = math.log10(max(float(line[1]), 1.0)) 248 | feature["predicted_score"] = float(line[4]) 249 | feature["recall_AA"] = float(line[5]) 250 | feature["predicted_len"] = float(line[6]) 251 | feature_list.append(feature) 252 | return feature_list 253 | 254 | 255 | # figure2.accuracy.area.png 256 | def draw_figure2_accuracy_area(accuracy_file): 257 | 258 | feature_list = read_feature_accuracy(accuracy_file, '\t|\r|\n') 259 | num_features = len(feature_list) 260 | feature_area_log10 = [f['feature_area_log10'] for f in feature_list] 261 | #~ x_max = int(max(feature_area_log10)) 262 | x_range = np.arange(3, 11, 1.0) 263 | x_value = [] 264 | y_accuracy = [] 265 | y_proportion = [] 266 | #~ for x in range(1, x_max+1): 267 | for x in x_range: 268 | feature_x = [f for f in feature_list if x-0.5 < f['feature_area_log10'] <= x+0.5] 269 | recall_AA = sum([f['recall_AA'] for f in feature_x]) 270 | target_len = sum([f['predicted_len'] for f in feature_x]) 271 | if target_len > 0: 272 | x_value.append(x) 273 | y_accuracy.append(100*recall_AA/target_len) 274 | y_proportion.append(100.0 * len(feature_x) / num_features) 275 | 276 | fig, left_ax = pyplot.subplots() 277 | pyplot.bar(x_value, y_proportion, width=0.6, align='center', color='salmon', alpha=0.75) 278 | left_ax.set_xlabel('Feature abundance (log10 scale)') 279 | left_ax.set_ylabel('Proportion of features (%)', color='salmon') 280 | left_ax.tick_params('y', colors='salmon') 281 | left_ax.set_xlim([0, 12]) 282 | left_ax.set_ylim([0, 100]) 283 | left_ax.spines['top'].set_visible(False) 284 | left_ax.xaxis.set_ticks_position('bottom') 285 | for index, value in zip(x_value, y_proportion): 286 | if value > 0: 287 | 
def get_accuracy_score(accuracy_file):
    """Compute amino acid accuracy at ten confidence-score cutoffs.

    The records are loaded with read_feature_accuracy(). A record's
    confidence is 100*exp(predicted_score). For every cutoff in
    10, 20, ..., 100 the accuracy is 100 * sum(recall_AA) / sum(predicted_len)
    over the records whose confidence is at least the cutoff; it is reported
    as 0 when no predicted residue remains.

    Returns:
        (x_value, y_value): the list of cutoffs and the matching accuracies.
    """

    records = read_feature_accuracy(accuracy_file, '\t|\r|\n')
    num_value = 10
    step = 100 // num_value
    x_value = [step * k for k in range(1, num_value + 1)]

    # the confidence of a record does not depend on the cutoff
    confidence = [100*math.exp(r['predicted_score']) for r in records]

    y_value = []
    for cutoff in x_value:
        kept = [r for r, c in zip(records, confidence) if cutoff <= c]
        matched = sum(r['recall_AA'] for r in kept)
        predicted = sum(r['predicted_len'] for r in kept)
        y_value.append(100*matched/predicted if predicted > 0 else 0)

    return x_value, y_value
"data.training/aa.hla.bassani.nature_2016.mel_15.class_1/feature.csv.labeled.mass_corrected.test.noshare.deepnovo_denovo.accuracy" 333 | accuracy_file_generic = "data.training/aa.hla.bassani.nature_2016.mel_15.class_1/train.exclude_mel_15/feature.csv.labeled.mass_corrected.test.noshare.deepnovo_denovo.accuracy" 334 | x_test, y_test = get_accuracy_score(accuracy_file) 335 | x_test_generic, y_test_generic = get_accuracy_score(accuracy_file_generic) 336 | fig, ax = pyplot.subplots() 337 | plot_cutoff_y, = pyplot.plot([0, 100], [95, 95], ':', linewidth=1.0, color='black', markeredgecolor='black', alpha=1.0) 338 | plot_test_generic, = pyplot.plot(x_test_generic, y_test_generic, '--o', linewidth=1.0, color='orange', markeredgecolor='orange', alpha=1.0) 339 | plot_test, = pyplot.plot(x_test, y_test, '-s', linewidth=1.0, color='red', markeredgecolor='red', alpha=1.0) 340 | # ~ plot_cutoff_x, = pyplot.plot([59.5, 59.5], [0, 100], '--', linewidth=1.0, color='black', markeredgecolor='black', alpha=0.75) 341 | # ~ plot_cutoff_x, = pyplot.plot([61.9, 61.9], [0, 100], '--', linewidth=1.0, color='black', markeredgecolor='black', alpha=0.75) 342 | pyplot.legend([plot_test, plot_test_generic, plot_cutoff_y], ['Personalized model', 'Generic model', '95% cutoff'], loc='lower right') 343 | pyplot.yticks([80, 83.5, 86.7, 90, 95, 100], ['80', '83.5', '86.7', '90', '95', '100']) 344 | # ~ pyplot.title('DeepNovo confidence score for quality control') 345 | ax.set_xlabel('De novo confidence score') 346 | ax.set_xlim([0, 105]) 347 | ax.set_ylim([80, 101]) 348 | ax.set_ylabel('Amino acid accuracy (%)') 349 | ax.spines['right'].set_visible(False) 350 | ax.spines['top'].set_visible(False) 351 | ax.xaxis.set_ticks_position('bottom') 352 | ax.yaxis.set_ticks_position('left') 353 | pyplot.savefig('figure2.accuracy.score.png') 354 | # ~ pyplot.savefig('figure2.accuracy.score.svg') 355 | 356 | # ~ draw_figure2_accuracy_score() 357 | 358 | 359 | 360 | #~ db_file = 
"data.training/dia.pecan.plasma.2018_03_28/testing.feature.csv.deepnovo_denovo" 361 | #~ db_abundance = read_feature_abundance(db_file, '\t|\r|\n') 362 | #~ db_abundance_log10 = np.log10(np.array(db_abundance)) 363 | #~ denovo_top_file = "data.training/dia.pecan.plasma.2018_03_28/testing.unlabeled.csv.deepnovo_denovo.len_5.7k" 364 | #~ denovo_abundance = read_feature_abundance(denovo_top_file, '\t|\r|\n') 365 | #~ denovo_abundance_log10 = np.log10(np.array(denovo_abundance)) 366 | #~ print(len(db_abundance_log10)) 367 | #~ print(len(denovo_abundance_log10)) 368 | #~ n, bins, patches = pyplot.hist(db_abundance_log10, 50, facecolor='salmon', alpha=0.5) 369 | #~ n, bins, patches = pyplot.hist(denovo_abundance_log10, 50, facecolor='green', alpha=0.5) 370 | #~ pyplot.xlabel('Feature abundance (log10 scale)') 371 | #~ pyplot.ylabel('Number of features') 372 | #~ pyplot.savefig('figure2.hist.denovo.png') 373 | -------------------------------------------------------------------------------- /deepnovo_worker_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Hieu Tran. All Rights Reserved. 2 | # 3 | # DeepNovo is publicly available for non-commercial uses. 4 | # ============================================================================== 5 | 6 | """TODO(nh2tran): docstring.""" 7 | 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | import re 13 | import sys 14 | 15 | import numpy as np 16 | 17 | import deepnovo_config 18 | 19 | class WorkerTest(object): 20 | """TODO(nh2tran): docstring. 21 | The WorkerTest should be stand-alone and separated from other workers. 
def test_accuracy(self, db_peptide_list=None):
    """Compare predicted peptides with target peptides and report accuracy.

    Targets are loaded with self._get_target() and predictions with
    self._get_predicted() (when self.predicted_format is "deepnovo") or
    self._get_predicted_peaks() (otherwise). Four tab-separated reports are
    written:
      * self.accuracy_file: one row per prediction whose feature_id has a
        usable target, with the best-matching candidate sequence;
      * self.denovo_only_file: one row per prediction without such a target;
      * self.scan2fea_file: for every scan, the features that list it;
      * self.multifea_file: for every feature, its scans shared with other
        features.
    Summary counts, recalls and precisions are printed to stdout. A ratio
    whose denominator is zero is printed as "nan".

    Args:
      db_peptide_list: optional container of database peptides. When given,
        only targets whose sequence (with the M/N/Q variable modifications
        stripped) is found in it are kept; otherwise all targets are used.
        NOTE(review): membership is tested with a list of residue tokens, so
        the entries are presumably lists too -- confirm against the caller.
    """

    print("".join(["="] * 80))  # section-separating line
    print("WorkerTest.test_accuracy()")

    def ratio(numerator, denominator):
        # an empty target or prediction set must not abort the summary
        # after all report files have already been written
        return numerator / denominator if denominator else float("nan")

    accuracy_header = ["feature_id",
                       "feature_area",
                       "target_sequence",
                       "predicted_sequence",
                       "predicted_score",
                       "recall_AA",
                       "predicted_len",
                       "target_len",
                       "scan_list_middle",
                       "scan_list_original"]
    denovo_only_header = ["feature_id",
                          "feature_area",
                          "predicted_sequence",
                          "predicted_score",
                          "predicted_score_max",
                          "scan_list_middle",
                          "scan_list_original"]

    self._get_target()
    target_count_total = len(self.target_dict)
    target_len_total = sum(len(x) for x in self.target_dict.values())

    # this part is tricky!
    # some target peptides are reported by PEAKS DB but not found in
    # db_peptide_list due to mistakes in cleavage rules.
    # if db_peptide_list is given, we only consider those target peptides,
    # otherwise, use all target peptides
    if db_peptide_list is None:
        target_dict_db = self.target_dict
    else:
        # remove the extension 'mod' from variable modifications
        unmodified = {'M(Oxidation)': 'M',
                      'N(Deamidation)': 'N',
                      'Q(Deamidation)': 'Q'}
        target_dict_db = {}
        for feature_id, target in self.target_dict.items():
            target_simplified = [unmodified.get(x, x) for x in target]
            if target_simplified in db_peptide_list:
                target_dict_db[feature_id] = target
            else:
                print("target not found: ", target_simplified)
    target_count_db = len(target_dict_db)
    target_len_db = sum(len(x) for x in target_dict_db.values())

    # we also skip target peptides with precursor_mass > MZ_MAX
    target_dict_db_mass = {}
    for feature_id, peptide in target_dict_db.items():
        if self._compute_peptide_mass(peptide) <= self.MZ_MAX:
            target_dict_db_mass[feature_id] = peptide
    target_count_db_mass = len(target_dict_db_mass)
    target_len_db_mass = sum(len(x) for x in target_dict_db_mass.values())

    # read predicted peptides from deepnovo or peaks
    if self.predicted_format == "deepnovo":
        self._get_predicted()
    else:
        self._get_predicted_peaks()

    # note that the prediction has already skipped precursor_mass > MZ_MAX
    # we also skip predicted peptides whose feature_id's are not in
    # target_dict_db_mass
    predicted_count_mass = len(self.predicted_list)
    predicted_count_mass_db = 0
    predicted_len_mass_db = 0
    predicted_only = 0
    # the recall is calculated on remaining peptides
    recall_AA_total = 0.0
    recall_peptide_total = 0.0

    # scan id -> ids of the features that list this scan
    scan_dict = {}

    # both report handles are closed even if a prediction row is malformed
    with open(self.accuracy_file, 'w') as accuracy_handle, \
            open(self.denovo_only_file, 'w') as denovo_only_handle:
        print("\t".join(accuracy_header), file=accuracy_handle, end="\n")
        print("\t".join(denovo_only_header), file=denovo_only_handle, end="\n")

        for predicted in self.predicted_list:
            feature_id = predicted["feature_id"]
            feature_area = str(predicted["feature_area"])
            feature_scan_list_middle = predicted["scan_list_middle"]
            feature_scan_list_original = predicted["scan_list_original"]
            if feature_scan_list_original:
                for scan in re.split(';|\r|\n', feature_scan_list_original):
                    scan_dict.setdefault(scan, []).append(feature_id)

            if feature_id in target_dict_db_mass:
                predicted_count_mass_db += 1

                target = target_dict_db_mass[feature_id]
                target_len = len(target)
                # the target ids are the same for every candidate
                target_AA_id = [deepnovo_config.vocab[x] for x in target]

                # if >= 1 denovo peptides reported, calculate the best
                # accuracy; ties are broken by the higher score
                best_recall_AA = 0
                best_sequence = predicted["sequence"][0]
                best_score = predicted["score"][0]
                for candidate, candidate_score in zip(predicted["sequence"],
                                                      predicted["score"]):
                    candidate_AA_id = [deepnovo_config.vocab[x]
                                       for x in candidate]
                    recall_AA = self._match_AA_novor(target_AA_id,
                                                     candidate_AA_id)
                    if (recall_AA > best_recall_AA
                            or (recall_AA == best_recall_AA
                                and candidate_score > best_score)):
                        best_recall_AA = recall_AA
                        best_sequence = candidate
                        best_score = candidate_score

                recall_AA_total += best_recall_AA
                if best_recall_AA == target_len:
                    recall_peptide_total += 1
                predicted_len = len(best_sequence)
                predicted_len_mass_db += predicted_len

                # convert to string format to print out
                print_list = [feature_id,
                              feature_area,
                              ",".join(target),
                              ",".join(best_sequence),
                              "{0:.2f}".format(best_score),
                              "{0:d}".format(best_recall_AA),
                              "{0:d}".format(predicted_len),
                              "{0:d}".format(target_len),
                              feature_scan_list_middle,
                              feature_scan_list_original]
                print("\t".join(print_list), file=accuracy_handle, end="\n")
            else:
                predicted_only += 1
                predicted_sequence = ';'.join(
                    [','.join(x) for x in predicted["sequence"]])
                predicted_score = ';'.join(
                    ['{0:.2f}'.format(x) for x in predicted["score"]])
                if predicted["score"]:
                    predicted_score_max = '{0:.2f}'.format(
                        max(predicted["score"]))
                else:
                    predicted_score_max = ''
                print_list = [feature_id,
                              feature_area,
                              predicted_sequence,
                              predicted_score,
                              predicted_score_max,
                              feature_scan_list_middle,
                              feature_scan_list_original]
                print("\t".join(print_list), file=denovo_only_handle, end="\n")

    # feature id -> "scan:count" entries for scans shared by > 1 feature
    multifea_dict = {}
    for scan_id, feature_list in scan_dict.items():
        feature_count = len(feature_list)
        if feature_count > 1:
            for feature_id in feature_list:
                multifea_dict.setdefault(feature_id, []).append(
                    scan_id + ':' + str(feature_count))

    with open(self.scan2fea_file, 'w') as handle:
        header_list = ["scan_id",
                       "feature_count",
                       "feature_list"]
        print("\t".join(header_list), file=handle, end="\n")
        for scan_id, feature_list in scan_dict.items():
            print_list = [scan_id,
                          str(len(feature_list)),
                          ";".join(feature_list)]
            print("\t".join(print_list), file=handle, end="\n")

    with open(self.multifea_file, 'w') as handle:
        header_list = ["feature_id",
                       "scan_list"]
        print("\t".join(header_list), file=handle, end="\n")
        for feature_id, scan_list in multifea_dict.items():
            print_list = [feature_id,
                          ";".join(scan_list)]
            print("\t".join(print_list), file=handle, end="\n")

    print("target_count_total = {0:d}".format(target_count_total))
    print("target_len_total = {0:d}".format(target_len_total))
    print("target_count_db = {0:d}".format(target_count_db))
    print("target_len_db = {0:d}".format(target_len_db))
    print("target_count_db_mass: {0:d}".format(target_count_db_mass))
    print("target_len_db_mass: {0:d}".format(target_len_db_mass))
    print()

    print("predicted_count_mass: {0:d}".format(predicted_count_mass))
    print("predicted_count_mass_db: {0:d}".format(predicted_count_mass_db))
    print("predicted_len_mass_db: {0:d}".format(predicted_len_mass_db))
    print("predicted_only: {0:d}".format(predicted_only))
    print()

    print("recall_AA_total = {0:.4f}".format(
        ratio(recall_AA_total, target_len_total)))
    print("recall_AA_db = {0:.4f}".format(
        ratio(recall_AA_total, target_len_db)))
    print("recall_AA_db_mass = {0:.4f}".format(
        ratio(recall_AA_total, target_len_db_mass)))
    print("recall_peptide_total = {0:.4f}".format(
        ratio(recall_peptide_total, target_count_total)))
    print("recall_peptide_db = {0:.4f}".format(
        ratio(recall_peptide_total, target_count_db)))
    print("recall_peptide_db_mass = {0:.4f}".format(
        ratio(recall_peptide_total, target_count_db_mass)))
    print("precision_AA_mass_db = {0:.4f}".format(
        ratio(recall_AA_total, predicted_len_mass_db)))
    print("precision_peptide_mass_db = {0:.4f}".format(
        ratio(recall_peptide_total, predicted_count_mass_db)))
def _get_predicted_peaks(self):
    """Load predictions from a comma-separated PEAKS result file.

    Reads self.predicted_file (skipping its header line) and stores the
    resulting records in self.predicted_list. The feature_id is built as
    "F<column 0>:<column 1>" and the peptide is parsed from column 3 with
    self._parse_sequence(). Peptides whose mass exceeds self.MZ_MAX are
    dropped. These records carry no abundance, scan lists or real score, so
    placeholders (0, "" and -999) are stored for them.
    """

    print("".join(["="] * 80))  # section-separating line
    print("WorkerTest._get_predicted_peaks()")

    col_fraction_id = 0
    col_scan_id = 1
    col_sequence = 3
    # only needed by an alternative feature_id scheme that remaps the
    # fraction id; that scheme is currently not applied
    fraction_id_map = {'1': '1',
                       '2': '10',
                       '3': '11',
                       '4': '12',
                       '5': '2',
                       '6': '3',
                       '7': '4',
                       '8': '5',
                       '9': '6',
                       '10': '7',
                       '11': '8',
                       '12': '9',
                       }

    records = []
    with open(self.predicted_file, 'r') as handle:
        handle.readline()  # header
        for line in handle:
            fields = re.split(',|\n', line)
            feature_id = "F" + fields[col_fraction_id] + ":" + fields[col_scan_id]
            raw_sequence = fields[col_sequence]
            assert raw_sequence, "Error: wrong format."
            peptide = self._parse_sequence(raw_sequence)
            # skip peptides with precursor_mass > MZ_MAX
            if self._compute_peptide_mass(peptide) > self.MZ_MAX:
                continue
            records.append({
                "feature_id": feature_id,
                # a single candidate, wrapped to match the multi-candidate
                # layout produced for the "deepnovo" format
                "sequence": [peptide],
                "feature_area": 0,
                "scan_list_middle": "",
                "scan_list_original": "",
                "score": [-999],
            })

    self.predicted_list = records
def _parse_sequence(self, raw_sequence):
    """Split a raw peptide string into a list of amino acid tokens.

    Every character is one residue, except for a parenthesised mass shift,
    which is merged into the residue just before it:
      C(+57.02) -> 'C(Carbamidomethylation)'
      M(+15.99) -> 'M(Oxidation)'
      N(+.98)   -> 'N(Deamidation)'
      Q(+.98)   -> 'Q(Deamidation)'

    Args:
      raw_sequence: peptide string, e.g. "AC(+57.02)DM(+15.99)K".

    Returns:
      The list of tokens, e.g. ['A', 'C(Carbamidomethylation)', 'D',
      'M(Oxidation)', 'K']. An empty string gives an empty list.

    Raises:
      SystemExit: with exit status 1 (after printing a diagnostic) when a
        "(" does not start a known modification of the preceding residue.
        This includes a "(" at the very start of the sequence, which has no
        preceding residue at all.
    """

    # residue -> (mass-shift tag in the raw string, resulting token)
    known_modifications = {
        "C": ("(+57.02)", "C(Carbamidomethylation)"),
        "M": ("(+15.99)", "M(Oxidation)"),
        "N": ("(+.98)", "N(Deamidation)"),
        "Q": ("(+.98)", "Q(Deamidation)"),
    }

    raw_sequence_len = len(raw_sequence)
    peptide = []
    index = 0
    while index < raw_sequence_len:
        if raw_sequence[index] != "(":
            peptide.append(raw_sequence[index])
            index += 1
            continue

        # a modification needs a residue to attach to; an empty peptide
        # here means the sequence starts with "("
        rule = known_modifications.get(peptide[-1]) if peptide else None
        if rule is None or not raw_sequence.startswith(rule[0], index):
            print("ERROR: unknown modification!")
            print("raw_sequence = ", raw_sequence)
            # non-zero status so that callers and shells see the failure
            sys.exit(1)
        tag, token = rule
        peptide[-1] = token
        index += len(tag)

    return peptide
predicted_mass[j]) < 0.1: 454 | #~ if decoder_input[index_aa] == output[index_aa]: 455 | num_match += 1 456 | i += 1 457 | j += 1 458 | elif target_mass_cum[i] < predicted_mass_cum[j]: 459 | i += 1 460 | else: 461 | j += 1 462 | 463 | return num_match 464 | 465 | -------------------------------------------------------------------------------- /deepnovo_postprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Hieu Tran. All Rights Reserved. 2 | # 3 | # DeepNovo is publicly available for non-commercial uses. 4 | # ============================================================================== 5 | 6 | """TODO(nh2tran): docstring.""" 7 | 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | import os 13 | import random 14 | import sys 15 | import re 16 | 17 | from Bio import SeqIO 18 | from Bio.SeqIO import FastaIO 19 | import Levenshtein 20 | 21 | import csv 22 | import numpy as np 23 | import math 24 | import deepnovo_config 25 | 26 | 27 | def compute_peptide_mass(peptide): 28 | """TODO(nh2tran): docstring. 
def read_feature_accuracy(input_file):
    """Read a tab-separated accuracy file into a list of feature dicts.

    The file must have a header row naming at least the columns
    'feature_id', 'feature_area', 'predicted_score', 'recall_AA' and
    'predicted_len'. In the returned dicts 'feature_area' holds the log10 of
    the area (the area is clamped below at 1.0 first) and the score, recall
    and length are converted to float.
    """

    with open(input_file, 'r') as input_handle:
        return [
            {
                'feature_id': record['feature_id'],
                'feature_area': math.log10(max(float(record['feature_area']), 1.0)),
                'predicted_score': float(record['predicted_score']),
                'recall_AA': float(record['recall_AA']),
                'predicted_len': float(record['predicted_len']),
            }
            for record in csv.DictReader(input_handle, delimiter='\t')
        ]
def select_top_score(input_file, output_file, score_cutoff):
    """Keep only the de novo results whose confidence score reaches a cutoff.

    Copies the header and every row of the tab-separated `input_file` whose
    'predicted_score' is >= `score_cutoff` to `output_file`. A row with an
    empty score is treated as having score -999.

    Usage:
      accuracy_cutoff = 0.95
      accuracy_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.labeled.mass_corrected.test.noshare.deepnovo_denovo.accuracy"
      score_cutoff = find_score_cutoff(accuracy_file, accuracy_cutoff)
      input_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.mass_corrected.deepnovo_denovo"
      output_file = input_file + ".top95"
      select_top_score(input_file, output_file, score_cutoff)
    """

    print("".join(["="] * 80))  # section-separating line
    print("select_top_score()")

    print('input_file = ', input_file)
    print('output_file = ', output_file)
    print('score_cutoff = ', score_cutoff)

    total_feature = 0
    select_feature = 0
    with open(input_file, 'r') as source, open(output_file, 'w') as sink:
        reader = csv.DictReader(source, delimiter='\t')
        writer = csv.DictWriter(sink, reader.fieldnames, delimiter='\t')
        writer.writeheader()
        for record in reader:
            total_feature += 1
            raw_score = record['predicted_score']
            # features without any prediction have an empty score field
            if raw_score:
                score = float(raw_score)
            else:
                score = -999
            if score >= score_cutoff:
                writer.writerow(record)
                select_feature += 1
    print('total_feature = ', total_feature)
    print('select_feature = ', select_feature)
124 | 125 | Usage: 126 | input_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.mass_corrected.deepnovo_denovo.top95" 127 | output_file = input_file + ".I_to_L" 128 | convert_I_to_L(input_file, output_file) 129 | """ 130 | 131 | print("".join(["="] * 80)) # section-separating line 132 | print("convert_I_to_L()") 133 | 134 | print('input_file = ', input_file) 135 | print('output_file = ', output_file) 136 | 137 | with open(input_file, 'r') as input_handle: 138 | with open(output_file, 'w') as output_handle: 139 | csv_reader = csv.DictReader(input_handle, delimiter='\t') 140 | csv_reader.fieldnames.append('before_I_to_L') 141 | csv_writer = csv.DictWriter(output_handle, csv_reader.fieldnames, delimiter='\t') 142 | csv_writer.writeheader() 143 | for row in csv_reader: 144 | predicted_sequence = row['predicted_sequence'] 145 | row['before_I_to_L'] = predicted_sequence 146 | row['predicted_sequence'] = predicted_sequence.replace('I', 'L') 147 | csv_writer.writerow(row) 148 | 149 | 150 | def compute_distance(predicted_sequence, consensus_sequence): 151 | """TODO(nh2tran): docstring. 
def correct_by_consensus(input_file, output_file):
    """Correct de novo sequencing errors as following:
      group predicted sequences of the same mass together;
      vote the consensus sequence;
      replace the predicted by the consensus to correct errors: AB-BA, Q-AG, N-GG, etc.

    The output has the columns of the input plus 'before_consensus', which
    keeps the sequence as it was before the correction. Rows with an empty
    'predicted_sequence' are counted but not written to the output.

    Usage:
      input_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L"
      output_file = input_file + ".consensus"
      correct_by_consensus(input_file, output_file)
    """

    print("".join(["="] * 80))  # section-separating line
    print("correct_by_consensus()")

    print('input_file = ', input_file)
    print('output_file = ', output_file)

    total_feature = 0
    empty_feature = 0
    # rounded mass index -> features whose sequence has that mass
    mass_dict = {}
    with open(input_file, 'r') as input_handle, \
            open(output_file, 'w') as output_handle:
        csv_reader = csv.DictReader(input_handle, delimiter='\t')
        csv_reader.fieldnames.append('before_consensus')
        csv_writer = csv.DictWriter(output_handle, csv_reader.fieldnames, delimiter='\t')
        csv_writer.writeheader()

        # build the sequence mass dictionary
        # all sequences with the same mass are grouped together
        # (same mass after scaling by KNAPSACK_AA_RESOLUTION and rounding)
        for row in csv_reader:
            total_feature += 1
            predicted_sequence = row['predicted_sequence']
            # skip empty sequences that DeepNovo couldn't find a suitable
            # candidate with the given mass
            if predicted_sequence == '':
                empty_feature += 1
                continue
            # save the original predicted sequence before correcting it later
            row['before_consensus'] = predicted_sequence

            predicted_sequence = predicted_sequence.split(',')
            sequence_mass_index = int(round(compute_peptide_mass(predicted_sequence)
                                            * deepnovo_config.KNAPSACK_AA_RESOLUTION))
            feature = {'row': row,
                       'predicted_sequence': predicted_sequence,
                       'predicted_score': float(row['predicted_score'])}
            mass_dict.setdefault(sequence_mass_index, []).append(feature)

        # check if all sequences have been assigned
        assigned_feature = sum(len(group) for group in mass_dict.values())
        assert total_feature - empty_feature == assigned_feature

        # for each group of sequences of the same mass,
        #   vote the consensus sequence;
        #   calculate Levenshtein distance between each sequence and the consensus;
        #   if 1 <= distance <= 2, replace the sequence by the consensus;
        #     (distance = 2 examples: AB-BA, Q-AG, N-GG)
        #   write to output.
        for group in mass_dict.values():
            if len(group) == 1:
                consensus_sequence = group[0]['predicted_sequence']
            else:
                # vote the consensus sequence
                # the easy way is to find the sequence with the highest
                # score and frequency: every occurrence of a sequence adds
                # its 100*exp(score) to the vote of that sequence
                # (more complicated ways: De Bruijn graph, alignment)
                consensus_candidate = {}
                for feature in group:
                    candidate = ','.join(feature['predicted_sequence'])
                    vote = 100*math.exp(feature['predicted_score'])
                    consensus_candidate[candidate] = (
                        consensus_candidate.get(candidate, 0) + vote)
                # iterating the dict yields its keys on Python 2 and 3
                # alike, unlike the Python-2-only iterkeys()
                consensus_sequence = max(consensus_candidate,
                                         key=consensus_candidate.get)
                consensus_sequence = consensus_sequence.split(',')

            # calculate distance, correct sequence by the consensus, write to output
            for feature in group:
                distance = compute_distance(feature['predicted_sequence'], consensus_sequence)
                if 1 <= distance <= 2:
                    feature['row']['predicted_sequence'] = ','.join(consensus_sequence)
                csv_writer.writerow(feature['row'])

    print('total_feature = ', total_feature)
    print('empty_feature = ', empty_feature)
    print('assigned_feature = ', assigned_feature)
266 | 267 | Usage: 268 | minlen = 5 269 | input_file = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus" 270 | output_file = input_file + ".minlen" + str(minlen) 271 | filter_by_minlen(input_file, output_file, minlen) 272 | """ 273 | 274 | print("".join(["="] * 80)) # section-separating line 275 | print("filter_by_minlen()") 276 | print('input_file = ', input_file) 277 | print('output_file = ', output_file) 278 | print('minlen = ', minlen) 279 | 280 | total_feature = 0 281 | minlen_feature = 0 282 | removed_feature = 0 283 | with open(input_file, 'r') as input_handle: 284 | with open(output_file, 'w') as output_handle: 285 | csv_reader = csv.DictReader(input_handle, delimiter='\t') 286 | csv_writer = csv.DictWriter(output_handle, csv_reader.fieldnames, delimiter='\t') 287 | csv_writer.writeheader() 288 | for row in csv_reader: 289 | total_feature += 1 290 | predicted_sequence_len = len(re.split(',', row['predicted_sequence'])) 291 | if predicted_sequence_len >= minlen: 292 | csv_writer.writerow(row) 293 | minlen_feature += 1 294 | else: 295 | removed_feature += 1 296 | print('total_feature = ', total_feature) 297 | print('minlen_feature = ', minlen_feature) 298 | print('removed_feature = ', removed_feature) 299 | 300 | 301 | def database_lookup(input_fasta_file, input_denovo_file, output_file, split_char, col_sequence): 302 | 303 | print("".join(["="] * 80)) # section-separating line 304 | print("database_lookup()") 305 | 306 | print('input_fasta_file = ', input_fasta_file) 307 | print('input_denovo_file = ', input_denovo_file) 308 | print('output_file = ', output_file) 309 | 310 | with open(input_fasta_file, 'r') as input_fasta_handle: 311 | record_list = list(SeqIO.parse(input_fasta_handle, "fasta")) 312 | print("Number of protein sequences: ", len(record_list)) 313 | 314 | total_count = 0 315 | db_count = 0 316 | denovo_count = 0 317 | with open(input_denovo_file, 'r') as 
input_denovo_handle: 318 | with open(output_file, 'w') as output_handle: 319 | # header 320 | header_line = input_denovo_handle.readline() 321 | print(header_line, file=output_handle, end="") 322 | for line in input_denovo_handle: 323 | total_count += 1 324 | line_split = re.split(split_char, line) 325 | line_split = line_split[:-1] # exclude the last empty "" 326 | predicted_sequence = line_split[col_sequence] 327 | predicted_sequence = predicted_sequence.replace(',', '') 328 | predicted_sequence = predicted_sequence.replace('C(Carbamidomethylation)', 'C') 329 | indb = False 330 | for record in record_list: 331 | if predicted_sequence in record.seq: 332 | indb = True 333 | break 334 | if indb: 335 | db_count += 1 336 | line_split.append("db") 337 | else: 338 | denovo_count += 1 339 | line_split.append("denovo") 340 | print('\t'.join(line_split), file=output_handle, end="\n") 341 | print('total_count = ', total_count) 342 | print('db_count = ', db_count) 343 | print('denovo_count = ', denovo_count) 344 | 345 | # ~ input_fasta_file = "data/uniprot_sprot.human.fasta" 346 | # ~ input_denovo_file = "data.training/bassani.hla.2018_10_18.correct_mass_shift/unidentified_features.csv.deepnovo_denovo.top95" 347 | # ~ output_file = input_denovo_file + ".lookup" 348 | # ~ split_char = '\t|\n' 349 | # ~ col_sequence = 2 350 | # ~ database_lookup(input_fasta_file, input_denovo_file, output_file, split_char, col_sequence) 351 | 352 | 353 | def select_top_k(input_file, output_file, top_k, split_char, col_score): 354 | """TODO(nh2tran): docstring.""" 355 | 356 | print("".join(["="] * 80)) # section-separating line 357 | print("select_top_k()") 358 | 359 | print('input_file = ', input_file) 360 | print('output_file = ', output_file) 361 | print('top_k = ', top_k) 362 | 363 | with open(input_file, 'r') as input_handle: 364 | with open(output_file, 'w') as output_handle: 365 | # header 366 | header_line = input_handle.readline() 367 | print(header_line, file=output_handle, end="") 
368 | predicted_list = [] 369 | for line in input_handle: 370 | line_split = re.split(split_char, line) 371 | predicted = {} 372 | predicted["line"] = line 373 | predicted["score"] = float(line_split[col_score]) if line_split[col_score] else -999 374 | predicted_list.append(predicted) 375 | sorted_list = sorted(predicted_list, key=lambda k: k['score'], reverse=True) 376 | for entry in sorted_list[:top_k]: 377 | print(entry["line"], file=output_handle, end="") 378 | 379 | #~ top_k = 7673 380 | #~ split_char = '\t|\n' 381 | #~ col_score = deepnovo_config.pcol_score_max 382 | #~ input_file = "data.training/dia.pecan.plasma.2018_03_29/testing.unlabeled.csv.deepnovo_denovo" 383 | #~ output_file = input_file + ".topk" 384 | #~ select_top_k(input_file, output_file, top_k, split_char, col_score) 385 | #~ split_char = ',|\n' 386 | #~ col_score = 5 387 | #~ input_file = "data.training/dia.urine.2018_03_29/peaks.denovo.csv" 388 | 389 | 390 | # filter features of single-feature (DDA-like) scan or multi-feature scan (DIA) 391 | def filter_multifeature(input_file): 392 | """TODO(nh2tran): docstring.""" 393 | 394 | print("".join(["="] * 80)) # section-separating line 395 | print("filter_multifeature()") 396 | 397 | print('input_file = ', input_file) 398 | output_file_1 = input_file + '.1fea' 399 | output_file_2 = input_file + '.2fea' 400 | print('output_file_1 = ', output_file_1) 401 | print('output_file_2 = ', output_file_2) 402 | 403 | # read feature and record feature_dict, scan_dict 404 | with open(input_file, 'r') as input_handle: 405 | # header 406 | header_line = input_handle.readline() 407 | col_feature_id = deepnovo_config.col_feature_id 408 | col_scan_list = deepnovo_config.col_scan_list 409 | feature_dict = {} 410 | scan_dict = {} 411 | # read feature and record feature_dict, scan_dict 412 | for line in input_handle: 413 | line_split = re.split(',|\n', line) 414 | feature_id = line_split[col_feature_id] 415 | scan_list = re.split(';', line_split[col_scan_list]) 416 | 
feature_dict[feature_id] = {} 417 | feature_dict[feature_id]['line'] = line 418 | feature_dict[feature_id]['scan_list'] = scan_list 419 | for scan_id in scan_list: 420 | if scan_id in scan_dict: 421 | scan_dict[scan_id]['feature_list'].append(feature_id) 422 | else: 423 | scan_dict[scan_id] = {} 424 | scan_dict[scan_id]['feature_list'] = [feature_id] 425 | 426 | print('Total scan count = ', len(scan_dict)) 427 | print(' Scan with single-feature = ', 428 | sum([1 if (len(scan['feature_list'])==1) else 0 for _, scan in scan_dict.iteritems()])) 429 | print(' Scan with multi-feature = ', 430 | sum([1 if (len(scan['feature_list'])>=2) else 0 for _, scan in scan_dict.iteritems()])) 431 | 432 | # write feature to separate files, 433 | # depending on its scan is single-feature (DDA-like) or multi-feature (DIA) 434 | single_feature_count = 0 435 | multi_feature_count = 0 436 | with open(output_file_1, 'w') as output_handle_1: 437 | with open(output_file_2, 'w') as output_handle_2: 438 | # header 439 | print(header_line, file=output_handle_1, end="") 440 | print(header_line, file=output_handle_2, end="") 441 | for feature_id, feature in feature_dict.iteritems(): 442 | # assuming all scans are single-feature 443 | output_handle = output_handle_1 444 | single_feature_count += 1 445 | # at least 1 scan is multi-feature 446 | #~ for scan_id in feature['scan_list']: 447 | #~ if len(scan_dict[scan_id]['feature_list']) >= 2: 448 | #~ output_handle = output_handle_2 449 | #~ multi_feature_count += 1 450 | #~ single_feature_count -= 1 451 | #~ break 452 | # average feature count of scans 453 | feature_count = sum([len(scan_dict[scan_id]['feature_list']) for scan_id in feature['scan_list']]) 454 | feature_count /= float(len(feature['scan_list'])) 455 | if feature_count >= 2: 456 | output_handle = output_handle_2 457 | multi_feature_count += 1 458 | single_feature_count -= 1 459 | print(feature['line'], file=output_handle, end="") 460 | 461 | print('Total feature count = ', 
len(feature_dict)) 462 | print('Feature with single-feature scans = ', single_feature_count) 463 | print('Feature with at least 1 multi-feature scans = ', multi_feature_count) 464 | 465 | #~ input_file = "data.training/dia.urine.2018_03_29/testing_12.feature.csv" 466 | #~ filter_multifeature(input_file) 467 | 468 | -------------------------------------------------------------------------------- /aa_workflow.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import os 6 | 7 | from deepnovo_preprocess import * 8 | from deepnovo_postprocess import * 9 | import aa_workflow_step_4_2 10 | import aa_workflow_step_5 11 | 12 | 13 | data_fasta_dir = "data.fasta/" 14 | patient_id = "Mel16" 15 | data_training_dir = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/" 16 | num_fractions = 11 17 | model_dir = "train.mel_16.class_1" # before training, create this empty folder at the same level as Python scripts. 18 | 19 | 20 | # ================================================================================ 21 | # Workflow of neoantigen discovery by personalized de novo sequencing. 
22 | # ================================================================================ 23 | 24 | # Step-by-step instructions based on the following example dataset: 25 | 26 | # Patient Mel-16 (Bassani-Sternberg et al., Nature Communications, 2016) 27 | # HLA class 1: 12 raw files, 1 failed to run PEAKS 28 | 29 | # 20141212_QEp7_MiBa_SA_HLA-I-p_MM16_1_A 30 | # 20141212_QEp7_MiBa_SA_HLA-I-p_MM16_1_B 31 | # 20141212_QEp7_MiBa_SA_HLA-I-p_MM16_2_A 32 | # 20141212_QEp7_MiBa_SA_HLA-I-p_MM16_2_B 33 | # 20141212_QEp7_MiBa_SA_HLA-I-p_MM16_3_A 34 | # 20141212_QEp7_MiBa_SA_HLA-I-p_MM16_3_B 35 | # 20141213_QEp7_MiBa_SA_HLA-I-p_MM16_1_A_1 36 | # 20141213_QEp7_MiBa_SA_HLA-I-p_MM16_1_B_1, failed 37 | # 20141213_QEp7_MiBa_SA_HLA-I-p_MM16_2_A_1 38 | # 20141213_QEp7_MiBa_SA_HLA-I-p_MM16_2_B_1 39 | # 20141213_QEp7_MiBa_SA_HLA-I-p_MM16_3_A_1 40 | # 20141213_QEp7_MiBa_SA_HLA-I-p_MM16_3_B_1 41 | 42 | 43 | 44 | 45 | # ================================================================================ 46 | # Step 1: Build the immunopeptidome of the patient. 47 | # ================================================================================ 48 | 49 | # This step 1 took about ??
hours on a laptop with 4 CPU cores i7, 16 GB memory 50 | 51 | # ================================================================================ 52 | # Step 1.1: Run PEAKS X DB search on the raw files with the following parameters: 53 | # ================================================================================ 54 | 55 | # Enzyme: None 56 | # Instrument: Orbi-Orbi 57 | # Fragment: HCD 58 | # Acquisition: DDA 59 | 60 | # Parent Mass Error Tolerance: 15.0 ppm 61 | # Fragment Mass Error Tolerance: 0.05 Da 62 | # Precursor Mass Search Type: monoisotopic 63 | # Enzyme: None 64 | # Digest Mode: Unspecific 65 | # Max Missed Cleavages: 100 66 | # Variable Modifications: 67 | # Oxidation (M): 15.99 68 | # Deamidation (NQ): 0.98 69 | # Max Variable PTM Per Peptide: 3 70 | # Database: uniprot_sprot.human 71 | # Taxon: All 72 | # Contaminant Database: contaminants_maxquant 73 | # Searched Entry: 20488 74 | # FDR Estimation: Enabled 75 | # Merge Options: no merge 76 | # Precursor Options: corrected 77 | # Charge Options: no correction 78 | # Filter Options: no filter 79 | # Process: true 80 | # Associate chimera: no 81 | 82 | 83 | 84 | 85 | # ================================================================================ 86 | # Step 1.2: Set FDR 1.0%. 87 | # ================================================================================ 88 | 89 | # The number of MS/MS spectra is "694565", the number of peptide-spectrum matches (PSMs) is "207332", the number of peptide sequences is "26594". 90 | 91 | 92 | 93 | 94 | # ================================================================================ 95 | # Step 1.3: Right-click on the DB search node "??", select "Deep Denovo Export". 
96 | # ================================================================================ 97 | 98 | # We will get the following 11 pairs of csv and mgf files in the PEAKS project folder: 99 | 100 | # export_0.csv, export_0.mgf 101 | # export_1.csv, export_1.mgf 102 | # export_2.csv, export_2.mgf 103 | # export_3.csv, export_3.mgf 104 | # export_4.csv, export_4.mgf 105 | # export_5.csv, export_5.mgf 106 | # export_6.csv, export_6.mgf 107 | # export_7.csv, export_7.mgf 108 | # export_8.csv, export_8.mgf 109 | # export_9.csv, export_9.mgf 110 | # export_10.csv, export_10.mgf 111 | 112 | 113 | 114 | 115 | # ================================================================================ 116 | # Step 2: Train personalized DeepNovo model. 117 | # ================================================================================ 118 | 119 | # This step 2 took about 12 hours on a server with GPU Titan X, 32 GB memory 120 | 121 | # Note that you will need to specify the paths to your own data and model folders when you run the Python scripts. The following scripts just show examples of my data and model folders. 122 | 123 | # ================================================================================ 124 | # Step 2.1: Prepare the training data. 
125 | # ================================================================================ 126 | 127 | # Run merge_mgf_file() and merge_feature_file() 128 | # ======================= UNCOMMENT and RUN ====================================== 129 | # ~ folder_path = data_training_dir 130 | # ~ fraction_list = range(0, num_fractions) 131 | # ~ merge_mgf_file( 132 | # ~ input_file_list=[folder_path + "export_" + str(i) + ".mgf" for i in fraction_list], 133 | # ~ fraction_list=fraction_list, 134 | # ~ output_file=folder_path + "spectrum.mgf") 135 | # ~ merge_feature_file( 136 | # ~ input_file_list=[folder_path + "export_" + str(i) + ".csv" for i in fraction_list], 137 | # ~ fraction_list=fraction_list, 138 | # ~ output_file=folder_path + "feature.csv") 139 | # ================================================================================ 140 | # We will get two output files in the same folder: "spectrum.mgf" and "feature.csv". 141 | # Both functions also report the number of entries that have been processed: "counter = 694565". 142 | # That number should be the same as the total number of MS/MS spectra from the raw files. 143 | 144 | # Run split_feature_unlabel() 145 | # ======================= UNCOMMENT and RUN ====================================== 146 | # ~ input_feature_file = data_training_dir + "feature.csv" 147 | # ~ split_feature_unlabel(input_feature_file) 148 | # ================================================================================ 149 | # It will split the "feature.csv" into 2 files: "feature.csv.labeled" and "feature.csv.unlabeled". 150 | # It also reports the number of labeled and unlabeled features: "num_labeled = 207332" and "num_unlabeled = 487233". 151 | # Note that "207332" is also the number of PSMs reported at FDR 1.0% in Step 1.
152 | 153 | # Run calculate_mass_shift_ppm() and correct_mass_shift_ppm() 154 | # ======================= UNCOMMENT and RUN ====================================== 155 | # ~ labeled_feature_file = data_training_dir + "feature.csv.labeled" 156 | # ~ ppm = calculate_mass_shift_ppm(labeled_feature_file) 157 | # ~ input_feature_file = data_training_dir + "feature.csv.labeled" 158 | # ~ correct_mass_shift_ppm(input_feature_file, ppm) 159 | # ~ input_feature_file = data_training_dir + "feature.csv" 160 | # ~ correct_mass_shift_ppm(input_feature_file, ppm) 161 | # ================================================================================ 162 | # The mass shift is calculated from "feature.csv.labeled". 163 | # The mass shift ppm (part per million) is reported as: "mean_precursor_ppm = 7.07514819678". 164 | # Then mass is corrected for 2 files: "feature.csv.labeled.mass_corrected" and "feature.csv.mass_corrected". 165 | 166 | # Run split_feature_training_noshare() 167 | # ======================= UNCOMMENT and RUN ====================================== 168 | # ~ input_feature_file = data_training_dir + "feature.csv.labeled.mass_corrected" 169 | # ~ proportion = [0.90, 0.05, 0.05] 170 | # ~ split_feature_training_noshare(input_feature_file, proportion) 171 | # ================================================================================ 172 | # It will split "feature.csv.labeled.mass_corrected" into train/valid/test sets with "proportion = [0.9, 0.05, 0.05]". 173 | # Those 3 sets do not share common peptides. 174 | # Their sizes are also reported. 175 | # "num_total = 207332" 176 | # "num_unique = 26656" 177 | # "num_train = 185823" 178 | # "num_valid = 10900" 179 | # "num_test = 10609" 180 | 181 | 182 | 183 | 184 | # ================================================================================ 185 | # Step 2.2: Training DeepNovo model. 
186 | # ================================================================================ 187 | 188 | # Run DeepNovo training 189 | # The training will stop after 10 epochs. The model with the best performance on the valid set, "ckpt-16200", is saved in the model folder "train.mel_16.class_1". 190 | # ======================= UNCOMMENT and RUN ====================================== 191 | # ~ command = ["LD_PRELOAD=\"/usr/lib/libtcmalloc.so\" /usr/bin/time -v python deepnovo_main.py --train"] 192 | # ~ command += ["--train_dir", model_dir] 193 | # ~ command += ["--train_spectrum", data_training_dir + "spectrum.mgf"] 194 | # ~ command += ["--train_feature", data_training_dir + "feature.csv.labeled.mass_corrected.train.noshare"] 195 | # ~ command += ["--valid_spectrum", data_training_dir + "spectrum.mgf"] 196 | # ~ command += ["--valid_feature", data_training_dir + "feature.csv.labeled.mass_corrected.valid.noshare"] 197 | # ~ command += ["--reset_step"] 198 | # ~ command = " ".join(command) 199 | # ~ print(command) 200 | # ~ os.system(command) 201 | # ================================================================================ 202 | 203 | # Run DeepNovo testing 204 | # ======================= UNCOMMENT and RUN ====================================== 205 | # ~ command = ["LD_PRELOAD=\"/usr/lib/libtcmalloc.so\" /usr/bin/time -v python deepnovo_main.py --test_true_feeding"] 206 | # ~ command += ["--train_dir", model_dir] 207 | # ~ command += ["--test_spectrum", data_training_dir + "spectrum.mgf"] 208 | # ~ command += ["--test_feature", data_training_dir + "feature.csv.labeled.mass_corrected.test.noshare"] 209 | # ~ command = " ".join(command) 210 | # ~ print(command) 211 | # ~ os.system(command) 212 | # ~ command = ["LD_PRELOAD=\"/usr/lib/libtcmalloc.so\" /usr/bin/time -v python deepnovo_main.py --search_denovo"] 213 | # ~ command += ["--train_dir", model_dir] 214 | # ~ command += ["--denovo_spectrum", data_training_dir + "spectrum.mgf"] 215 | # ~ command +=
["--denovo_feature", data_training_dir + "feature.csv.labeled.mass_corrected.test.noshare"] 216 | # ~ command = " ".join(command) 217 | # ~ print(command) 218 | # ~ os.system(command) 219 | # ~ command = ["LD_PRELOAD=\"/usr/lib/libtcmalloc.so\" /usr/bin/time -v python deepnovo_main.py --test"] 220 | # ~ command += ["--target_file", data_training_dir + "feature.csv.labeled.mass_corrected.test.noshare"] 221 | # ~ command += ["--predicted_file", data_training_dir + "feature.csv.labeled.mass_corrected.test.noshare.deepnovo_denovo"] 222 | # ~ command = " ".join(command) 223 | # ~ print(command) 224 | # ~ os.system(command) 225 | # ================================================================================ 226 | # The testing accuracy at the amino acid (AA) and peptide levels will be reported as following: 227 | # "precision_AA_mass_db = 0.8425" 228 | # "precision_peptide_mass_db = 0.6430" 229 | 230 | 231 | 232 | 233 | # ================================================================================ 234 | # Step 3: Perform personalized de novo sequencing with DeepNovo. 
235 | # ================================================================================ 236 | 237 | # This step 3 took about 5 hours on a server with GPU Titan X, 32 GB memory 238 | 239 | # Run DeepNovo de novo sequencing on all features (labeled and unlabeled) 240 | # ======================= UNCOMMENT and RUN ====================================== 241 | # ~ command = ["LD_PRELOAD=\"/usr/lib/libtcmalloc.so\" /usr/bin/time -v python deepnovo_main.py --search_denovo"] 242 | # ~ command += ["--train_dir", model_dir] 243 | # ~ command += ["--denovo_spectrum", data_training_dir + "spectrum.mgf"] 244 | # ~ command += ["--denovo_feature", data_training_dir + "feature.csv.mass_corrected"] 245 | # ~ command = " ".join(command) 246 | # ~ print(command) 247 | # ~ os.system(command) 248 | # ================================================================================ 249 | # The de novo results will be written to the file "feature.csv.mass_corrected.deepnovo_denovo". 250 | # The tool will also report the number of features that have been processed: 251 | # "Total spectra: 694565" 252 | # "read: 690354" 253 | # "skipped: 4211" 254 | # "by mass: 4211" 255 | 256 | 257 | 258 | 259 | # ================================================================================ 260 | # Step 4: Quality control. 261 | # ================================================================================ 262 | 263 | # ================================================================================ 264 | # Step 4.1: Post-process de novo results to improve their accuracy. 265 | # ================================================================================ 266 | 267 | # Run select_top_score() 268 | # This script selects a threshold of de novo confidence scores and uses it to filter de novo results. 269 | # The score threshold is calculated based on a 95% cutoff of the testing accuracy obtained at the end of Step 2 above.
270 | # ======================= UNCOMMENT and RUN ====================================== 271 | # ~ accuracy_cutoff = 0.95 272 | # ~ accuracy_file = data_training_dir + "feature.csv.labeled.mass_corrected.test.noshare.deepnovo_denovo.accuracy" 273 | # ~ score_cutoff = find_score_cutoff(accuracy_file, accuracy_cutoff) 274 | # ~ input_file = data_training_dir + "feature.csv.mass_corrected.deepnovo_denovo" 275 | # ~ output_file = input_file + ".top95" 276 | # ~ select_top_score(input_file, output_file, score_cutoff) 277 | # ================================================================================ 278 | # After this step we'll get the file "feature.csv.mass_corrected.deepnovo_denovo.top95". 279 | # The score cutoff and the number of selected features will also be reported: 280 | # "score_cutoff = -0.5" 281 | # "total_feature = 690354" 282 | # "select_feature = 233589" 283 | 284 | # Run convert_I_to_L() 285 | # This script converts I (Isoleucine) to L (Leucine) in all de novo peptides, because de novo sequencing is not able to distinguish them. 286 | # ======================= UNCOMMENT and RUN ====================================== 287 | # ~ input_file = data_training_dir + "feature.csv.mass_corrected.deepnovo_denovo.top95" 288 | # ~ output_file = input_file + ".I_to_L" 289 | # ~ convert_I_to_L(input_file, output_file) 290 | # ================================================================================ 291 | 292 | # Run correct_by_consensus() 293 | # This script corrects de novo sequencing errors by grouping predicted sequences of the same mass together and voting the consensus sequence. 
294 | # ======================= UNCOMMENT and RUN ====================================== 295 | # ~ input_file = data_training_dir + "feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L" 296 | # ~ output_file = input_file + ".consensus" 297 | # ~ correct_by_consensus(input_file, output_file) 298 | # ================================================================================ 299 | 300 | # Run filter_by_minlen() 301 | # This script filters out sequences of length less than 5 amino acids. 302 | # ======================= UNCOMMENT and RUN ====================================== 303 | # ~ minlen = 5 304 | # ~ input_file = data_training_dir + "feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus" 305 | # ~ output_file = input_file + ".minlen" + str(minlen) 306 | # ~ filter_by_minlen(input_file, output_file, minlen) 307 | # ================================================================================ 308 | # The numbers of features will be reported as: 309 | # "total_feature = 233589" 310 | # "minlen_feature = 223507" 311 | # "removed_feature = 10082" 312 | 313 | # Up to this step, we get the following file: 314 | # "feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus.minlen5" 315 | # We test its accuracy against the test set: 316 | # Run DeepNovo testing 317 | # ======================= UNCOMMENT and RUN ====================================== 318 | # ~ command = ["LD_PRELOAD=\"/usr/lib/libtcmalloc.so\" /usr/bin/time -v python deepnovo_main.py --test"] 319 | # ~ command += ["--target_file", data_training_dir + "feature.csv.labeled.mass_corrected.test.noshare"] 320 | # ~ command += ["--predicted_file", data_training_dir + "feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus.minlen5"] 321 | # ~ command = " ".join(command) 322 | # ~ print(command) 323 | # ~ os.system(command) 324 | # ================================================================================ 325 | # We get these results: 326 | # "precision_AA_mass_db = 
0.9530" 327 | # "precision_peptide_mass_db = 0.8441" 328 | 329 | # Repeat the same testing but now against all labeled features: 330 | # Run DeepNovo testing 331 | # ====================== UNCOMMENT and RUN ======================================= 332 | # ~ command = ["LD_PRELOAD=\"/usr/lib/libtcmalloc.so\" /usr/bin/time -v python deepnovo_main.py --test"] 333 | # ~ command += ["--target_file", data_training_dir + "feature.csv.labeled.mass_corrected"] 334 | # ~ command += ["--predicted_file", data_training_dir + "feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus.minlen5"] 335 | # ~ command = " ".join(command) 336 | # ~ print(command) 337 | # ~ os.system(command) 338 | # ================================================================================ 339 | # We get these results: 340 | # "precision_AA_mass_db = 0.9797" 341 | # "precision_peptide_mass_db = 0.9371" 342 | # Note that these accuracy results look better than those against the test set because the test set was not used for training the model. 343 | # The number of de novo only features is also reported as 344 | # "predicted_only: 68721" 345 | # and they are written to the file 346 | # "feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus.minlen5.denovo_only" 347 | 348 | 349 | 350 | 351 | # ================================================================================ 352 | # Step 4.2: Run second round of PEAKS X DB search against the list of database and de novo peptides. 353 | # ================================================================================ 354 | 355 | # Before running PEAKS, we need to combine database and de novo peptides into a list. 356 | # This script will select unique de novo peptides, filter out those that belong to the human Swiss-Prot protein database, and combine the remaining de novo peptides and the database peptides identified from Step 1 into a fasta file. 
357 | # ======================= UNCOMMENT and RUN ====================================== 358 | # ~ aa_workflow_step_4_2.preprocess( 359 | # ~ denovo_file=data_training_dir + "feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus.minlen5.denovo_only", 360 | # ~ db_fasta_file=data_fasta_dir + "uniprot_sprot.human.plus_contaminants.fasta", 361 | # ~ labeled_feature_file=data_training_dir + "feature.csv.labeled.mass_corrected", 362 | # ~ peptide_list_fasta=data_training_dir + "aa_workflow.step_4.peptide_list.fasta") 363 | # ================================================================================ 364 | # The numbers of de novo and database peptides are reported as following: 365 | # "Number of top-scoring denovo peptides: 17318" 366 | # "num_db_peptides = 25274" 367 | # "num_denovo_peptides = 6444" (not in database) 368 | 369 | # Run PEAKS X DB search with as following: 370 | # Select the DENOVO node result from Step 1.1, and select PEAKS DB search; 371 | # Select option "No digestion" for "Digest mode"; 372 | # Select the fasta file "aa_workflow.step_4.peptide_list.fasta" as the only database, no contaminant; 373 | # Leave other settings the same as in Step 1.1. 374 | # Set FDR 1.0% and export the "DB search psm.csv" file, rename it to "aa_workflow.step_4.psm.csv". 375 | 376 | # Extract de novo peptides from the PSMs of PEAKS X DB search round 2. 
377 | # ======================= UNCOMMENT and RUN ====================================== 378 | # ~ aa_workflow_step_4_2.postprocess( 379 | # ~ psm_file = data_training_dir + "aa_workflow.step_4.psm.csv", 380 | # ~ output_denovo_peptide_file = data_training_dir + "aa_workflow.step_4.output_peptide_list") 381 | # ================================================================================ 382 | # The number of de novo peptides is reported as following: 383 | # "num_denovo_peptides = 1259" 384 | 385 | 386 | 387 | 388 | # ================================================================================ 389 | # Step 5: Neoantigen selection. 390 | # ================================================================================ 391 | # ~ aa_workflow_step_5.step_5( 392 | # ~ psm_file=data_training_dir + "aa_workflow.step_4.psm.csv", 393 | # ~ netmhc_file=data_training_dir + "aa_workflow.step_5.netmhcpan.csv", 394 | # ~ immunogenicity_file=data_training_dir + "aa_workflow.step_5.immunogenicity.csv", 395 | # ~ db_fasta_file=data_fasta_dir + "uniprot_sprot.human.plus_contaminants.fasta", 396 | # ~ labeled_feature_file=data_training_dir + "feature.csv.labeled", 397 | # ~ snp_file=data_training_dir + "aa_workflow.step_5.supp_data5_snp.csv", 398 | # ~ snp_enst_fasta=data_training_dir + "aa_workflow.step_5.supp_data5_snp_enst.fasta", 399 | # ~ snp_sample_id=patient_id, 400 | # ~ output_neoantigen_criteria=data_training_dir + "aa_workflow.step_5.output_neoantigen_criteria.csv", 401 | # ~ output_protein_mutation=data_training_dir + "aa_workflow.step_5.protein_mutation.csv") 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | -------------------------------------------------------------------------------- /deepnovo_worker_db.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Hieu Tran. All Rights Reserved. 2 | # 3 | # DeepNovo is publicly available for non-commercial uses. 
4 | # ============================================================================== 5 | 6 | """TODO(nh2tran): docstring.""" 7 | 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | import sys 13 | import time 14 | import re 15 | from random import shuffle 16 | from itertools import combinations 17 | 18 | from Bio import SeqIO 19 | from pyteomics import parser 20 | import numpy as np 21 | import tensorflow as tf 22 | 23 | import deepnovo_config 24 | from deepnovo_cython_modules import get_candidate_intensity 25 | 26 | 27 | class WorkerDB(object): 28 | """TODO(nh2tran): docstring. 29 | This class contains the database search module. 30 | We use "db" for "database". 31 | We use "pepmod" to refer to a modified version of a "peptide" 32 | """ 33 | 34 | 35 | def __init__(self, db_fasta_file): 36 | """TODO(nh2tran): docstring.""" 37 | 38 | print("".join(["="] * 80)) # section-separating line 39 | print("WorkerDB: __init__()") 40 | 41 | # search_db and search_hybrid could use different fasta files for their 42 | # worker_db objects. So it's better to have fasta files as input. 
43 | self.db_fasta_file = db_fasta_file 44 | 45 | # we currently use deepnovo_config to store both const & settings 46 | # the settings should be shown in __init__() to keep track carefully 47 | # input info to build a db 48 | self.cleavage_rule = deepnovo_config.cleavage_rule 49 | self.num_missed_cleavage = deepnovo_config.num_missed_cleavage 50 | self.fixed_mod_list = deepnovo_config.fixed_mod_list 51 | self.var_mod_list = deepnovo_config.var_mod_list 52 | self.num_mod = deepnovo_config.num_mod 53 | self.precursor_mass_tolerance = deepnovo_config.precursor_mass_tolerance 54 | self.precursor_mass_ppm = deepnovo_config.precursor_mass_ppm 55 | self.decoy = deepnovo_config.FLAGS.decoy 56 | print("db_fasta_file = {0:s}".format(self.db_fasta_file)) 57 | print("cleavage_rule = {0:s}".format(self.cleavage_rule)) 58 | print("num_missed_cleavage = {0:d}".format(self.num_missed_cleavage)) 59 | print("fixed_mod_list = {0}".format(self.fixed_mod_list)) 60 | print("var_mod_list = {0}".format(self.var_mod_list)) 61 | print("num_mod = {0}".format(self.num_mod)) 62 | print("precursor_mass_tolerance = {0:.4f}".format(self.precursor_mass_tolerance)) 63 | print("precursor_mass_ppm = {0:.6f}".format(self.precursor_mass_ppm)) 64 | 65 | # data structure to store a db 66 | # all attributes will be built/loaded by build_db() 67 | self.peptide_count = None 68 | self.peptide_list = None 69 | self.peptide_mass_array = None 70 | self.pepmod_maxmass_array = None 71 | 72 | self.test_time = 0.0 73 | 74 | 75 | def build_db(self): 76 | """TODO(nh2tran): docstring.""" 77 | 78 | print("".join(["="] * 80)) # section-separating line 79 | print("WorkerDB: build_db()") 80 | 81 | # parse the input fasta file into a list of sequences 82 | # more about SeqIO and SeqRecord: http://biopython.org/wiki/SeqRecord 83 | with open(self.db_fasta_file, "r") as handle: 84 | record_iterator = SeqIO.parse(handle, "fasta") 85 | record_list = list(record_iterator) 86 | print("Number of protein sequences: 
{0:d}".format(len(record_list))) 87 | 88 | # cleave protein sequences into a list of unique peptides 89 | # more about pyteomics.parser.cleave and cleavage rules: 90 | # https://pythonhosted.org/pyteomics/api/parser.html 91 | 92 | # create a peptide to protein accession id map. 93 | peptide_2_protein_id = {} 94 | for record in record_list: 95 | protein_sequence = str(record.seq) 96 | protein_id = str(record.id) 97 | cleaved_peptide_set = parser.cleave( 98 | sequence=protein_sequence, 99 | rule=parser.expasy_rules[self.cleavage_rule], 100 | missed_cleavages=self.num_missed_cleavage) 101 | for peptide in cleaved_peptide_set: 102 | if any(x in peptide for x in ['X', 'B', 'U', 'Z']): 103 | # skip peptides with undetermined amino acid ['X', 'B', 'U', 'Z'] 104 | continue 105 | if peptide not in peptide_2_protein_id: 106 | peptide_2_protein_id[peptide] = {protein_id} 107 | else: 108 | peptide_2_protein_id[peptide].add(protein_id) 109 | 110 | peptide_list = [list(peptide) for peptide in peptide_2_protein_id.keys()] 111 | peptide_list = [[x + 'mod' if x in self.fixed_mod_list else x for x in peptide] for peptide in peptide_list ] 112 | 113 | peptide_count = len(peptide_list) 114 | print("Number of peptides: {0:d}".format(peptide_count)) 115 | 116 | # for each peptide, find the mass and the max modification mass 117 | peptide_mass_array = np.zeros(peptide_count) 118 | pepmod_maxmass_array = np.zeros(peptide_count) 119 | for index, peptide in enumerate(peptide_list): 120 | peptide_mass_array[index] = self._compute_peptide_mass(peptide) 121 | pepmod = [x + 'mod' if x in self.var_mod_list else x for x in peptide] 122 | pepmod_maxmass_array[index] = self._compute_peptide_mass(pepmod) 123 | 124 | self.peptide_count = peptide_count 125 | self.peptide_list = peptide_list 126 | self.peptide_mass_array = peptide_mass_array 127 | self.pepmod_maxmass_array = pepmod_maxmass_array 128 | self.peptide_2_protein_id = peptide_2_protein_id 129 | 130 | 131 | def search_db(self, model, 
worker_io, predicted_denovo_list=None): 132 | """TODO(nh2tran): docstring.""" 133 | 134 | print("".join(["="] * 80)) # section-separating line 135 | print("WorkerDB: search_db()") 136 | 137 | # move load/build db here? 138 | 139 | # if provided, convert predicted_denovo_list to dictionary for easy lookup 140 | denovo_peptide_dict = None 141 | if predicted_denovo_list is not None: 142 | print("WorkerDB: search_db() - read denovo peptides") 143 | denovo_peptide_dict = {} 144 | for predicted in predicted_denovo_list: 145 | feature_id = predicted["feature_id"] 146 | sequence = predicted["sequence"] 147 | denovo_peptide_dict[feature_id] = sequence 148 | 149 | print("WorkerDB: search_db() - open tensorflow session") 150 | session = tf.Session() 151 | model.restore_model(session) 152 | 153 | worker_io.open_input() 154 | worker_io.get_location() 155 | worker_io.split_feature_index() 156 | worker_io.open_output() 157 | 158 | print("".join(["="] * 80)) # section-separating line 159 | print("WorkerDB: search_db() - search loop") 160 | 161 | for index, feature_index_batch in enumerate(worker_io.feature_index_batch_list): 162 | print("Read {0:d}/{1:d} batches".format(index + 1, 163 | worker_io.feature_index_batch_count)) 164 | spectrum_batch = worker_io.get_spectrum(feature_index_batch) 165 | predicted_batch = self._search_db_batch(spectrum_batch, 166 | model, 167 | session, 168 | denovo_peptide_dict) 169 | worker_io.write_prediction(predicted_batch) 170 | 171 | print("Total spectra: {0:d}".format(worker_io.feature_count["total"])) 172 | print(" read: {0:d}".format(worker_io.feature_count["read"])) 173 | print(" skipped: {0:d}".format(worker_io.feature_count["skipped"])) 174 | print(" by mass: {0:d}".format(worker_io.feature_count["skipped_mass"])) 175 | 176 | worker_io.close_input() 177 | worker_io.close_output() 178 | session.close() 179 | 180 | 181 | def _compute_peptide_mass(self, peptide): 182 | """TODO(nh2tran): docstring. 
183 | """ 184 | 185 | #~ print("".join(["="] * 80)) # section-separating line === 186 | #~ print("WorkerDB: _compute_peptide_mass()") 187 | 188 | peptide_mass = (deepnovo_config.mass_N_terminus 189 | + sum(deepnovo_config.mass_AA[aa] for aa in peptide) 190 | + deepnovo_config.mass_C_terminus) 191 | 192 | return peptide_mass 193 | 194 | 195 | def _expand_peptide_modification(self, peptide): 196 | """TODO(nh2tran): docstring. 197 | May also use parser.isoforms 198 | """ 199 | 200 | #~ print("".join(["="] * 80)) # section-separating line 201 | #~ print("WorkerDB: _expand_peptide_modification()") 202 | 203 | # all possible positions for modification 204 | position_list = [position for position, aa in enumerate(peptide) 205 | if aa in self.var_mod_list] 206 | position_count = len(position_list) 207 | # max number of modifications allowed 208 | num_mod = min(position_count, self.num_mod) 209 | # find all combinations upto num_mod 210 | position_combination_list = [] 211 | for x in xrange(1, num_mod+1): 212 | position_combination_list += combinations(position_list, x) 213 | # find all pepmod 214 | pepmod_list = [peptide] 215 | for position_combination in position_combination_list: 216 | pepmod = peptide[:] 217 | for position in position_combination: 218 | pepmod[position] += 'mod' 219 | pepmod_list.append(pepmod) 220 | 221 | return pepmod_list 222 | 223 | 224 | def _filter_by_mass(self, precursor_mass): 225 | """TODO(nh2tran): docstring. 
226 | """ 227 | 228 | #~ print("".join(["="] * 80)) # section-separating line 229 | #~ print("WorkerDB: _filter_by_mass()") 230 | 231 | # use precursor_mass_ppm instead of absolute precursor_mass_tolerance 232 | #~ precursor_mass_tolerance = self.precursor_mass_tolerance 233 | precursor_mass_tolerance = self.precursor_mass_ppm * precursor_mass 234 | 235 | # 1st filter by the peptide mass and the max pepmod mass 236 | filter1_index = np.flatnonzero(np.logical_and( 237 | np.less_equal(self.peptide_mass_array, 238 | precursor_mass + precursor_mass_tolerance), 239 | np.greater_equal(self.pepmod_maxmass_array, 240 | precursor_mass - precursor_mass_tolerance))) 241 | 242 | # find all possible modifications 243 | pepmod_list = [] 244 | for index in filter1_index: 245 | peptide = self.peptide_list[index] 246 | pepmod_list += self._expand_peptide_modification(peptide) 247 | pepmod_mass_array = np.array([self._compute_peptide_mass(pepmod) 248 | for pepmod in pepmod_list]) 249 | 250 | # 2nd filter by exact pepmod mass 251 | filter2_index = np.flatnonzero(np.logical_and( 252 | np.less_equal(pepmod_mass_array, 253 | precursor_mass + precursor_mass_tolerance), 254 | np.greater_equal(pepmod_mass_array, 255 | precursor_mass - precursor_mass_tolerance))) 256 | 257 | candidate_list = [pepmod_list[x] for x in filter2_index] 258 | 259 | return candidate_list 260 | 261 | 262 | def _score_spectrum(self, 263 | precursor_mass, 264 | spectrum_original, 265 | state0_c, 266 | state0_h, 267 | candidate_list, 268 | model, 269 | model_output_logprob, 270 | model_lstm_state, 271 | session, 272 | direction): 273 | """TODO(nh2tran): docstring.""" 274 | 275 | #~ print("".join(["="] * 80)) # section-separating line 276 | #~ print("WorkerDB: _score()") 277 | 278 | # convert symbols into id 279 | candidate_list = [[deepnovo_config.vocab[x] for x in candidate] 280 | for candidate in candidate_list] 281 | 282 | # we shall group candidates into minibatches 283 | # === candidate_len === 284 | # s 285 | # 
i 286 | # z 287 | # e 288 | # ===================== 289 | minibatch_size = len(candidate_list) # number of candidates 290 | candidate_len = len(candidate_list[0]) # length of each candidate 291 | 292 | # candidates share the same state0, so repeat into [minibatch_size, 512] 293 | # the states will also be updated after every iteration 294 | state0_c = state0_c.reshape((1, -1)) # reshape to [1, 512] 295 | state0_h = state0_h.reshape((1, -1)) 296 | minibatch_state_c = np.repeat(state0_c, minibatch_size, axis=0) 297 | minibatch_state_h = np.repeat(state0_h, minibatch_size, axis=0) 298 | 299 | # mass of each candidate, will be accumulated everytime an AA is appended 300 | minibatch_prefix_mass = np.zeros(minibatch_size) 301 | 302 | # output is a list of candidate_len arrays of shape [minibatch_size, 26] 303 | # each row is log of probability distribution over 26 classes/symbols 304 | output_logprob_list = [] 305 | 306 | # recurrent iterations 307 | for position in range(candidate_len): 308 | 309 | # gather minibatch data 310 | minibatch_AA_id = np.zeros(minibatch_size) 311 | for index, candidate in enumerate(candidate_list): 312 | AA = candidate[position] 313 | minibatch_AA_id[index] = AA 314 | minibatch_prefix_mass[index] += deepnovo_config.mass_ID[AA] 315 | 316 | # this is the most time-consuming ~70-75% 317 | minibatch_intensity = [get_candidate_intensity(spectrum_original, 318 | precursor_mass, 319 | prefix_mass, 320 | direction) 321 | for prefix_mass in np.nditer(minibatch_prefix_mass)] 322 | 323 | # final shape [minibatch_size, 26, 8, 10] 324 | minibatch_intensity = np.array(minibatch_intensity) 325 | 326 | # model feed 327 | input_feed = {} 328 | input_feed[model.input_dict["AAid"][1].name] = minibatch_AA_id 329 | input_feed[model.input_dict["intensity"].name] = minibatch_intensity 330 | input_feed[model.input_dict["lstm_state"][0].name] = minibatch_state_c 331 | input_feed[model.input_dict["lstm_state"][1].name] = minibatch_state_h 332 | # and run 333 | 
output_feed = [model_output_logprob, model_lstm_state] 334 | output_logprob, (minibatch_state_c, minibatch_state_h) = session.run( 335 | fetches=output_feed, 336 | feed_dict=input_feed) 337 | 338 | output_logprob_list.append(output_logprob) 339 | 340 | return output_logprob_list 341 | 342 | 343 | def _search_db_batch(self, 344 | spectrum_batch, 345 | model, 346 | session, 347 | denovo_peptide_dict): 348 | """TODO(nh2tran): docstring. 349 | Inputs: 350 | spectrum_batch: a list of spectrum, each is a dictionary 351 | spectrum["feature_id"] 352 | spectrum["precursor_mass"] 353 | spectrum["spectrum_holder"] 354 | spectrum["spectrum_original_forward"] 355 | spectrum["spectrum_original_backward"] 356 | Outputs: 357 | predicted_batch: a list of predicted, each is a dictionary 358 | predicted["feature_id"] 359 | predicted["sequence"] 360 | predicted["score"] 361 | predicted["position_score"] 362 | """ 363 | 364 | #~ print("".join(["="] * 80)) # section-separating line 365 | #~ print("WorkerDB: _search_db_batch()") 366 | 367 | # initialize the lstm using the spectrum 368 | # for faster speed, we initialize the whole spectrum_batch instead of 1-by-1 369 | input_feed = {} 370 | spectrum_holder = np.array([spectrum["spectrum_holder"] 371 | for spectrum in spectrum_batch]) 372 | input_feed[model.input_dict["spectrum"].name] = spectrum_holder 373 | output_feed = [model.output_forward["lstm_state0"], 374 | model.output_backward["lstm_state0"]] 375 | ((state0_c_forward, state0_h_forward), 376 | (state0_c_backward, state0_h_backward)) = session.run(fetches=output_feed, 377 | feed_dict=input_feed) 378 | 379 | predicted_batch = [] 380 | # we search spectrum by spectrum 381 | # a faster way is to process them in parallel, but hard to debug 382 | #~ test_id = "F12:7420" 383 | for spectrum_index, spectrum in enumerate(spectrum_batch): 384 | #~ if spectrum["feature_id"] != test_id: 385 | #~ continue 386 | 387 | predicted = {"feature_id": spectrum["feature_id"], 388 | "sequence": [], 389 
| "score": -float("inf"), 390 | "position_score": [], 391 | "precursor_mz": spectrum["precursor_mz"], 392 | "precursor_charge": spectrum["precursor_charge"], 393 | "protein_access_id": "", 394 | "scan_list_middle": spectrum["scan_list_middle"]} 395 | 396 | # filter by precursor mass 397 | # example: [['M', 'D', 'K', 'F', 'Nmod', 'K', 'K']] 398 | precursor_mass = spectrum["precursor_mass"] 399 | candidate_list = self._filter_by_mass(precursor_mass) 400 | 401 | # add denovo peptide if provided 402 | feature_id = spectrum["feature_id"] 403 | if denovo_peptide_dict is not None and feature_id in denovo_peptide_dict: 404 | sequence = denovo_peptide_dict[feature_id] 405 | # TODO(nh2tran): change the precursor_mass_tolerance of denovo 406 | sequence_mass = self._compute_peptide_mass(sequence) 407 | precursor_mass_tolerance = precursor_mass * self.precursor_mass_ppm 408 | if abs(precursor_mass - sequence_mass) <= precursor_mass_tolerance: 409 | candidate_list.append(sequence) 410 | 411 | # if no candidate found, return empty sequence for this spectrum. 412 | if not candidate_list: 413 | predicted_batch.append(predicted) 414 | continue 415 | 416 | # if decoy is activated, randomly shuffle amino acids to form decoy db. 417 | if self.decoy: 418 | for x in candidate_list: 419 | shuffle(x) # this function works in place and returns None. 
420 | 421 | # add special GO/EOS and reverse 422 | # example: [['_GO', 'M', 'D', 'K', 'F', 'Nmod', 'K', 'K', '_EOS']] 423 | candidate_forward_list = [[deepnovo_config._GO] + x + [deepnovo_config._EOS] 424 | for x in candidate_list] 425 | candidate_backward_list = [x[::-1] for x in candidate_forward_list] 426 | 427 | # add PAD to all candidates to the same max length 428 | # [['_GO', 'M', 'D', 'K', 'F', 'Nmod', 'K', 'K', '_EOS', '_PAD', '_PAD']] 429 | # due to the same precursor mass, candidates have very similar lengths 430 | candidate_len_list = [len(x) for x in candidate_list] 431 | candidate_maxlen = max(candidate_len_list) 432 | for index, length in enumerate(candidate_len_list): 433 | if length < candidate_maxlen: 434 | pad_size = candidate_maxlen - length 435 | candidate_forward_list[index] += [deepnovo_config._PAD] * pad_size 436 | candidate_backward_list[index] += [deepnovo_config._PAD] * pad_size 437 | 438 | # score the spectrum against its candidates 439 | # using the forward model 440 | logprob_forward_list = self._score_spectrum( 441 | spectrum["precursor_mass"], 442 | spectrum["spectrum_original_forward"], 443 | state0_c_forward[spectrum_index], 444 | state0_h_forward[spectrum_index], 445 | candidate_forward_list, 446 | model, 447 | model.output_forward["logprob"], 448 | model.output_forward["lstm_state"], 449 | session, 450 | direction=0) 451 | # and using the backward model 452 | logprob_backward_list = self._score_spectrum( 453 | spectrum["precursor_mass"], 454 | spectrum["spectrum_original_backward"], 455 | state0_c_backward[spectrum_index], 456 | state0_h_backward[spectrum_index], 457 | candidate_backward_list, 458 | model, 459 | model.output_backward["logprob"], 460 | model.output_backward["lstm_state"], 461 | session, 462 | direction=1) 463 | 464 | # note that the candidates are grouped into minibatches 465 | # === candidate_len === 466 | # s 467 | # i 468 | # z 469 | # e 470 | # ===================== 471 | # logprob_forward_list is a list of 
candidate_maxlen arrays of shape 472 | # [minibatch_size, 26] 473 | # each row is log of probability distribution over 26 classes/symbols 474 | 475 | # find the best scoring candidate 476 | #~ test_handle = open("test_file", 'w') 477 | for index, candidate in enumerate(candidate_list): 478 | 479 | # only calculate score on the actual length, not on GO/EOS/PAD 480 | candidate_len = candidate_len_list[index] 481 | 482 | # align forward and backward logprob 483 | logprob_forward = [logprob_forward_list[position][index] 484 | for position in range(candidate_len)] 485 | logprob_backward = [logprob_backward_list[position][index] 486 | for position in range(candidate_len)] 487 | logprob_backward = logprob_backward[::-1] 488 | 489 | # score is the sum of logprob(AA) of the candidate in both directions 490 | # averaged by the candidate length 491 | position_score = [] 492 | for position in range(candidate_len): 493 | AA = candidate[position] 494 | AA_id = deepnovo_config.vocab[AA] 495 | position_score.append(logprob_forward[position][AA_id] 496 | + logprob_backward[position][AA_id]) 497 | score = sum(position_score) / candidate_len 498 | if score > predicted["score"]: 499 | predicted["sequence"] = candidate 500 | predicted["score"] = score 501 | predicted["position_score"] = position_score 502 | protein_access_id = self.peptide_2_protein_id.get( 503 | ''.join(candidate).replace('mod', ''), 504 | 'DENOVO') 505 | if isinstance(protein_access_id, set): 506 | protein_access_id = ','.join(list(protein_access_id)) 507 | predicted["protein_access_id"] = protein_access_id 508 | 509 | #~ if spectrum["feature_id"] == test_id: 510 | #~ print_candidate = ",".join(candidate) 511 | #~ print_score = "{0:.2f}".format(score) 512 | #~ print_row = "\t".join([print_candidate, print_score]) 513 | #~ print(print_row, file=test_handle, end="\n") 514 | #~ test_handle.close() 515 | #~ print(abc) 516 | predicted_batch.append(predicted) 517 | 518 | return predicted_batch 519 | 520 | 521 | 
-------------------------------------------------------------------------------- /aa_workflow_step_5.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | 4 | import sys 5 | import csv 6 | import re 7 | from Bio import SeqIO 8 | from Bio.SeqIO import FastaIO 9 | import Levenshtein 10 | import multiprocessing 11 | num_processes = 8 12 | import time 13 | 14 | 15 | WEAK_BINDING = 2.0 # NetMHC weak binding rank 16 | STRONG_BINDING = 0.5 # NetMHC strong binding rank 17 | 18 | AA_3_to_1 = { 19 | 'Ala':'A', 20 | 'Arg':'R', 21 | 'Asn':'N', 22 | 'Asp':'D', 23 | 'Cys':'C', 24 | 'Glu':'E', 25 | 'Gln':'Q', 26 | 'Gly':'G', 27 | 'His':'H', 28 | 'Ile':'I', 29 | 'Leu':'L', 30 | 'Lys':'K', 31 | 'Met':'M', 32 | 'Phe':'F', 33 | 'Pro':'P', 34 | 'Ser':'S', 35 | 'Thr':'T', 36 | 'Trp':'W', 37 | 'Tyr':'Y', 38 | 'Val':'V'} 39 | 40 | CODON_AA = { # dictionary {codon: aa} 41 | 'TTT':'F', 42 | 'TTC':'F', 43 | 'TTA':'L', 44 | 'TTG':'L', 45 | 'TCT':'S', 46 | 'TCC':'S', 47 | 'TCA':'S', 48 | 'TCG':'S', 49 | 'TAT':'Y', 50 | 'TAC':'Y', 51 | 'TAA':'X', 52 | 'TAG':'X', 53 | 'TGT':'C', 54 | 'TGC':'C', 55 | 'TGA':'X', 56 | 'TGG':'W', 57 | 'CTT':'L', 58 | 'CTC':'L', 59 | 'CTA':'L', 60 | 'CTG':'L', 61 | 'CCT':'P', 62 | 'CCC':'P', 63 | 'CCA':'P', 64 | 'CCG':'P', 65 | 'CAT':'H', 66 | 'CAC':'H', 67 | 'CAA':'Q', 68 | 'CAG':'Q', 69 | 'CGT':'R', 70 | 'CGC':'R', 71 | 'CGA':'R', 72 | 'CGG':'R', 73 | 'ATT':'I', 74 | 'ATC':'I', 75 | 'ATA':'I', 76 | 'ATG':'M', 77 | 'ACT':'T', 78 | 'ACC':'T', 79 | 'ACA':'T', 80 | 'ACG':'T', 81 | 'AAT':'N', 82 | 'AAC':'N', 83 | 'AAA':'K', 84 | 'AAG':'K', 85 | 'AGT':'S', 86 | 'AGC':'S', 87 | 'AGA':'R', 88 | 'AGG':'R', 89 | 'GTT':'V', 90 | 'GTC':'V', 91 | 'GTA':'V', 92 | 'GTG':'V', 93 | 'GCT':'A', 94 | 'GCC':'A', 95 | 'GCA':'A', 96 | 'GCG':'A', 97 | 'GAT':'D', 98 | 'GAC':'D', 99 | 'GAA':'E', 100 | 'GAG':'E', 101 | 'GGT':'G', 102 | 'GGC':'G', 103 | 'GGA':'G', 104 | 'GGG':'G'} 105 | 106 | AA_CODON ={} # 
dictionary {aa: list of codons} 107 | for codon, aa in CODON_AA.iteritems(): 108 | if aa in AA_CODON: 109 | AA_CODON[aa].append(codon) 110 | else: 111 | AA_CODON[aa] = [codon] 112 | 113 | AA_PAIRWISE_DISTANCE = {} # dictionary {(aa1, aa2): min_distance} 114 | for aa1 in AA_CODON: 115 | for aa2 in AA_CODON: 116 | if (aa1, aa2) not in AA_PAIRWISE_DISTANCE: 117 | min_distance = 3 118 | for codon1 in AA_CODON[aa1]: 119 | for codon2 in AA_CODON[aa2]: 120 | distance = Levenshtein.hamming(codon1, codon2) 121 | assert distance <= 3, "Error: codon distance > 3" 122 | min_distance = min(min_distance, distance) 123 | AA_PAIRWISE_DISTANCE[(aa1, aa2)] = min_distance 124 | AA_PAIRWISE_DISTANCE[(aa2, aa1)] = min_distance 125 | 126 | # a mutation pair (aa1, aa2) is missense if their codons are different by 1 nucleotide 127 | AA_PAIR_MISSENSE = [(aa1, aa2) for (aa1, aa2), min_distance in AA_PAIRWISE_DISTANCE.iteritems() 128 | if min_distance == 1] 129 | # for now, remove N-D, Q-E because not sure mutations or modifications 130 | AA_PAIR_MISSENSE.remove(('N', 'D')) 131 | AA_PAIR_MISSENSE.remove(('D', 'N')) 132 | AA_PAIR_MISSENSE.remove(('Q', 'E')) 133 | AA_PAIR_MISSENSE.remove(('E', 'Q')) 134 | 135 | 136 | def drop_mod_peaks(peptide): 137 | peptide = peptide.replace("M(+15.99)", "M") 138 | peptide = peptide.replace("N(+.98)", "N") 139 | peptide = peptide.replace("Q(+.98)", "Q") 140 | return peptide 141 | 142 | 143 | def read_denovo_psm(psm_file): 144 | 145 | print("read_denovo_psm()") 146 | print("psm_file:", psm_file) 147 | 148 | # store PSM of denovo peptides in a dictionary 149 | # {peptide: {'num_psm': , 'total_score': , 'total_abundance'}} 150 | denovo_peptide_psm = {} 151 | with open(psm_file, 'r') as input_handle: 152 | csv_reader = csv.DictReader(input_handle, delimiter=',') 153 | for row in csv_reader: 154 | accession = drop_mod_peaks(row['Accession']) 155 | if accession == 'DENOVO': 156 | peptide = drop_mod_peaks(row['Peptide']) 157 | score = float(row['-10lgP']) 158 | 
abundance = float(row['Area']) if row['Area'] else 0 159 | if peptide not in denovo_peptide_psm: 160 | denovo_peptide_psm[peptide] = {'num_psm': 1, 161 | 'total_score': score, 162 | 'total_abundance': abundance} 163 | else: 164 | denovo_peptide_psm[peptide]['num_psm'] += 1 165 | denovo_peptide_psm[peptide]['total_score'] += score 166 | denovo_peptide_psm[peptide]['total_abundance'] += abundance 167 | 168 | print("Number of denovo peptides:", len(denovo_peptide_psm)) 169 | num_psm_list = [x['num_psm'] for x in denovo_peptide_psm.values()] 170 | print("Number of denovo peptides with >= 1 psm: ", len([x for x in num_psm_list if x >= 1])) 171 | print("Number of denovo peptides with >= 2 psm: ", len([x for x in num_psm_list if x >= 2])) 172 | print("Number of denovo peptides with >= 3 psm: ", len([x for x in num_psm_list if x >= 3])) 173 | print() 174 | 175 | return denovo_peptide_psm 176 | 177 | 178 | def read_netmhc(netmhc_file): 179 | 180 | print("read_netmhc()") 181 | print("netmhc_file:", netmhc_file) 182 | 183 | # store NetMHC predictions of denovo peptides in a dictionary 184 | # {peptide: {'best_nM': , 'best_rank': , 'is_weak_binding': , 'is_strong_binding': }} 185 | peptide_netmhc = {} 186 | with open(netmhc_file, 'r') as input_handle: 187 | csv_reader = csv.DictReader(input_handle, delimiter=',') 188 | for row in csv_reader: 189 | peptide = row['Peptide'] 190 | if peptide not in peptide_netmhc: 191 | best_nM = min([float(row[x]) for x in ['nM1', 'nM2', 'nM3', 'nM4', 'nM5', 'nM6'] if x in csv_reader.fieldnames]) 192 | best_rank = min([float(row[x]) for x in ['Rank1', 'Rank2', 'Rank3', 'Rank4', 'Rank5', 'Rank6'] if x in csv_reader.fieldnames]) 193 | is_weak_binding = int(best_rank <= WEAK_BINDING) 194 | is_strong_binding = int(best_rank <= STRONG_BINDING) 195 | peptide_netmhc[peptide] = { 196 | 'best_nM': best_nM, 197 | 'best_rank': best_rank, 198 | 'is_weak_binding': is_weak_binding, 199 | 'is_strong_binding': is_strong_binding} 200 | else: 201 | 
print("Warning: duplicate peptide found in peptide_netmhc:", peptide) 202 | 203 | print("Number of peptides:", len(peptide_netmhc)) 204 | print("Number of peptides with weak binding: ", sum([x['is_weak_binding'] for x in peptide_netmhc.values()])) 205 | print("Number of peptides with strong binding: ", sum([x['is_strong_binding'] for x in peptide_netmhc.values()])) 206 | print() 207 | 208 | return peptide_netmhc 209 | 210 | 211 | def read_immunogenicity(immunogenicity_file): 212 | 213 | print("read_immunogenicity()") 214 | print("immunogenicity_file:", immunogenicity_file) 215 | 216 | # store immunogenicity of denovo peptides in a dictionary 217 | # {peptide: {'immunogenicity': }} 218 | peptide_immunogenicity = {} 219 | with open(immunogenicity_file, 'r') as input_handle: 220 | csv_reader = csv.DictReader(input_handle, delimiter=',') 221 | for row in csv_reader: 222 | peptide = row['peptide'] 223 | if peptide not in peptide_immunogenicity: 224 | score = float(row['score']) 225 | peptide_immunogenicity[peptide] = {'immunogenicity': score} 226 | else: 227 | print("Warning: duplicate peptide found in peptide_immunogenicity:", peptide) 228 | 229 | print("Number of peptides:", len(peptide_immunogenicity)) 230 | print() 231 | 232 | return peptide_immunogenicity 233 | 234 | 235 | def read_fasta(fasta_file, 236 | get_uniprot_id=False, 237 | get_enst_id=False, 238 | get_gene_name=False): 239 | 240 | print("read_fasta()") 241 | print("fasta_file:", fasta_file) 242 | print("get_uniprot_id:", get_uniprot_id) 243 | print("get_enst_id:", get_enst_id) 244 | print("get_gene_name:", get_gene_name) 245 | 246 | with open(fasta_file, 'r') as file_handle: 247 | record_list = list(SeqIO.parse(file_handle, "fasta")) 248 | protein_list = [] 249 | for record in record_list: 250 | uniprot_id = '' 251 | enst_id = '' 252 | gene_name = '' 253 | name = str(record.name) 254 | if get_uniprot_id: 255 | uniprot_id = name.split('|')[1] 256 | if get_enst_id: 257 | enst_id = name 258 | if 
get_gene_name: 259 | description_list = str(record.description).strip().split(' ') 260 | gene_name_list = [x for x in description_list if 'GN=' in x] 261 | if len(gene_name_list) == 1: 262 | gene_name = gene_name_list[0].split('=')[1] 263 | seq = str(record.seq) 264 | protein_list.append({'name': name, 265 | 'uniprot_id': uniprot_id, 266 | 'enst_id': enst_id, 267 | 'gene_name': gene_name, 268 | 'seq': seq}) 269 | 270 | print("Number of protein sequences in the fasta file: ", len(protein_list)) 271 | print() 272 | 273 | return protein_list 274 | 275 | 276 | def read_db_peptide(labeled_feature_file): 277 | 278 | print("read_db_peptide()") 279 | print("labeled_feature_file:", labeled_feature_file) 280 | 281 | db_peptide_set = set() 282 | with open(labeled_feature_file, 'r') as input_handle: 283 | csv_reader = csv.DictReader(input_handle, delimiter=',') 284 | for row in csv_reader: 285 | peptide = drop_mod_peaks(row['seq']) 286 | db_peptide_set.add(peptide) 287 | print("Number of db peptides identified at step 1: ", len(db_peptide_set)) 288 | print() 289 | 290 | return db_peptide_set 291 | 292 | 293 | def hamming1_align((peptide, protein_list)): 294 | 295 | # I and L are considered the same in this alignment 296 | query = peptide.replace('I', 'L') 297 | query_length = len(query) 298 | match_list = [] 299 | for protein in protein_list: 300 | subject = protein['seq'].replace('I', 'L') 301 | subject_length = len(subject) 302 | 303 | # First, find candidate locations by pigeonhole principle: 304 | # if hamming distance is 1, the left or right half must be exact match 305 | # Then, calculate hamming distance at candidate locations and return those equal to 1 306 | query_left = query[:query_length/2] 307 | query_right = query[query_length/2:] 308 | left_index = [x.start() for x in re.finditer(query_left, subject)] 309 | right_index = [x.start() for x in re.finditer(query_right, subject)] 310 | right_index = [(x - query_length/2) for x in right_index] 311 | candidate_index = 
left_index + right_index 312 | candidate_index = [x for x in candidate_index if x >= 0 and (x + query_length) <= subject_length] 313 | hamming1_index = [x for x in candidate_index 314 | if Levenshtein.hamming(query, subject[x : (x + query_length)]) == 1] 315 | 316 | if hamming1_index: 317 | match_list += [{'protein': protein, 'match_index': index} 318 | for index in hamming1_index] 319 | 320 | return peptide, match_list 321 | 322 | 323 | def find_mutation(peptide_list, protein_list): 324 | 325 | print("find_mutation()") 326 | 327 | print("Align peptides against protein sequences with 1 mismatch ...") 328 | print("Number of peptides: ", len(peptide_list)) 329 | print("Number of protein sequences:", len(protein_list)) 330 | print("I and L are considered the same in this alignment") 331 | start_time = time.time() 332 | pool = multiprocessing.Pool(processes=num_processes) 333 | search_list = [(peptide, protein_list) for peptide in peptide_list] 334 | result_list = pool.map(hamming1_align, search_list) 335 | print(time.time() - start_time, "seconds") 336 | print() 337 | 338 | peptide_mutation = {} 339 | protein_mutation = {} 340 | for peptide, match_list in result_list: 341 | missense_list = [] 342 | peptide_length = len(peptide) 343 | peptide_ItoL = peptide.replace('I', 'L') 344 | for match in match_list: 345 | protein = match['protein'] 346 | match_index = match['match_index'] 347 | 348 | wildtype = protein['seq'][match_index : (match_index + peptide_length)] 349 | wildtype_ItoL = wildtype.replace('I', 'L') 350 | mutation_index = [x for x in range(len(peptide_ItoL)) if peptide_ItoL[x] != wildtype_ItoL[x]] 351 | assert len(mutation_index) == 1, "Error: not 1 mutation found" 352 | mutation_index = mutation_index[0] 353 | mutation_wildtype = wildtype[mutation_index] 354 | mutation_aa = peptide[mutation_index] 355 | match['wildtype'] = wildtype 356 | match['mutation_pos'] = mutation_index + 1 357 | match['mutation_wt'] = mutation_wildtype 358 | match['mutation_aa'] = 
mutation_aa 359 | match['is_missense'] = int((mutation_aa, mutation_wildtype) in AA_PAIR_MISSENSE) 360 | notflanking = int(match['mutation_pos'] != 1 and match['mutation_pos'] != len(peptide)) 361 | match['is_missense_notflanking'] = match['is_missense'] * notflanking 362 | 363 | if match['is_missense_notflanking']: 364 | protein_mutation_entry = {'peptide': peptide, 'match_index': match['match_index']} 365 | if not protein['name'] in protein_mutation: 366 | protein_mutation[protein['name']] = [protein_mutation_entry] 367 | else: 368 | protein_mutation[protein['name']].append(protein_mutation_entry) 369 | 370 | num_hits = len(match_list) 371 | num_missense = len([x for x in match_list if x['is_missense'] == 1]) 372 | num_missense_notflanking = len([x for x in match_list if x['is_missense_notflanking'] == 1]) 373 | peptide_mutation[peptide] = {'num_hits': num_hits, 374 | 'num_missense': num_missense, 375 | 'num_missense_notflanking': num_missense_notflanking, 376 | 'match_list': match_list} 377 | 378 | print("Number of denovo peptides with >= 1 hits:", 379 | len([x for x in peptide_mutation.values() if x['num_hits'] >= 1])) 380 | print("Number of denovo peptides with >= 1 missense hits:", 381 | len([x for x in peptide_mutation.values() if x['num_missense'] >= 1])) 382 | print("Number of denovo peptides with >= 1 missense, not flanking hits:", 383 | len([x for x in peptide_mutation.values() if x['num_missense_notflanking'] >= 1])) 384 | print() 385 | 386 | return peptide_mutation, protein_mutation 387 | 388 | 389 | def read_missense_snp(snp_file, snp_enst_fasta, snp_sample_id): 390 | 391 | print("read_missense_snp()") 392 | print("snp_file:", snp_file) 393 | print("snp_enst_fasta:", snp_enst_fasta) 394 | print("snp_sample_id:", snp_sample_id) 395 | 396 | # read missense SNP 397 | snp_list = [] 398 | with open(snp_file, 'r') as input_handle: 399 | csv_reader = csv.DictReader(input_handle, delimiter=',') 400 | for row in csv_reader: 401 | mutation_type = row['Effect'] 
402 | if mutation_type == 'missense_variant' and snp_sample_id == row['Sample ID']: 403 | enst_id = row['ENSEMBL Transcript ID'] 404 | mutation_change = row['Aa change'] 405 | snp_list.append({'enst_id': enst_id, 'mutation_change': mutation_change}) 406 | print("Number of missense SNPs:", len(snp_list)) 407 | print() 408 | 409 | # read SNP Ensembl Transcript fasta 410 | protein_list = read_fasta(snp_enst_fasta, get_enst_id=True) 411 | # clean letter 'X' from the 1st position of some enst protein sequences 412 | for protein in protein_list: 413 | if protein['seq'][0] == 'X': 414 | protein['seq'] = protein['seq'][1:] 415 | # convert protein_list to a dictionary with key as Ensembl Transcript ID 416 | protein_dict = {} 417 | for protein in protein_list: 418 | enst_id = protein['enst_id'] 419 | assert enst_id not in protein_dict, "Error: duplicate enst_id" 420 | protein_dict[enst_id] = protein 421 | 422 | # cross-check snp_list and snp_enst_fasta for enst_id, location, and identity of mutated amino acid 423 | # because some transcripts were removed or updated, so their SNPs are no longer correct 424 | num_not_missense = 0 425 | snp_confirmed_list = [] 426 | enst_id_confirmed_set = set() 427 | for snp in snp_list: 428 | # example: Pro575Leu; note that the location is 1-based, not 0-based 429 | aa_3letter_ref = snp['mutation_change'][:3] 430 | aa_loc = int(snp['mutation_change'][3:-3]) 431 | aa_3letter_alt = snp['mutation_change'][-3:] 432 | aa_ref = AA_3_to_1[aa_3letter_ref] 433 | aa_alt = AA_3_to_1[aa_3letter_alt] 434 | enst_id = snp['enst_id'] 435 | if enst_id in protein_dict: 436 | protein = protein_dict[enst_id] 437 | if aa_loc-1 < len(protein['seq']) and aa_ref == protein['seq'][aa_loc-1]: 438 | snp_confirmed_list.append({'enst_id':snp['enst_id'], 439 | 'aa_loc': aa_loc, 440 | 'aa_ref': aa_ref, 441 | 'aa_alt': aa_alt}) 442 | enst_id_confirmed_set.add(enst_id) 443 | protein_confirmed_list = [protein_dict[enst_id] for enst_id in enst_id_confirmed_set] 444 | 445 | 
def match_peptide_snp(peptide_list, snp_file, snp_enst_fasta, snp_sample_id):
  """Match peptide mutations against confirmed missense SNPs of one sample.

  The confirmed SNPs and their transcripts are loaded with
  read_missense_snp(); the peptides are then aligned to those transcripts with
  find_mutation(). A match is reported when transcript id, 1-based mutated
  location, reference amino acid and alternative amino acid all agree
  (I and L are treated as the same residue).

  Args:
    peptide_list: peptides to look up.
    snp_file: csv file of SNPs, passed to read_missense_snp().
    snp_enst_fasta: Ensembl transcript fasta, passed to read_missense_snp().
    snp_sample_id: sample id used to filter the SNP file.

  Returns:
    dict {peptide: {'snp_list': [snp, ...]}}; each snp is a copy of the
    confirmed SNP entry extended with the key 'wildtype' of that match.
  """

  print('match_peptide_snp()')

  snp_list, protein_list = read_missense_snp(snp_file, snp_enst_fasta, snp_sample_id)
  peptide_mutation, _ = find_mutation(peptide_list, protein_list)
  peptide_snp = {}
  # items() works on both Python 2 and 3 (iteritems() is Python 2 only)
  for peptide, mutation in peptide_mutation.items():
    matched_snp_list = []
    peptide_snp[peptide] = {'snp_list': matched_snp_list}
    if mutation['num_hits'] > 0:
      for match in mutation['match_list']:
        enst_id = match['protein']['enst_id']
        # match_index + mutation_pos is compared with the 1-based SNP location
        mutation_loc = match['match_index'] + match['mutation_pos']
        mutation_aa = match['mutation_aa'].replace('I', 'L')
        for snp in snp_list:
          if (enst_id == snp['enst_id']
              and mutation_loc == snp['aa_loc']
              and match['mutation_wt'] == snp['aa_ref']
              and mutation_aa == snp['aa_alt'].replace('I', 'L')):
            # copy the SNP entry: updating it in place would overwrite the
            # 'wildtype' already reported for other peptides sharing this SNP
            matched_snp_list.append(dict(snp, wildtype=match['wildtype']))

  num_peptide_snp = len([x for x in peptide_snp.values() if x['snp_list']])
  print('Number of peptide mutations match to SNPs:', num_peptide_snp)
  for peptide in peptide_snp:
    if peptide_snp[peptide]['snp_list']:
      print(peptide, peptide_snp[peptide]['snp_list'])
  print()

  return peptide_snp
if netmhc_file: 495 | denovo_netmhc = read_netmhc(netmhc_file) 496 | else: 497 | denovo_netmhc = None 498 | denovo_peptide_list = denovo_psm.keys() 499 | if immunogenicity_file: 500 | denovo_immunogenicity = read_immunogenicity(immunogenicity_file) 501 | else: 502 | denovo_immunogenicity = None 503 | denovo_peptide_list = denovo_psm.keys() 504 | 505 | print("Find denovo mutations with respect to the reference fasta:") 506 | protein_list = read_fasta(db_fasta_file) 507 | denovo_mutation, protein_mutation = find_mutation(denovo_peptide_list, protein_list) 508 | 509 | print("Write protein with missense and not flanking mutations:") 510 | print("output_protein_mutation:", output_protein_mutation) 511 | print() 512 | with open(output_protein_mutation, 'w') as output_handle: 513 | fieldnames = ['protein_name', 'num_peptide', 'peptide_list'] 514 | csv_writer = csv.DictWriter(output_handle, fieldnames=fieldnames, delimiter=',') 515 | csv_writer.writeheader() 516 | for protein_name, peptide_list in protein_mutation.iteritems(): 517 | row = {'protein_name': protein_name, 518 | 'num_peptide': len(peptide_list), 519 | 'peptide_list': peptide_list} 520 | csv_writer.writerow(row) 521 | 522 | print("Find wildtypes in identified db peptides") 523 | db_peptide_set = read_db_peptide(labeled_feature_file) 524 | for peptide in denovo_mutation: 525 | num_missense_db = 0 526 | num_missense_notflanking_db = 0 527 | for match in denovo_mutation[peptide]['match_list']: 528 | match['is_missense_db'] = match['is_missense'] * int(match['wildtype'] in db_peptide_set) 529 | match['is_missense_notflanking_db'] = match['is_missense_notflanking'] * int(match['wildtype'] in db_peptide_set) 530 | num_missense_db += match['is_missense_db'] 531 | num_missense_notflanking_db += match['is_missense_notflanking_db'] 532 | denovo_mutation[peptide]['num_missense_db'] = num_missense_db 533 | denovo_mutation[peptide]['num_missense_notflanking_db'] = num_missense_notflanking_db 534 | print("Number of denovo 
peptides with >= 1 missense_db hits:", 535 | len([x for x in denovo_mutation.values() if x['num_missense_db'] >= 1])) 536 | print("Number of denovo peptides with >= 1 missense_notflanking_db hits:", 537 | len([x for x in denovo_mutation.values() if x['num_missense_notflanking_db'] >= 1])) 538 | print() 539 | 540 | if snp_file: 541 | print("Find denovo mutations match to SNPs:") 542 | denovo_snp = match_peptide_snp(denovo_peptide_list, snp_file, snp_enst_fasta, snp_sample_id) 543 | else: 544 | denovo_snp = None 545 | 546 | print("Write neoantigen criteria:") 547 | print("output_neoantigen_criteria:", output_neoantigen_criteria) 548 | print() 549 | with open(output_neoantigen_criteria, 'w') as output_handle: 550 | fieldnames = ['peptide', 551 | 'num_psm', 552 | 'total_score', 553 | 'total_abundance', 554 | 'best_nM', 555 | 'best_rank', 556 | 'is_weak_binding', 557 | 'is_strong_binding', 558 | 'immunogenicity', 559 | 'num_hits', 560 | 'num_missense', 561 | 'num_missense_notflanking', 562 | 'num_missense_db', 563 | 'num_missense_notflanking_db', 564 | 'match_list', 565 | 'snp_list'] 566 | csv_writer = csv.DictWriter(output_handle, fieldnames=fieldnames, delimiter=',') 567 | csv_writer.writeheader() 568 | for peptide in denovo_peptide_list: 569 | row = {'peptide': peptide} 570 | row.update(denovo_psm[peptide]) 571 | if denovo_netmhc is not None and peptide in denovo_netmhc: 572 | row.update(denovo_netmhc[peptide]) 573 | if denovo_immunogenicity is not None and peptide in denovo_immunogenicity: 574 | row.update(denovo_immunogenicity[peptide]) 575 | row.update(denovo_mutation[peptide]) 576 | if denovo_snp is not None: 577 | row.update(denovo_snp[peptide]) 578 | for match in row['match_list']: 579 | match['protein'] = match['protein']['name'] 580 | csv_writer.writerow(row) 581 | 582 | print("Selection criteria: >= 1 missense, not flanking hits AND >= 2 psm") 583 | num_selection = len([peptide for peptide in denovo_peptide_list 584 | if 
class WorkerIO(object):
  """Random-access reader of a spectrum file and a feature file, and writer of
  prediction rows.

  Data dependencies between the methods:
    open_input() opens the handles used by get_location();
    get_location() fills the location tables used by split_feature_index()
    and get_spectrum();
    open_output() opens the handle used by write_prediction().
  """


  def __init__(self, input_spectrum_file, input_feature_file, output_file=None):
    """Store settings from deepnovo_config and reset all tables and counters.

    The input_file could be input_file or input_file_train/valid/test.
    The output_file is None for train/valid/test cases.
    During training we use two separate WorkerIO objects for train and valid.

    No file is opened here; see open_input() and open_output().
    """

    print("".join(["="] * 80)) # section-separating line
    print("WorkerIO: __init__()")

    # we currently use deepnovo_config to store both const & settings
    # the settings should be shown in __init__() to keep track carefully
    self.MZ_MAX = deepnovo_config.MZ_MAX
    self.MZ_SIZE = deepnovo_config.MZ_SIZE
    self.batch_size = deepnovo_config.batch_size
    self.header_seq = deepnovo_config.FLAGS.header_seq
    self.neighbor_size = deepnovo_config.neighbor_size
    print("neighbor_size = {0:d}".format(self.neighbor_size))
    self.dia_window = deepnovo_config.dia_window

    self.input_spectrum_file = input_spectrum_file
    self.input_feature_file = input_feature_file
    self.output_file = output_file
    # "{0}" instead of "{0:s}": output_file defaults to None and formatting
    # None with the "s" spec raises TypeError on Python 3
    print("input_spectrum_file = {0}".format(self.input_spectrum_file))
    print("input_feature_file = {0}".format(self.input_feature_file))
    print("output_file = {0}".format(self.output_file))
    # keep the file handles open throughout the process to read/write batches
    self.input_spectrum_handle = None
    self.input_feature_handle = None
    self.output_handle = None

    # split data into batches
    self.feature_index_list = []
    self.feature_index_batch_list = []
    self.feature_index_batch_count = 0

    ### store file location of each feature for random access
    self.feature_location_list = []

    # store the file location of all spectra for random access
    self.spectrum_location_dict = {}
    self.spectrum_rtinseconds_dict = {}

    # record the status of spectra that have been read
    self.feature_count = {"total": 0,
                          "read": 0,
                          "skipped": 0,
                          "skipped_mass": 0}
    self.spectrum_count = 0


  def close_input(self):
    """Close the spectrum and feature handles opened by open_input()."""

    print("".join(["="] * 80)) # section-separating line
    print("WorkerIO: close_input()")

    self.input_spectrum_handle.close()
    self.input_feature_handle.close()


  def close_output(self):
    """Close the output handle opened by open_output()."""

    print("".join(["="] * 80)) # section-separating line
    print("WorkerIO: close_output()")

    self.output_handle.close()


  def get_spectrum(self, feature_index_batch):
    """Read and pre-process the features of one batch.

    Args:
      feature_index_batch: indices into self.feature_location_list.

    Returns:
      A list with one dict per feature that was read. Features whose
      precursor_mass exceeds MZ_MAX are skipped and counted in
      self.feature_count["skipped"] and ["skipped_mass"].
    """

    spectrum_list = []
    for feature_index in feature_index_batch:
      # parse a feature
      feature_location = self.feature_location_list[feature_index]
      (feature_id,
       feature_area,
       precursor_mz,
       precursor_charge,
       rt_mean,
       raw_sequence,
       scan_list,
       ms1_list) = self._parse_feature(feature_location)
      # skip if precursor_mass > MZ_MAX
      precursor_mass = precursor_mz * precursor_charge - deepnovo_config.mass_H * precursor_charge
      if precursor_mass > self.MZ_MAX:
        self.feature_count["skipped"] += 1
        self.feature_count["skipped_mass"] += 1
        continue
      self.feature_count["read"] += 1
      # parse and process spectrum
      (spectrum_holder,
       spectrum_original_forward,
       spectrum_original_backward,
       scan_list_middle,
       scan_list_original,
       ms1_profile) = self._parse_spectrum(precursor_mz, precursor_mass, rt_mean, scan_list, ms1_list)
      # update dataset
      spectrum = {"feature_id": feature_id,
                  "feature_area": feature_area,
                  "raw_sequence": raw_sequence,
                  "precursor_mass": precursor_mass,
                  "spectrum_holder": spectrum_holder,
                  "spectrum_original_forward": spectrum_original_forward,
                  "spectrum_original_backward": spectrum_original_backward,
                  "precursor_mz": precursor_mz,
                  "precursor_charge": precursor_charge,
                  "scan_list_middle": scan_list_middle,
                  "scan_list_original": scan_list_original,
                  "ms1_profile": ms1_profile}
      spectrum_list.append(spectrum)

    return spectrum_list


  def get_location(self):
    """Index the spectrum and feature files for random access.

    Fills self.spectrum_location_dict {scan: offset of its "BEGIN IONS" line},
    self.spectrum_rtinseconds_dict {scan: RTINSECONDS}, self.spectrum_count,
    self.feature_location_list (offset of every line after the header line),
    self.feature_count["total"] and self.feature_index_list.

    The spectrum index is cached in "<input_spectrum_file>.locations.pkl" and
    reloaded from there whenever that file exists.
    """

    print("".join(["="] * 80)) # section-separating line
    print("WorkerIO: get_location()")

    ### store file location of each spectrum for random access {scan:location}
    ### since mgf file can be rather big, cache the locations for each spectrum mgf file.
    spectrum_location_file = self.input_spectrum_file + '.locations.pkl'
    if os.path.exists(spectrum_location_file):
      print("WorkerIO: read cached spectrum locations")
      # NOTE(review): unpickling executes whatever the cache file contains and
      # the cache is never checked against the spectrum file; only keep cache
      # files in a trusted location and delete them when the input changes.
      with open(spectrum_location_file, 'rb') as fr:
        data = pickle.load(fr)
        self.spectrum_location_dict, self.spectrum_rtinseconds_dict, self.spectrum_count = data
    else:
      print("WorkerIO: build spectrum location from scratch")
      spectrum_location_dict = {}
      spectrum_rtinseconds_dict = {}
      spectrum_handle = self.input_spectrum_handle
      while True:
        # remember where the line starts before consuming it
        current_location = spectrum_handle.tell()
        line = spectrum_handle.readline()
        if not line:
          break
        if "BEGIN IONS" in line:
          spectrum_location = current_location
        elif "SCANS=" in line:
          scan = re.split('=|\r|\n', line)[1]
          spectrum_location_dict[scan] = spectrum_location
        elif "RTINSECONDS=" in line:
          # belongs to the most recent SCANS= line
          rtinseconds = float(re.split('=|\r|\n', line)[1])
          spectrum_rtinseconds_dict[scan] = rtinseconds
      self.spectrum_location_dict = spectrum_location_dict
      self.spectrum_rtinseconds_dict = spectrum_rtinseconds_dict
      self.spectrum_count = len(spectrum_location_dict)
      with open(spectrum_location_file, 'wb') as fw:
        pickle.dump((self.spectrum_location_dict, self.spectrum_rtinseconds_dict, self.spectrum_count), fw)

    ### store file location of each feature for random access
    feature_handle = self.input_feature_handle
    feature_location_list = []
    # skip header line
    _ = feature_handle.readline()
    while True:
      feature_location = feature_handle.tell()
      if not feature_handle.readline():
        break # end of file: this offset does not start a feature line
      feature_location_list.append(feature_location)
    self.feature_location_list = feature_location_list
    self.feature_count["total"] = len(feature_location_list)
    # a real list, so that batches are lists on Python 3 as they are on Python 2
    self.feature_index_list = list(range(self.feature_count["total"]))

    print("spectrum_count = {0:d}".format(self.spectrum_count))
    print("feature_count[total] = {0:d}".format(self.feature_count["total"]))


  def open_input(self):
    """Open the spectrum file and the feature file for reading (text mode)."""

    print("".join(["="] * 80)) # section-separating line
    print("WorkerIO: open_input()")

    self.input_spectrum_handle = open(self.input_spectrum_file, 'r')
    self.input_feature_handle = open(self.input_feature_file, 'r')


  def open_output(self):
    """Open (truncate) the output file and write the header row."""

    print("".join(["="] * 80)) # section-separating line
    print("WorkerIO: open_output()")

    self.output_handle = open(self.output_file, 'w')
    self._print_prediction_header()


  def split_feature_index(self):
    """Cut self.feature_index_list into consecutive batches of batch_size.

    The result is stored in self.feature_index_batch_list and its length in
    self.feature_index_batch_count; the last batch may be shorter.
    """

    print("".join(["="] * 80)) # section-separating line
    print("WorkerIO: split_index()")

    index_batch_list = [self.feature_index_list[i:(i+self.batch_size)]
                        for i in range(0,
                                       self.feature_count["total"],
                                       self.batch_size)]

    self.feature_index_batch_list = index_batch_list
    self.feature_index_batch_count = len(self.feature_index_batch_list)


  def write_prediction(self, predicted_batch):
    """Write one tab-separated row per prediction to the output handle.

    Args:
      predicted_batch: iterable of dicts with the keys "feature_id",
        "feature_area", "precursor_mz", "precursor_charge",
        "scan_list_middle", "scan_list_original" and "sequence". When
        "sequence" is non-empty, "score" and "position_score" are read too,
        and "protein_access_id" if present (otherwise 'DENOVO' is written).
        An empty "sequence" gives a row with empty prediction columns.
    """

    for predicted in predicted_batch:
      feature_id = predicted["feature_id"]
      feature_area = str(predicted["feature_area"])
      precursor_mz = str(predicted["precursor_mz"])
      precursor_charge = str(predicted["precursor_charge"])
      scan_list_middle = ";".join(predicted["scan_list_middle"])
      scan_list_original = ";".join(predicted["scan_list_original"])
      if predicted["sequence"]:
        # candidates are separated by ';', items of one candidate by ','
        predicted_sequence = ';'.join([','.join(x) for x in predicted["sequence"]])
        predicted_score = ';'.join(['{0:.2f}'.format(x) for x in predicted["score"]])
        predicted_score_max = '{0:.2f}'.format(np.max(predicted["score"]))
        predicted_position_score = ';'.join([
            ','.join(['{0:.2f}'.format(y) for y in x])
            for x in predicted["position_score"]])
        if "protein_access_id" in predicted:
          # predicted_batch is returned from search_db
          protein_access_id = predicted['protein_access_id']
        else:
          # predicted_batch is returned from search_denovo
          protein_access_id = 'DENOVO'
      else: # if no peptide found, write empty sequence to the output file
        predicted_sequence = ""
        predicted_score = ""
        predicted_score_max = ""
        predicted_position_score = ""
        protein_access_id = ""
      # column order must agree with _print_prediction_header()
      predicted_row = "\t".join([feature_id,
                                 feature_area,
                                 predicted_sequence,
                                 predicted_score,
                                 predicted_position_score,
                                 precursor_mz,
                                 precursor_charge,
                                 protein_access_id,
                                 scan_list_middle,
                                 scan_list_original,
                                 predicted_score_max])
      print(predicted_row, file=self.output_handle, end="\n")


  def _append_zero_spectra(self, spectrum_lists, pad_count):
    """Append pad_count all-zero (1, MZ_SIZE) float32 rows to each given list."""

    for _ in range(pad_count):
      for spectrum_rows in spectrum_lists:
        spectrum_rows.append(np.zeros(shape=(1, self.MZ_SIZE),
                                      dtype=np.float32))


  def _parse_spectrum(self, precursor_mz, precursor_mass, rt_mean, scan_list, ms1_list):
    """Read the spectra around the scan closest to rt_mean and stack them.

    The scan of scan_list whose RTINSECONDS is nearest to rt_mean becomes the
    center; up to neighbor_size // 2 scans are taken on each side and the
    missing ones are replaced by zero rows.

    Args:
      precursor_mz: not used here; kept for the callers' signature.
      precursor_mass: passed to process_spectrum().
      rt_mean: retention time used to pick the center scan.
      scan_list: scan ids of the feature (keys of the location tables).
      ms1_list: entries parallel to scan_list; the number after ':' is the
        ms1 intensity.

    Returns:
      (spectrum_holder, spectrum_original_forward, spectrum_original_backward,
       scan_list_middle, scan_list, ms1_profile); spectrum_holder is divided
      by its maximum unless that maximum is zero.

    Raises:
      AssertionError: on unexpected spectrum header lines, a zero ms1
        maximum, or a stacked shape other than (neighbor_size, MZ_SIZE).
    """

    spectrum_holder_list = []
    spectrum_original_forward_list = []
    spectrum_original_backward_list = []
    spectrum_lists = (spectrum_holder_list,
                      spectrum_original_forward_list,
                      spectrum_original_backward_list)

    ### select best neighbors from the scan_list by their distance to rt_mean
    # probably move this selection to get_location(), run once rather than repeating
    best_scan_index = None
    best_distance = float('inf')
    for scan_index, scan in enumerate(scan_list):
      distance = abs(self.spectrum_rtinseconds_dict[scan] - rt_mean)
      if distance < best_distance:
        best_distance = distance
        best_scan_index = scan_index
    neighbor_center = best_scan_index
    neighbor_size_half = self.neighbor_size // 2
    neighbor_left_count = min(neighbor_center, neighbor_size_half)
    neighbor_right_count = min(len(scan_list) - neighbor_center - 1, neighbor_size_half)
    left_pad_count = neighbor_size_half - neighbor_left_count
    right_pad_count = neighbor_size_half - neighbor_right_count

    ### padding zero arrays to the left if not enough neighbor spectra
    self._append_zero_spectra(spectrum_lists, left_pad_count)

    ### parse and add neighbor spectra
    scan_list_middle = []
    ms1_intensity_list_middle = []
    for index in range(neighbor_center - neighbor_left_count, neighbor_center + neighbor_right_count + 1):
      scan_list_middle.append(scan_list[index])
      ms1_intensity_list_middle.append(float(re.split(':', ms1_list[index])[1]))
    ms1_intensity_max = max(ms1_intensity_list_middle)
    assert ms1_intensity_max > 0.0, "Error: Zero ms1_intensity_max"
    ms1_intensity_list_middle = [x/ms1_intensity_max for x in ms1_intensity_list_middle]
    for scan in scan_list_middle:
      self.input_spectrum_handle.seek(self.spectrum_location_dict[scan])
      # parse header lines: they must appear in exactly this order
      for header_tag in ("BEGIN IONS", "TITLE=", "PEPMASS=", "CHARGE=", "SCANS=", "RTINSECONDS="):
        line = self.input_spectrum_handle.readline()
        assert header_tag in line, "Error: wrong input " + header_tag
      # parse fragment ions
      mz_list, intensity_list = self._parse_spectrum_ion()
      # pre-process spectrum
      (spectrum_holder,
       spectrum_original_forward,
       spectrum_original_backward) = process_spectrum(mz_list,
                                                      intensity_list,
                                                      precursor_mass)
      # add spectrum to the neighbor list
      spectrum_holder_list.append(spectrum_holder)
      spectrum_original_forward_list.append(spectrum_original_forward)
      spectrum_original_backward_list.append(spectrum_original_backward)

    ### padding zero arrays to the right if not enough neighbor spectra
    self._append_zero_spectra(spectrum_lists, right_pad_count)

    spectrum_holder = np.vstack(spectrum_holder_list)
    spectrum_original_forward = np.vstack(spectrum_original_forward_list)
    spectrum_original_backward = np.vstack(spectrum_original_backward_list)
    assert spectrum_holder.shape == (self.neighbor_size,
                                     self.MZ_SIZE), "Error:shape"
    # spectrum-CNN normalization: by feature
    # skip it when every value is zero: 0/0 would turn the whole array into NaN
    spectrum_holder_max = np.max(spectrum_holder)
    if spectrum_holder_max != 0:
      spectrum_holder /= spectrum_holder_max

    # ms1_profile: zero-padded on both sides like the spectra
    ms1_intensity_list_middle = ([0.0] * left_pad_count
                                 + ms1_intensity_list_middle
                                 + [0.0] * right_pad_count)
    assert len(ms1_intensity_list_middle) == self.neighbor_size, "Error: ms1 profile"
    ms1_profile = np.array(ms1_intensity_list_middle)

    return spectrum_holder, spectrum_original_forward, spectrum_original_backward, scan_list_middle, scan_list, ms1_profile


  def _parse_feature(self, feature_location):
    """Parse the feature line stored at feature_location.

    Columns are selected with the deepnovo_config.col_* indices. An empty
    feature_area column is replaced by 1.0.

    Returns:
      (feature_id, feature_area, precursor_mz, precursor_charge, rt_mean,
       raw_sequence, scan_list, ms1_list); scan_list and ms1_list are the
      ';'-separated items of their columns.

    Raises:
      AssertionError: if scan_list and ms1_list differ in length.
    """

    self.input_feature_handle.seek(feature_location)
    line = self.input_feature_handle.readline()
    line = re.split(',|\r|\n', line)
    feature_id = line[deepnovo_config.col_feature_id]
    feature_area_str = line[deepnovo_config.col_feature_area]
    feature_area = float(feature_area_str) if feature_area_str else 1.0
    precursor_mz = float(line[deepnovo_config.col_precursor_mz])
    precursor_charge = float(line[deepnovo_config.col_precursor_charge])
    rt_mean = float(line[deepnovo_config.col_rt_mean])
    raw_sequence = line[deepnovo_config.col_raw_sequence]
    scan_list = re.split(';', line[deepnovo_config.col_scan_list])
    ms1_list = re.split(';', line[deepnovo_config.col_ms1_list])
    assert len(scan_list) == len(ms1_list), "Error: scan_list and ms1_list not matched."

    return feature_id, feature_area, precursor_mz, precursor_charge, rt_mean, raw_sequence, scan_list, ms1_list


  def _parse_spectrum_ion(self):
    """Read fragment ions from the spectrum handle up to the "END IONS" line.

    Each ion line holds m/z and intensity as its first two fields, separated
    by any whitespace (spaces or tabs).

    Returns:
      (mz_list, intensity_list) of floats; ions with m/z > MZ_MAX are dropped.

    Raises:
      ValueError: if the file ends before "END IONS", or a line does not hold
        two numeric fields.
    """

    # ion
    mz_list = []
    intensity_list = []
    while True:
      line = self.input_spectrum_handle.readline()
      if not line:
        raise ValueError("Error: end of spectrum file reached before END IONS")
      if "END IONS" in line:
        break
      fields = line.split()
      if len(fields) < 2:
        raise ValueError("Error: malformed fragment ion line: {0!r}".format(line))
      mz_float = float(fields[0])
      intensity_float = float(fields[1])
      # skip an ion if its mass > MZ_MAX
      if mz_float > self.MZ_MAX:
        continue
      mz_list.append(mz_float)
      intensity_list.append(intensity_float)

    return mz_list, intensity_list


  def _print_prediction_header(self):
    """Write the tab-separated header row to the output handle."""

    print("".join(["="] * 80)) # section-separating line
    print("WorkerIO: _print_prediction_header()")

    # column order must agree with the rows built in write_prediction()
    header_list = ["feature_id",
                   "feature_area",
                   "predicted_sequence",
                   "predicted_score",
                   "predicted_position_score",
                   "precursor_mz",
                   "precursor_charge",
                   "protein_access_id",
                   "scan_list_middle",
                   "scan_list_original",
                   "predicted_score_max"]
    header_row = "\t".join(header_list)
    print(header_row, file=self.output_handle, end="\n")
def get_spectrum(self, feature_index_batch, input_feature_file_handle, input_spectrum_file_handle):
  """Read and pre-process one batch of features through caller-supplied handles.

  Args:
    feature_index_batch: indices into self.feature_location_list.
    input_feature_file_handle: open feature file, passed to _parse_feature().
    input_spectrum_file_handle: open spectrum file, passed to _parse_spectrum().

  Returns:
    A list with one dict per feature that was read; a feature whose
    precursor_mass exceeds MZ_MAX is skipped and counted in
    self.feature_count["skipped"] and ["skipped_mass"].
  """

  batch_spectra = []
  for feature_index in feature_index_batch:
    # locate and parse the feature line
    parsed_feature = self._parse_feature(self.feature_location_list[feature_index],
                                         input_feature_file_handle)
    (feature_id,
     feature_area,
     precursor_mz,
     precursor_charge,
     rt_mean,
     raw_sequence,
     scan_list,
     ms1_list) = parsed_feature

    precursor_mass = precursor_mz * precursor_charge - deepnovo_config.mass_H * precursor_charge
    if not precursor_mass > self.MZ_MAX:
      self.feature_count["read"] += 1
    else:
      # too heavy for the model: count it and move on to the next feature
      self.feature_count["skipped"] += 1
      self.feature_count["skipped_mass"] += 1
      continue

    # read the neighbor spectra of this feature
    parsed_spectrum = self._parse_spectrum(precursor_mz, precursor_mass, rt_mean, scan_list, ms1_list, input_spectrum_file_handle)
    (spectrum_holder,
     spectrum_original_forward,
     spectrum_original_backward,
     scan_list_middle,
     scan_list_original,
     ms1_profile) = parsed_spectrum

    # one record per feature
    record = {}
    record["feature_id"] = feature_id
    record["feature_area"] = feature_area
    record["raw_sequence"] = raw_sequence
    record["precursor_mass"] = precursor_mass
    record["spectrum_holder"] = spectrum_holder
    record["spectrum_original_forward"] = spectrum_original_forward
    record["spectrum_original_backward"] = spectrum_original_backward
    record["precursor_mz"] = precursor_mz
    record["precursor_charge"] = precursor_charge
    record["scan_list_middle"] = scan_list_middle
    record["scan_list_original"] = scan_list_original
    record["ms1_profile"] = ms1_profile
    batch_spectra.append(record)

  return batch_spectra
def _parse_spectrum(self, precursor_mz, precursor_mass, rt_mean, scan_list, ms1_list, input_file_handle):
  """Build the stacked neighbor spectra and the MS1 profile of one feature.

  The scan in `scan_list` whose retention time (looked up in
  `self.spectrum_rtinseconds_dict`) is closest to `rt_mean` becomes the center
  of a window of `self.neighbor_size` rows; ties go to the earliest scan. Up to
  `self.neighbor_size // 2` scans on each side of the center are read from
  `input_file_handle`. Missing neighbors are replaced by all-zero rows so the
  center scan stays in the middle row.

  Args:
    precursor_mz: not used by this method; kept so the signature is unchanged.
    precursor_mass: forwarded to `process_spectrum` for every selected scan.
    rt_mean: retention time used to choose the center scan.
    scan_list: scan ids; each must be a key of
      `self.spectrum_rtinseconds_dict` and `self.spectrum_location_dict`.
    ms1_list: entries parallel to `scan_list`; the second ':'-separated field
      of each entry is parsed as the MS1 intensity.
    input_file_handle: seekable text handle of the spectrum file.

  Returns:
    A tuple `(spectrum_holder, spectrum_original_forward,
    spectrum_original_backward, scan_list_middle, scan_list, ms1_profile)`.
    The three spectrum arrays are the row-wise stacks of the per-scan arrays
    returned by `process_spectrum` plus the zero padding; `spectrum_holder`
    is divided by its maximum. `scan_list_middle` lists the scans actually
    read, `scan_list` is returned unchanged, and `ms1_profile` holds the MS1
    intensities of the window divided by their maximum, zero-padded to
    `self.neighbor_size` entries.

  Raises:
    AssertionError: if the largest selected MS1 intensity is not positive, if
      a spectrum header line is not the expected one, or if the stacked
      shape / profile length does not match `self.neighbor_size`.
  """

  ### select best neighbors from the scan_list by their distance to rt_mean
  # probably move this selection to get_location(), run once rather than repeating
  neighbor_center = None
  best_distance = float('inf')
  for scan_index, scan in enumerate(scan_list):
    distance = abs(self.spectrum_rtinseconds_dict[scan] - rt_mean)
    if distance < best_distance:
      best_distance = distance
      neighbor_center = scan_index
  neighbor_size_half = self.neighbor_size // 2
  neighbor_left_count = min(neighbor_center, neighbor_size_half)
  neighbor_right_count = min(len(scan_list) - neighbor_center - 1,
                             neighbor_size_half)
  # number of all-zero rows needed on each side to fill the window
  pad_left = neighbor_size_half - neighbor_left_count
  pad_right = neighbor_size_half - neighbor_right_count

  ### ms1 intensities of the selected scans, scaled by their maximum
  middle_indices = range(neighbor_center - neighbor_left_count,
                         neighbor_center + neighbor_right_count + 1)
  scan_list_middle = [scan_list[index] for index in middle_indices]
  ms1_intensity_list_middle = [float(ms1_list[index].split(':')[1])
                               for index in middle_indices]
  ms1_intensity_max = max(ms1_intensity_list_middle)
  assert ms1_intensity_max > 0.0, "Error: Zero ms1_intensity_max"
  ms1_intensity_list_middle = [x / ms1_intensity_max
                               for x in ms1_intensity_list_middle]

  def zero_rows(count):
    # fresh arrays on every call so that no two lists share a buffer
    return [np.zeros(shape=(1, self.MZ_SIZE), dtype=np.float32)
            for _ in range(count)]

  ### padding zero arrays to the left if not enough neighbor spectra
  spectrum_holder_list = zero_rows(pad_left)
  spectrum_original_forward_list = zero_rows(pad_left)
  spectrum_original_backward_list = zero_rows(pad_left)

  ### parse and add neighbor spectra
  # header lines are expected in exactly this order at every spectrum location
  header_tags = ("BEGIN IONS", "TITLE=", "PEPMASS=", "CHARGE=", "SCANS=",
                 "RTINSECONDS=")
  for scan in scan_list_middle:
    input_file_handle.seek(self.spectrum_location_dict[scan])
    # parse header lines
    for tag in header_tags:
      line = input_file_handle.readline()
      assert tag in line, "Error: wrong input " + tag
    # parse fragment ions
    mz_list, intensity_list = self._parse_spectrum_ion(input_file_handle)
    # pre-process spectrum
    (spectrum_holder,
     spectrum_original_forward,
     spectrum_original_backward) = process_spectrum(mz_list,
                                                    intensity_list,
                                                    precursor_mass)
    # add spectrum to the neighbor list
    spectrum_holder_list.append(spectrum_holder)
    spectrum_original_forward_list.append(spectrum_original_forward)
    spectrum_original_backward_list.append(spectrum_original_backward)

  ### padding zero arrays to the right if not enough neighbor spectra
  spectrum_holder_list.extend(zero_rows(pad_right))
  spectrum_original_forward_list.extend(zero_rows(pad_right))
  spectrum_original_backward_list.extend(zero_rows(pad_right))

  spectrum_holder = np.vstack(spectrum_holder_list)
  spectrum_original_forward = np.vstack(spectrum_original_forward_list)
  spectrum_original_backward = np.vstack(spectrum_original_backward_list)
  assert spectrum_holder.shape == (self.neighbor_size,
                                   self.MZ_SIZE), "Error:shape"
  # spectrum-CNN normalization: by feature
  # an all-zero stack is left as is; dividing by a zero maximum would turn
  # every entry into NaN
  holder_max = np.max(spectrum_holder)
  if holder_max != 0:
    spectrum_holder /= holder_max

  # ms1_profile, zero-padded the same way as the spectrum rows
  ms1_intensity_list_middle = ([0.0] * pad_left
                               + ms1_intensity_list_middle
                               + [0.0] * pad_right)
  assert len(ms1_intensity_list_middle) == self.neighbor_size, "Error: ms1 profile"
  ms1_profile = np.array(ms1_intensity_list_middle)

  return spectrum_holder, spectrum_original_forward, spectrum_original_backward, scan_list_middle, scan_list, ms1_profile
def _parse_spectrum_ion(self, input_file_handle):
  """Read fragment-ion peaks from the current position up to "END IONS".

  Each peak line must start with two whitespace-separated numbers, m/z and
  intensity; any further columns are ignored. Spaces, tabs and repeated
  separators are all accepted, and blank lines are skipped. Peaks whose m/z
  is greater than `self.MZ_MAX` are dropped. The line containing "END IONS"
  is consumed.

  Args:
    input_file_handle: text handle positioned at the first peak line.

  Returns:
    A tuple `(mz_list, intensity_list)` of two equally long lists of floats,
    in file order.

  Raises:
    ValueError: if a peak line has fewer than two fields, a field is not a
      number, or the file ends before an "END IONS" line is found.
  """

  mz_list = []
  intensity_list = []
  while True:
    line = input_file_handle.readline()
    if not line:
      # readline() returns "" only at end of file: the spectrum is truncated
      raise ValueError("Error: wrong input END IONS")
    if "END IONS" in line:
      break
    fields = line.split()
    if not fields:
      # blank line between peaks
      continue
    mz, intensity = fields[:2]
    mz_float = float(mz)
    intensity_float = float(intensity)
    # skip an ion if its mass > MZ_MAX
    if mz_float > self.MZ_MAX:
      continue
    mz_list.append(mz_float)
    intensity_list.append(intensity_float)

  return mz_list, intensity_list