├── INSTALL ├── VERSION ├── identipy ├── __init__.py ├── adv.txt ├── customparser.py ├── cparser.pyx ├── main.py ├── default.cfg ├── identipy2pin.py ├── cutils.pyx ├── extras.py ├── cli.py ├── peptide_centric.py ├── scoring.py └── utils.py ├── act_payload.json ├── .gitignore ├── requirements.txt ├── pyproject.toml ├── test.sh ├── NOTICE ├── .github └── workflows │ └── publish.yml ├── setup.py ├── README.md └── LICENSE /INSTALL: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.3.9 2 | -------------------------------------------------------------------------------- /identipy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /act_payload.json: -------------------------------------------------------------------------------- 1 | { 2 | "push": { 3 | "ref": "v0.4.2" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **.pyc 2 | **.so 3 | identipy/*.c 4 | **/__pycache__ 5 | build/ 6 | *.egg-info/ 7 | test_data 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | pandas 4 | cython>=3.0a7 5 | lxml 6 | pyteomics 7 | pyteomics.cythonize 8 | -------------------------------------------------------------------------------- /identipy/adv.txt: -------------------------------------------------------------------------------- 1 | hillPeakFactorMinLength=800 2 | hillPeakFactor=4 3 | hillBatchSize=800 4 | hillValleyFactor=1.6 5 | hillMinLength=3 6 | 
hillNBoots=600 7 | maxBootSize=1200 8 | noHillSplit=True 9 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", 3 | "wheel", 4 | "Cython>=3.0a7", 5 | "numpy", 6 | "pyteomics.cythonize", 7 | ] 8 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | cd test_data 4 | identipy -cfg identipy.cfg -o . *.mgf 5 | pyteomics pepxml info *.pep.xml 6 | 7 | echo "Reference values (1% FDR):" 8 | echo 26718 9 | echo 17670 10 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | When using or redistributing IdentiPy, or parts of it, please cite the following paper: 2 | 3 | Levitsky, L. I., Ivanov, M. V, Lobas, A. A., Bubis, J. A., Tarasova, I. A., Solovyeva, E. M., 4 | Pridatchenko, M. L., Gorshkov, M. V. (2018). IdentiPy: An Extensible Search Engine 5 | for Protein Identification in Shotgun Proteomics. Journal of Proteome Research, 17(7), 2249–2255. 
6 | https://doi.org/10.1021/acs.jproteome.7b00640 7 | 8 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | jobs: 9 | publish: 10 | name: Publish for ${{ matrix.os }} 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [ubuntu-20.04, windows-2019, macos-11] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | - name: Build wheels 19 | uses: pypa/cibuildwheel@v2.16.2 20 | env: 21 | CIBW_SKIP: "cp36-* cp37-* *-win32 *_i686 pp*" 22 | - name: Upload binaries to release 23 | uses: svenstaro/upload-release-action@v2 24 | with: 25 | repo_token: ${{ secrets.GITHUB_TOKEN }} 26 | file: wheelhouse/*.whl 27 | tag: ${{ github.ref }} 28 | overwrite: false 29 | file_glob: true 30 | -------------------------------------------------------------------------------- /identipy/customparser.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import deque 3 | import itertools as it 4 | 5 | def cleave(sequence, rule, missed_cleavages=0, min_length=None): 6 | """Cleaves a polypeptide sequence using a given rule. 7 | 8 | Parameters 9 | ---------- 10 | sequence : str 11 | The sequence of a polypeptide. 12 | 13 | .. note:: 14 | The sequence is expected to be in one-letter uppercase notation. 15 | Otherwise, some of the cleavage rules in :py:data:`expasy_rules` 16 | will not work as expected. 17 | 18 | rule : str or compiled regex 19 | A regular expression describing the site of cleavage. It is recommended 20 | to design the regex so that it matches only the residue whose C-terminal 21 | bond is to be cleaved. All additional requirements should be specified 22 | using `lookaround assertions 23 | `_. 24 | :py:data:`expasy_rules` contains cleavage rules for popular cleavage agents. 
25 | missed_cleavages : int, optional 26 | Maximum number of allowed missed cleavages. Defaults to 0. 27 | min_length : int or None, optional 28 | Minimum peptide length. Defaults to :py:const:`None`. 29 | 30 | ..note :: 31 | This checks for string length, which is only correct for one-letter 32 | notation and not for full *modX*. Use :py:func:`length` manually if 33 | you know what you are doing and apply :py:func:`cleave` to *modX* 34 | sequences. 35 | 36 | Returns 37 | ------- 38 | out : set 39 | A set of unique (!) peptides. 40 | 41 | Examples 42 | -------- 43 | >>> cleave('AKAKBK', expasy_rules['trypsin'], 0) == {'AK', 'BK'} 44 | True 45 | >>> cleave('GKGKYKCK', expasy_rules['trypsin'], 2) == \ 46 | {'CK', 'GKYK', 'YKCK', 'GKGK', 'GKYKCK', 'GK', 'GKGKYK', 'YK'} 47 | True 48 | 49 | """ 50 | return set(_cleave(sequence, rule, missed_cleavages, min_length)) 51 | 52 | def _cleave(sequence, rule, missed_cleavages=0, min_length=None): 53 | """Like :py:func:`cleave`, but the result is a list. Refer to 54 | :py:func:`cleave` for explanation of parameters. 55 | """ 56 | # cdef list cleavage_sites 57 | peptides = [] 58 | ml = missed_cleavages+2 59 | trange = range(ml) 60 | cleavage_sites = deque([0], maxlen=ml) 61 | cl = 1 62 | for i in it.chain([x.end() for x in re.finditer(rule, sequence)], 63 | [None]): 64 | cleavage_sites.append(i) 65 | if cl < ml: 66 | cl += 1 67 | for j in trange[:cl-1]: 68 | seq = sequence[cleavage_sites[j]:cleavage_sites[-1]] 69 | if seq: 70 | if min_length is None or len(seq) >= min_length: 71 | peptides.append((seq, cleavage_sites[j])) 72 | return peptides -------------------------------------------------------------------------------- /identipy/cparser.pyx: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import deque 3 | import itertools as it 4 | 5 | def cleave(sequence, rule, missed_cleavages=0, min_length=None): 6 | """Cleaves a polypeptide sequence using a given rule. 
def _cleave(sequence, rule, missed_cleavages=0, min_length=None):
    """Like :py:func:`cleave`, but the result is a list. Refer to
    :py:func:`cleave` for explanation of parameters.
    """
    # C-typed locals for speed (this module is compiled with Cython).
    # NOTE: `i` is deliberately left untyped — it takes None for the final
    # sequence chunk, so `cdef int i` would break (see commented line below).
    cdef int ml, cl
    # cdef int i, cl
    cdef str seq
    cdef list peptides
    cdef list trange
    # cdef list cleavage_sites
    peptides = []
    # A peptide may span at most `missed_cleavages` internal cleavage sites,
    # so only the last `missed_cleavages + 2` site positions are kept.
    ml = missed_cleavages+2
    trange = list(range(ml))
    # Rolling window of recent cleavage positions; 0 is the sequence start.
    cleavage_sites = deque([0], maxlen=ml)
    cl = 1  # number of positions currently held in the deque
    # Ends of all rule matches are cleavage positions; the trailing None
    # marks the sequence end (used as an open slice bound below).
    for i in it.chain([x.end() for x in re.finditer(rule, sequence)],
                      [None]):
        cleavage_sites.append(i)
        if cl < ml:
            cl += 1
        # Emit every peptide ending at the newest position and starting at
        # one of the retained earlier positions.
        for j in trange[:cl-1]:
            seq = sequence[cleavage_sites[j]:cleavage_sites[-1]]
            if seq:
                if min_length is None or len(seq) >= min_length:
                    # Record the peptide together with its 0-based start offset.
                    peptides.append((seq, cleavage_sites[j]))
    return peptides
def settings(fname=None, default_name=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'default.cfg')):
    """Read a configuration file and return a :py:class:`RawConfigParser` object.

    Defaults are read from `default_name` first and then overridden by the
    user-supplied `fname`. A missing file is logged as an error but does not
    abort parsing. When the product accuracy unit is 'ppm', the value is
    additionally converted to an absolute tolerance using a reference m/z
    of 2000, and the original ppm value is kept under a separate key.
    """
    parser_kwargs = dict(dict_type=dict, allow_no_value=True)
    if sys.version_info.major == 3:
        # Inline comments are only supported by the Python 3 configparser.
        parser_kwargs['inline_comment_prefixes'] = ('#', ';')

    config = utils.CustomRawConfigParser(**parser_kwargs)

    # Read defaults first, then the user config, so that user values win.
    for message, path in (('Reading defaults from %s', default_name),
                          ('Reading config from %s', fname)):
        if not path:
            continue
        logger.info(message, path)
        if not os.path.isfile(path):
            logger.error('FILE NOT FOUND: %s', path)
        config.read(path)

    if config.get('search', 'product accuracy unit') == 'ppm':
        ppm_value = config.getfloat('search', 'product accuracy')
        config.set('search', 'product accuracy', ppm_value / 1e6 * 2000)
        config.set('search', 'product accuracy ppm', ppm_value)

    return config
def get_version():
    """Return the package version string.

    Prefers the output of ``git describe`` (with the leading 'v' and any
    trailing non-numeric component stripped); falls back to the first line
    of the VERSION file when git fails or is not installed.
    """
    try:
        version = subprocess.check_output(['git', 'describe']).strip().decode('ascii').replace('-', '.')
        if version[0] == 'v':
            version = version[1:]
        if '.' in version:
            # `git describe` on a non-tag commit appends '<n>-g<hash>';
            # after the '-' -> '.' replacement the last dotted component is
            # the non-numeric hash, which we strip.
            head, tail = version.rsplit('.', 1)
            if not tail.isdigit():
                version = head
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: not a git checkout / no tags to describe.
        # OSError (incl. FileNotFoundError): git binary is not installed.
        with open('VERSION') as f:
            version = f.readline().strip()
    return version


def make_extensions():
    """Build the list of C extension modules for setup().

    Cythonizes the .pyx sources when Cython is available; otherwise falls
    back to the pre-generated .c files. numpy and pyteomics.cythonize are
    required in both cases for include directories.

    Raises
    ------
    ImportError
        If numpy or pyteomics.cythonize is missing.
    """
    is_ci = bool(os.getenv("CI", ""))
    include_diagnostics = False  # flip to enable profiling/tracing builds
    try:
        import numpy
    except ImportError:
        print("C Extensions require `numpy`")
        raise
    try:
        from pyteomics import _capi
    except ImportError:
        print("C Extensions require `pyteomics.cythonize`")
        raise
    try:
        from Cython.Build import cythonize
        cython_directives = {
            'embedsignature': True,
            'profile': include_diagnostics,
            'language_level': sys.version_info.major
        }
        # NOTE(review): `macros` is collected but not currently passed to the
        # Extension objects (define_macros); kept for when tracing is wired up.
        macros = []
        if include_diagnostics:
            macros.append(("CYTHON_TRACE_NOGIL", "1"))
        if is_ci and include_diagnostics:
            cython_directives['linetrace'] = True

        extensions = cythonize([
            Extension(name='identipy.cparser', sources=['identipy/cparser.pyx']),
            Extension(name='identipy.cutils', sources=['identipy/cutils.pyx'],
                      include_dirs=[numpy.get_include(), _capi.get_include()])
        ], compiler_directives=cython_directives)
    except ImportError:
        # Cython is not installed: compile the shipped .c files directly.
        extensions = [
            Extension(name='identipy.cparser', sources=['identipy/cparser.c']),
            Extension(name='identipy.cutils', sources=['identipy/cutils.c'],
                      include_dirs=[numpy.get_include(), _capi.get_include()])

        ]
    return extensions
long_description = (''.join(open('README.md').readlines()) + '\n' 71 | + ''.join(open('INSTALL').readlines())), 72 | author = 'Lev Levitsky & Mark Ivanov', 73 | author_email = 'pyteomics@googlegroups.com', 74 | url = 'https://github.com/levitsky/identipy', 75 | packages = ['identipy', ], 76 | package_data = {'identipy': ['default.cfg', ]}, 77 | install_requires = [line.strip() for line in open('requirements.txt')], 78 | ext_modules = make_extensions() if cext else None, 79 | classifiers = ['Intended Audience :: Science/Research', 80 | 'Programming Language :: Python :: 2.7', 81 | 'Programming Language :: Python :: 3', 82 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 83 | 'Topic :: Software Development :: Libraries'], 84 | license = 'License :: OSI Approved :: Apache Software License', 85 | entry_points = {'console_scripts': ['identipy = identipy.cli:run', 86 | 'identipy2pin = identipy.identipy2pin:run']} 87 | ) 88 | 89 | 90 | try: 91 | do_setup(True) 92 | except Exception as err: 93 | print("*" * 60) 94 | print("Could not compile C Extensions due to %r, attempting pure Python installation." % (err,)) 95 | print("*" * 60) 96 | do_setup(False) 97 | print("Could not compile C Extensions due to %r, speedups are not enabled." % (err,)) 98 | -------------------------------------------------------------------------------- /identipy/default.cfg: -------------------------------------------------------------------------------- 1 | # Default configuration file for IdentiPy search enfine. 2 | # It will be used for everything that is missing in your custom config file. 3 | # Interpolation of values and line continuation are not supported. 4 | 5 | # Caution: comments on empty lines must start at the beginning of the line. 
6 | 7 | [search] 8 | precursor accuracy unit: ppm ; can be ppm or Th or Da 9 | # (the latter two mean the same, Th is correct) 10 | precursor accuracy left: 100 11 | precursor accuracy right: 100 12 | # any of the above two can be negative if you want to account for 13 | # systematic error 14 | product accuracy: 0.1 ; mass accuracy of fragments 15 | product accuracy unit: Da ; can be ppm or Da 16 | product minimum m/z: 100 17 | peptide maximum length: 60 18 | peptide minimum length: 6 19 | peptide maximum mass: 6000 20 | peptide minimum mass: 250 21 | enzyme: trypsin ; can be a name of enzyme from pyteomics.parser.expasy_rules or 22 | # can be written in X!Tandem enzyme style. Examples: 23 | # [RK]|{P} means cleave after R and K, but not before P 24 | # [X]|[D] means cleave before D 25 | # [RK]|{P},[M]|[X] means mix of trypsin and cnbr 26 | number of missed cleavages: 1 27 | semitryptic: 0 28 | maximum charge: 9 29 | minimum charge: 1 30 | maximum unknown charge: 0 31 | minimum unknown charge: 0 32 | precursor isotope mass error: 0 ; When the value for this parameter is not 0, 33 | ;the parent ion mass tolerance is expanded by opening up multiple tolerance windows centered on the given number of 13C isotope peaks for a peptide. 
34 | shifts: 0 ; example: 0, 16.000, 23.000, 12 35 | snp: 0 ; 1 means make SNP (point mutations) check for ALL peptides 36 | # use only for small protein databases because search time increases significantly 37 | clip N-terminal methionine: true 38 | rapid_check: 0 ; 1 means leave only 2000 random spectra for processing 39 | 40 | [modifications] 41 | # Examples: camC, oxM, acetyl-, -anyctermmodification 42 | # Must be written in lowercase 43 | fixed: camC 44 | variable: 45 | protein variable: 46 | maximum variable mods: 2 47 | protein nterm cleavage: 1.007825 48 | protein cterm cleavage: 17.002735 49 | p = 79.966331 50 | ox = 15.994915 51 | cam = 57.021464 52 | 53 | [output] 54 | format: pepXML ; can be pepxml or tsv 55 | # separator can be specified for csv/tsv format. Default is comma for csv and tab for tsv 56 | separator: 57 | 58 | # in case of label-based quantitation Identipy can write tags intensities in the output pepXML file. 59 | tags: 60 | # Can be tmt10plex, tmt6plex or custom format label1:mass1,label2:mass2... 61 | # Example for custom tmt6plex - tmt_126:126.12773,tmt_127N:127.12476... 62 | 63 | #path: 64 | candidates: 1 ; 0 means all sequence candidates 65 | score threshold: 0 66 | minimum matched: 4 ; minimum matched ions for reporting identification 67 | # higher value reduces analysis time 68 | show empty: no 69 | precursor accuracy unit: ; can be ppm or Th or Da 70 | # (the latter two mean the same, Th is correct) 71 | generated database: 72 | # with "add decoy", the filename of generated database can be specified for reuse e.g. 
in post-processing 73 | 74 | [input] 75 | database: 76 | add decoy: no 77 | # enable if your DB does not have decoy sequences 78 | # but you want to add them to the search 79 | decoy method: reverse ; one of 'reverse' or 'shuffle' 80 | decoy prefix: DECOY_ ; prefix for decoy protein description 81 | # if the decoy label is somewhere in the middle of the protein header, use this: 82 | decoy infix: 83 | deisotoping mass tolerance: 0.3 84 | deisotope: yes 85 | 86 | [scoring] 87 | score: identipy.scoring.RNHS 88 | # this can be 'identipy.scoring.RNHS2', 'identipy.scoring.RNHS', 89 | # 'identipy.scoring.hyperscore', 'identipy.scoring.morpheusscore' or a dot-delimited 90 | # name of a third-party function. It will be given a spectrum, 91 | # a sequence of a candidate and config. 92 | # score is supposed to be higher for better matches 93 | condition: 94 | # condition can be a path to a function (or a function object added dynamically 95 | # within your Python program) that will be called and given the same arguments 96 | # as the score function. If this function returns a falsy value, the candidate 97 | # is discarded.
98 | minimum peaks: 4 ; minimum number of peaks in fragmentation spectrum 99 | maximum peaks: 200 ; Getting only N peaks with max intensity from the fragmentation spectrum 100 | # set to 0 to disable 101 | dynamic range: 1000 ; affected by 'spectrum processor' 102 | # disregard all peaks that are less than 103 | # 1/x of the highest in the spectrum, where X is this value 104 | # 0 means no filtering 105 | e-values for candidates: 1 106 | maximum fragment charge: 1 107 | 108 | [optimization] 109 | increase precursor mass tolerance: yes 110 | 111 | [performance] 112 | processes: 0 ; 0 means auto 113 | out queue size: 40000 114 | pre-calculation: some 115 | folder: 116 | # where to store/look for precalculated files 117 | 118 | [misc] 119 | first stage: 120 | # use identipy.extras.optimization for auto-tune 121 | # if you want custom refinement, put your function name here 122 | # the function will be given the search results and expected 123 | # to return new settings for a second search. 124 | # The same technique is used for searching with variable mods. 
def calc_RT(seq, RC):
    """Predict retention time for peptide `seq` with achrom coefficients `RC`.

    Returns 0 when prediction fails (e.g. residues missing from `RC`), so
    that conversion of the whole file is never aborted by a single peptide.
    """
    try:
        return achrom.calculate_RT(seq, RC)
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still work.
        return 0

def is_decoy(proteins, decoy_prefix):
    """Return True if every protein name in `proteins` starts with `decoy_prefix`."""
    return all(z.startswith(decoy_prefix) for z in proteins)

def parse_mods(df_raw):
    """Count the modifications of one PSM row.

    `df_raw` must provide 'peptide' (sequence string) and 'modifications'
    (list of 'mass@position' strings; positions are 1-based, with 0 and
    len+1 denoting the N- and C-terminus). Each modification is reported as
    a 'mass shift <delta> at <residue>' label, where <delta> is the modified
    mass minus the unmodified residue (or terminal group) mass.

    Returns a :py:class:`collections.Counter` of labels; empty when
    'modifications' is not a list (e.g. NaN for unmodified peptides).
    """
    mods_counter = Counter()
    sequence, mods = df_raw['peptide'], df_raw['modifications']
    if isinstance(mods, list):
        for mod in mods:
            mod_mass, aa_ind = mod.split('@')
            mod_mass = float(mod_mass)
            aa_ind = int(aa_ind)
            if aa_ind == 0:
                aa = 'N_term'
                mod_mass = round(mod_mass - 1.007825, 3)  # subtract H
            elif aa_ind == len(sequence) + 1:
                aa = 'C_term'
                mod_mass = round(mod_mass - 17.002735, 3)  # subtract OH
            else:
                aa = sequence[aa_ind-1]
                mod_mass = round(mod_mass - mass.std_aa_mass[aa], 3)
            mod_name = 'mass shift %.3f at %s' % (mod_mass, aa)
            mods_counter[mod_name] += 1
    return mods_counter

def add_mod_info(df_raw, mod):
    """Return how often modification label `mod` occurs in this PSM row.

    Returns -1 when the modified residue does not occur in the peptide at
    all (the feature is inapplicable, as opposed to merely absent).
    """
    sequence, mods_counter = df_raw['peptide'], df_raw['mods_counter']
    mod_aa = mod.split(' at ')[1]
    if 'term' not in mod_aa and mod_aa not in sequence:
        return -1
    else:
        return mods_counter.get(mod, 0)

def prepare_mods(df):
    """Add one numeric column per modification label found in df['mods_counter']."""
    all_mods = set()
    for cnt in df['mods_counter'].values:
        for k in cnt.keys():
            all_mods.add(k)
    for mod in all_mods:
        df[mod] = df.apply(add_mod_info, axis=1, mod=mod)
    return df

def getlabel(decoy):
    """Percolator label: -1 for decoy PSMs, 1 for targets."""
    return -1 if decoy else 1
decoy_prefix='DECOY_', use_rt=1): 59 | df1 = pepxml.DataFrame(infile_path, read_schema=False) 60 | df1['length'] = df1['peptide'].apply(len) 61 | try: 62 | df1['y-b_ions'] = df1['matched_y1_ions'] - df1['matched_b1_ions'] 63 | except: 64 | pass 65 | df1 = df1[df1['length'] >= 6] 66 | df1['spectrum'] = df1['spectrum'].apply(lambda x: x.split(' RTINSECONDS')[0]) 67 | df1['massdiff_int'] = df1['massdiff'].apply(lambda x: int(round(x, 0))) 68 | df1['massdiff_ppm'] = 1e6 * df1['massdiff'] / df1['calc_neutral_pep_mass'] 69 | df1['decoy'] = df1['protein'].apply(is_decoy, decoy_prefix=decoy_prefix) 70 | df1['mods_counter'] = df1.apply(parse_mods, axis=1) 71 | df1 = prepare_mods(df1) 72 | 73 | if use_rt: 74 | try: 75 | df1['RT exp'] = df1['retention_time_sec'] / 60 76 | df1 = df1.drop(['retention_time_sec', ], axis=1) 77 | df1_f = aux.filter(df1, fdr=0.01, key='expect', is_decoy='decoy', correction=1) 78 | print('Default target-decoy filtering, 1%% PSM FDR: Number of target PSMs = %d' \ 79 | % (df1_f[~df1_f['decoy']].shape[0])) 80 | print('Calibrating retention model...') 81 | retention_coefficients = achrom.get_RCs_vary_lcp(df1_f['peptide'].values, \ 82 | df1_f['RT exp'].values) 83 | df1_f['RT pred'] = df1_f['peptide'].apply(lambda x: calc_RT(x, retention_coefficients)) 84 | df1['RT pred'] = df1['peptide'].apply(lambda x: calc_RT(x, retention_coefficients)) 85 | _, _, r_value, std_value = aux.linear_regression(df1_f['RT pred'], df1_f['RT exp']) 86 | print('R^2 = %f , std = %f' % (r_value**2, std_value)) 87 | df1['RT diff'] = df1['RT pred'] - df1['RT exp'] 88 | except: 89 | pass 90 | 91 | df1['Label'] = df1['decoy'].apply(getlabel) 92 | df1['SpecId'] = df1['index'] + 1 93 | df1['ScanNr'] = df1['index'] + 1 94 | try: 95 | prev_aa = df1['peptide_prev_aa'][0] 96 | next_aa = df1['peptide_next_aa'][0] 97 | df1['Peptide'] = df1['peptide'].apply(lambda x: prev_aa + '.' + x + '.' + next_aa) 98 | except: 99 | df1['Peptide'] = df1['peptide'].apply(lambda x: 'K.' 
def get_features(dataframe):
    """Select the columns of `dataframe` to be used as Percolator features.

    A fixed whitelist of search-engine scores/metrics is kept, plus any
    'mass shift ...' columns produced by :py:func:`prepare_mods`; all other
    columns are dropped. Returns a pandas Index of the retained names, in
    their original order.
    """
    kept = {
        'expect', 'hyperscore', 'calc_neutral_pep_mass', 'bscore', 'yscore',
        'massdiff', 'massdiff_ppm', 'RT pred', 'RT diff',
        'sumI', 'RT exp', 'precursor_neutral_mass', 'massdiff_int',
        'num_missed_cleavages', 'tot_num_ions', 'num_matched_ions', 'length',
        'SpecId', 'Label', 'ScanNr', 'Peptide', 'Proteins',
        'matched_y1_ions', 'matched_b1_ions', 'y-b_ions', 'fragmentMT',
    }
    to_drop = [name for name in dataframe.columns
               if name not in kept and not name.startswith('mass shift')]
    return dataframe.columns.drop(to_drop)
1 or 0', default=1, type=int) 137 | 138 | 139 | args = vars(parser.parse_args()) 140 | infile = args['file'] 141 | prefix = args['prefix'] 142 | use_rt = args['rt'] 143 | out = args['out'] 144 | if out: 145 | outfile = out 146 | else: 147 | outfile = infile.replace('.pep.xml', '.pin') 148 | df1 = prepare_dataframe(infile, decoy_prefix=prefix, use_rt=use_rt) 149 | df00 = df1[get_features(df1)] 150 | df00_col = list(df00.columns.values) 151 | df00_col.remove('SpecId') 152 | df00_col.remove('Label') 153 | df00_col.remove('ScanNr') 154 | df00_col.remove('Peptide') 155 | df00_col.remove('Proteins') 156 | 157 | df00_col.insert(0, 'ScanNr') 158 | df00_col.insert(0, 'Label') 159 | df00_col.insert(0, 'SpecId') 160 | df00_col.append('Peptide') 161 | df00_col.append('Proteins') 162 | 163 | dft = df00.reindex(columns=df00_col) 164 | dft['Proteins'] = dft['Proteins'].apply(lambda x: 'proteinsplittmp'.join(x)) 165 | dft.to_csv(path_or_buf=outfile, index=False, sep='\t') 166 | with open(outfile, 'r') as f : 167 | lines = list(f.readlines()) 168 | outf = open(outfile, 'w') 169 | for l in lines: 170 | tmp = l.split('\t') 171 | outf.write('\t'.join(tmp[:-1]) + '\t' + '\t'.join(tmp[-1].split('proteinsplittmp'))) 172 | outf.close() 173 | 174 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **IdentiPy** is a search engine for bottom-up proteomics written in Python. 2 | 3 | # Citation # 4 | 5 | IdentiPy is described in this JPR paper: http://dx.doi.org/10.1021/acs.jproteome.7b00640 6 | 7 | Please cite it when using IdentiPy or its parts. 8 | 9 | # License # 10 | 11 | IdentiPy is published under the Apache 2.0 license. 12 | 13 | # How to install # 14 | 15 | ``` 16 | pip install git+https://github.com/levitsky/identipy.git 17 | ``` 18 | or: 19 | 20 | ``` 21 | $ git clone https://github.com/levitsky/identipy 22 | $ cd identipy 23 | $ pip install . 
24 | 25 | ``` 26 | 27 | # Requirements # 28 | 29 | See [requirements.txt](requirements.txt). Key dependencies are: 30 | 31 | - Python 32 | - scipy 33 | - pyteomics 34 | - lxml 35 | - cython 36 | - pyteomics.cythonize 37 | 38 | # How to use # 39 | 40 | ## GUI way ## 41 | 42 | You can separately install a web-based GUI for IdentiPy, [IdentiPy Server](https://github.com/levitsky/identipy_server). 43 | Please refer to the linked page for system requirements and installation instructions. 44 | 45 | ## CLI way ## 46 | 47 | A typical command to process a file would look like this: 48 | 49 | ``` 50 | $ identipy -cfg my.cfg spectra.mgf 51 | ``` 52 | 53 | Here, `my.cfg` is a settings file specifying the search parameters. Allowed parameters and their default values are listed in the 54 | [default configuration file](identipy/default.cfg). 55 | Settings not specified in `my.cfg` will be taken from the default file. 56 | 57 | Search settings can also be overriden using command-line options. 58 | 59 | For help on command-line usage, run: 60 | 61 | ``` 62 | $ identipy --help 63 | ``` 64 | 65 | You will see a message like this: 66 | 67 | ``` 68 | $ identipy --help 69 | usage: identipy [-h] [-db FASTA] [-cfg CONFIG_FILE] [-out PATH] [-of FORMAT] 70 | [-sep SEP] [-at] [-nopwide] [-punit UNIT] [-ptol VALUE] 71 | [-lptol VALUE] [-rptol VALUE] [-funit UNIT] [-ftol VALUE] 72 | [-fminmz VALUE] [-lmin N] [-lmax N] [-massmin VALUE] 73 | [-massmax VALUE] [-e RULE] [-mc N] [-semi] [-noclip] [-cmin N] 74 | [-cmax N] [-cumin N] [-cumax N] [-ime N] [-shifts SHIFTS] 75 | [-snp SNP] [-rapid] [-mm N] [-ad] [-prefix PREFIX] 76 | [-infix INFIX] [-method {reverse,shuffle}] [-deis] 77 | [-deistol DEISTOL] 78 | [-score {RNHS2,RNHS,hyperscore,morpheusscore}] [-minp N] 79 | [-maxp N] [-dyn DYN] [-mfc N] [-nproc N] [-maxmods N] 80 | [-ncleave NCLEAVE] [-ccleave CCLEAVE] [-fmods FMODS] 81 | [-vmods VMODS] [-pmods PMODS] [-tags TAGS] [-debug] 82 | [-dino DINO] [-dinoargs [DINOARGS ...]] [-demixing] [-pif] 
83 | file 84 | 85 | Search proteins using LC-MS/MS spectra 86 | 87 | positional arguments: 88 | file input .mzML or .mgf file with MS/MS spectra 89 | 90 | options: 91 | -h, --help show this help message and exit 92 | -db FASTA path to protein FASTA file 93 | -cfg CONFIG_FILE path to file with parameters 94 | -out PATH, -o PATH output path 95 | -of FORMAT output format 96 | -sep SEP output column separator (for table format) 97 | -at Use auto-tuning of search parameters 98 | -nopwide Do not increase initial precursor mass accuracy for 99 | auto-tuning 100 | -punit UNIT precursor mass tolerance unit 101 | -ptol VALUE precursor mass tolerance 102 | -lptol VALUE *left precursor mass tolerance 103 | -rptol VALUE *right precursor mass tolerance 104 | -funit UNIT fragment mass tolerance unit 105 | -ftol VALUE fragment mass tolerance 106 | -fminmz VALUE fragment min m/z 107 | -lmin N min length of peptides 108 | -lmax N max length of peptides 109 | -massmin VALUE min mass of peptides 110 | -massmax VALUE max mass of peptides 111 | -e RULE cleavage rule in quotes!. X!Tandem style for cleavage 112 | rules 113 | -mc N number of missed cleavages 114 | -semi include semitryptic peptides 115 | -noclip Disable clipping of N-terminal methionine 116 | -cmin N min precursor charge 117 | -cmax N max precursor charge 118 | -cumin N min unknown precursor charge 119 | -cumax N max unknown precursor charge 120 | -ime N precursor isotope mass error. The parent ion mass 121 | tolerance is expanded by opening up multiple tolerance 122 | windows centered on the given number of 13C isotope 123 | peaks for a peptide. 124 | -shifts SHIFTS shifts. 
example: 0,16.000,23.000,12 125 | -snp SNP 1 means make SNP changes for ALL peptides 126 | -rapid leave only 2000 random spectra for processing 127 | -mm N number of minimum matched ions 128 | -ad add decoy 129 | -prefix PREFIX decoy prefix 130 | -infix INFIX decoy infix 131 | -method {reverse,shuffle} 132 | decoy method; reverse or shuffle 133 | -deis use MS/MS deisotoping 134 | -deistol DEISTOL deisotope mass accuracy 135 | -score {RNHS2,RNHS,hyperscore,morpheusscore} 136 | used scoring function 137 | -minp N minumum peaks in MS/MS spectra 138 | -maxp N maximum peaks in MS/MS spectra 139 | -dyn DYN dynamic range 140 | -mfc N maximum fragment charge 141 | -nproc N number of processes. 0 means auto 142 | -maxmods N maximum variable mods per sequence 143 | -ncleave NCLEAVE protein nterm cleavage 144 | -ccleave CCLEAVE protein cterm cleavage 145 | -fmods FMODS fixed modifications. Format: 146 | mass1@aminoacid1,mass2@aminoacid2 147 | -vmods VMODS variable modifications. Format: 148 | mass1@aminoacid1,mass2@aminoacid2 149 | -pmods PMODS variable protein terminal modifications 150 | -tags TAGS Add quantitation tags to the pepXML output. Can be 151 | tmt10plex, tmt6plex, tmt11plex or custom format 152 | label1:mass1,label2:mass2... 153 | -debug Print debugging messages 154 | -dino DINO path to Dinosaur JAR file or Biosaur executable. Used 155 | for chimeric spectrum processing and MS1 Intensity 156 | calculation 157 | -dinoargs [DINOARGS ...] 158 | extra arguments to Dinosaur or Biosaur. 
159 | -demixing Use demixing 160 | -pif Calculate PIF 161 | 162 | Example usage 163 | ------------- 164 | $ identipy input.mgf -db human.fasta 165 | ------------- 166 | 167 | ``` 168 | 169 | 170 | # Related projects # 171 | 172 | - Pyteomics: https://github.com/levitsky/pyteomics 173 | 174 | - pyteomics.cythonize: https://github.com/mobiusklein/pyteomics.cythonize 175 | 176 | - Scavager: https://github.com/markmipt/scavager 177 | 178 | - IdentiPy Server: https://github.com/levitsky/identipy_server 179 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | IdentiPy is distributed under the conditions of the 2 | Apache License, Version 2.0: 3 | http://www.opensource.org/licenses/Apache-2.0 4 | 5 | Copyright (c) 2018, Lev Levitsky & Mark Ivanov 6 | 7 | Apache License, Version 2.0 8 | Apache License 9 | Version 2.0, January 2004 10 | http://www.apache.org/licenses/ 11 | 12 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 13 | 14 | 1. Definitions. 15 | 16 | "License" shall mean the terms and conditions for use, reproduction, and 17 | distribution as defined by Sections 1 through 9 of this document. 18 | "Licensor" shall mean the copyright owner or entity authorized by the copyright 19 | owner that is granting the License. 20 | 21 | "Legal Entity" shall mean the union of the acting entity and all other entities 22 | that control, are controlled by, or are under common control with that entity. 23 | For the purposes of this definition, "control" means (i) the power, direct or 24 | indirect, to cause the direction or management of such entity, whether by 25 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 26 | outstanding shares, or (iii) beneficial ownership of such entity. 27 | "You" (or "Your") shall mean an individual or Legal Entity exercising 28 | permissions granted by this License. 
29 | 30 | "Source" form shall mean the preferred form for making modifications, including 31 | but not limited to software source code, documentation source, and configuration 32 | files. 33 | 34 | "Object" form shall mean any form resulting from mechanical transformation or 35 | translation of a Source form, including but not limited to compiled object code, 36 | generated documentation, and conversions to other media types. 37 | 38 | "Work" shall mean the work of authorship, whether in Source or Object form, made 39 | available under the License, as indicated by a copyright notice that is included 40 | in or attached to the work (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object form, that 43 | is based on (or derived from) the Work and for which the editorial revisions, 44 | annotations, elaborations, or other modifications represent, as a whole, an 45 | original work of authorship. For the purposes of this License, Derivative Works 46 | shall not include works that remain separable from, or merely link (or bind by 47 | name) to the interfaces of, the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including the original version 50 | of the Work and any modifications or additions to that Work or Derivative Works 51 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 52 | by the copyright owner or by an individual or Legal Entity authorized to submit 53 | on behalf of the copyright owner. 
For the purposes of this definition, 54 | "submitted" means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, and 57 | issue tracking systems that are managed by, or on behalf of, the Licensor for 58 | the purpose of discussing and improving the Work, but excluding communication 59 | that is conspicuously marked or otherwise designated in writing by the copyright 60 | owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf 63 | of whom a Contribution has been received by Licensor and subsequently 64 | incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 67 | 68 | Subject to the terms and conditions of this License, each Contributor hereby 69 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 70 | irrevocable copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the Work and such 72 | Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 75 | 76 | Subject to the terms and conditions of this License, each Contributor hereby 77 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 78 | irrevocable (except as stated in this section) patent license to make, have 79 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where 80 | such license applies only to those patent claims licensable by such Contributor 81 | that are necessarily infringed by their Contribution(s) alone or by combination 82 | of their Contribution(s) with the Work to which such Contribution(s) was 83 | submitted. 
If You institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 85 | Contribution incorporated within the Work constitutes direct or contributory 86 | patent infringement, then any patent licenses granted to You under this License 87 | for that Work shall terminate as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 90 | 91 | You may reproduce and distribute copies of the Work or Derivative Works thereof 92 | in any medium, with or without modifications, and in Source or Object form, 93 | provided that You meet the following conditions: 94 | 95 | You must give any other recipients of the Work or Derivative Works a copy of 96 | this License; and 97 | You must cause any modified files to carry prominent notices stating that You 98 | changed the files; and 99 | You must retain, in the Source form of any Derivative Works that You distribute, 100 | all copyright, patent, trademark, and attribution notices from the Source form 101 | of the Work, excluding those notices that do not pertain to any part of the 102 | Derivative Works; and 103 | If the Work includes a "NOTICE" text file as part of its distribution, then any 104 | Derivative Works that You distribute must include a readable copy of the 105 | attribution notices contained within such NOTICE file, excluding those notices 106 | that do not pertain to any part of the Derivative Works, in at least one of the 107 | following places: within a NOTICE text file distributed as part of the 108 | Derivative Works; within the Source form or documentation, if provided along 109 | with the Derivative Works; or, within a display generated by the Derivative 110 | Works, if and wherever such third-party notices normally appear. The contents of 111 | the NOTICE file are for informational purposes only and do not modify the 112 | License. 
You may add Your own attribution notices within Derivative Works that 113 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 114 | provided that such additional attribution notices cannot be construed as 115 | modifying the License. 116 | 117 | You may add Your own copyright statement to Your modifications and may provide 118 | additional or different license terms and conditions for use, reproduction, or 119 | distribution of Your modifications, or for any such Derivative Works as a whole, 120 | provided Your use, reproduction, and distribution of the Work otherwise complies 121 | with the conditions stated in this License. 122 | 123 | 5. Submission of Contributions. 124 | 125 | Unless You explicitly state otherwise, any Contribution intentionally submitted 126 | for inclusion in the Work by You to the Licensor shall be under the terms and 127 | conditions of this License, without any additional terms or conditions. 128 | Notwithstanding the above, nothing herein shall supersede or modify the terms of 129 | any separate license agreement you may have executed with Licensor regarding 130 | such Contributions. 131 | 132 | 6. Trademarks. 133 | 134 | This License does not grant permission to use the trade names, trademarks, 135 | service marks, or product names of the Licensor, except as required for 136 | reasonable and customary use in describing the origin of the Work and 137 | reproducing the content of the NOTICE file. 138 | 139 | 7. Disclaimer of Warranty. 140 | 141 | Unless required by applicable law or agreed to in writing, Licensor provides the 142 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, 143 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 144 | including, without limitation, any warranties or conditions of TITLE, 145 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. 
You are 146 | solely responsible for determining the appropriateness of using or 147 | redistributing the Work and assume any risks associated with Your exercise of 148 | permissions under this License. 149 | 150 | 8. Limitation of Liability. 151 | 152 | In no event and under no legal theory, whether in tort (including negligence), 153 | contract, or otherwise, unless required by applicable law (such as deliberate 154 | and grossly negligent acts) or agreed to in writing, shall any Contributor be 155 | liable to You for damages, including any direct, indirect, special, incidental, 156 | or consequential damages of any character arising as a result of this License or 157 | out of the use or inability to use the Work (including but not limited to 158 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or 159 | any and all other commercial damages or losses), even if such Contributor has 160 | been advised of the possibility of such damages. 161 | 162 | 9. Accepting Warranty or Additional Liability. 163 | 164 | While redistributing the Work or Derivative Works thereof, You may choose to 165 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or 166 | other liability obligations and/or rights consistent with this License. However, 167 | in accepting such obligations, You may act only on Your own behalf and on Your 168 | sole responsibility, not on behalf of any other Contributor, and only if You 169 | agree to indemnify, defend, and hold each Contributor harmless for any liability 170 | incurred by, or claims asserted against, such Contributor by reason of your 171 | accepting any such warranty or additional liability. 
END OF TERMS AND CONDITIONS
-------------------------------------------------------------------------------- /identipy/cutils.pyx: --------------------------------------------------------------------------------
# Cython helpers: fast fragment-ion matching (RNHS scores) and theoretical
# spectrum generation for IdentiPy.
cimport cython
from cpython.sequence cimport PySequence_GetSlice
from cpython.dict cimport PyDict_GetItem, PyDict_SetItem
from cpython.float cimport PyFloat_AsDouble
from cpython.tuple cimport PyTuple_GetItem

from pyteomics import cmass
from math import factorial

cimport pyteomics.cmass as cmass

from pyteomics import electrochem as ec
import numpy as np
cimport numpy as np

np.import_array()


cdef:
    dict std_aa_mass = cmass.std_aa_mass
    dict std_ion_comp = cmass.std_ion_comp
    dict nist_mass = cmass._nist_mass

cdef dict ion_shift_dict

# Neutral-mass offset (Da) of each fragment ion series relative to the peptide
# neutral mass; consumed by calc_ions_from_neutral_mass below.
ion_shift_dict = {
    'a': 46.00547930326002,
    'b': 18.010564683699954,
    'c': 0.984015582689949,
    'x': -25.979264555419945,
    'y': 0.0,
    'z': 17.026549101010005,
}


@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(True)
def RNHS_ultrafast(dict cur_idict, dict theoretical_set, int min_matched, dict best_res, set allowed_idx, int max_v):
    """Pre-screen candidate peptides: return the subset of allowed_idx whose
    b/y ion match counts could beat the stored best result, or None if the
    spectrum cannot reach min_matched matches at all.

    cur_idict maps binned fragment m/z -> iterable of candidate indices.
    max_v is kept for interface compatibility (unused here).
    """
    cdef int total_matched, num_b_ions, num_y_ions
    cdef dict cnt_b, cnt_y
    cdef set out
    cdef float best_res_val

    if not cur_idict:
        return None

    total_matched = 0

    cnt_b = dict()
    cnt_y = dict()

    for ion in theoretical_set['b']:
        if ion in cur_idict:
            for xx in cur_idict[ion]:
                if xx not in cnt_b:
                    cnt_b[xx] = 1
                else:
                    cnt_b[xx] += 1
            # one matched theoretical b-ion bin for this spectrum
            total_matched += 1

    for ion in theoretical_set['y']:
        if ion in cur_idict:
            for xx in cur_idict[ion]:
                if xx not in cnt_y:
                    cnt_y[xx] = 1
                else:
                    cnt_y[xx] += 1
            total_matched += 1

    if total_matched < min_matched:
        return None

    out = set()
    for k in allowed_idx:
        num_b_ions = 0
        num_y_ions = 0
        if k in cnt_b:
            num_b_ions = cnt_b[k]
        if k in cnt_y:
            num_y_ions = cnt_y[k]
        if num_b_ions + num_y_ions >= min_matched:
            best_res_val = best_res.get(k, 0)
            # scores are stored negated: smaller (more negative) is better
            if not best_res_val or -factorial(num_b_ions) * factorial(num_y_ions) <= best_res_val:
                out.add(k)
    return out


@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(True)
def RNHS_fast_old(set spectrum_fastset, dict spectrum_idict , dict theoretical_set, int min_matched):
    """Classic RNHS: (matched count, b!*y!*sum of matched intensities).
    Returns (0, 0) when fewer than min_matched fragments match."""
    cdef int matched_approx_b, matched_approx_y, matched_approx
    cdef set matched_b, matched_y
    cdef float isum
    isum = 0
    matched_b = spectrum_fastset.intersection(theoretical_set['b'])
    matched_y = spectrum_fastset.intersection(theoretical_set['y'])
    matched_approx_b = len(matched_b)
    matched_approx_y = len(matched_y)
    matched_approx = matched_approx_b + matched_approx_y
    if matched_approx >= min_matched:
        for fr in matched_b:
            isum += spectrum_idict[fr]
        for fr in matched_y:
            isum += spectrum_idict[fr]
        return matched_approx, factorial(matched_approx_b) * factorial(matched_approx_y) * isum
    else:
        return 0, 0


@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(True)
def RNHS_fast(set spectrum_fastset, dict spectrum_idict , dict theoretical_set, int min_matched, dict rank_map):
    """Rank-weighted RNHS over all ion series in theoretical_set.

    spectrum_idict maps a matched fragment bin to its intensity rank;
    rank_map maps rank -> {ion series: weight}, with rank_map['m'] as the
    fallback weight for ranks absent from the map.
    """
    cdef int matched_approx
    cdef float isum
    cdef list all_matched
    isum = 0

    all_matched = []
    for ion in theoretical_set:
        matched_tmp = spectrum_fastset.intersection(theoretical_set[ion])
        all_matched.append((ion, matched_tmp))
    # BUGFIX: all_matched holds (ion, set) pairs, so the original
    # sum(len(z) for z in all_matched) counted 2 per series (tuple length)
    # instead of the number of matched fragments.
    matched_approx = sum(len(m) for _, m in all_matched)
    if matched_approx >= min_matched:
        for ion, matched_tmp in all_matched:
            for fr in matched_tmp:
                i_rank = spectrum_idict[fr]
                if i_rank in rank_map:
                    tmp_d = rank_map[i_rank]
                    if ion in tmp_d:
                        isum += tmp_d[ion]
                else:
                    # NOTE(review): indentation lost in source dump; this
                    # fallback is paired with the rank lookup ("rank not
                    # tabulated -> use mean weight 'm'") -- confirm intent.
                    isum += rank_map['m']
        return matched_approx, isum
    else:
        return 0, 0


@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(True)
def RNHS_fast_basic(set spectrum_fastset, dict spectrum_idict , dict theoretical_set, int min_matched):
    """RNHS over arbitrary ion series: sum of matched intensities multiplied
    by the factorial of the match count of every series."""
    cdef int matched_approx
    cdef float isum
    cdef list all_matched
    isum = 0

    all_matched = []
    for ion in theoretical_set:
        matched_tmp = spectrum_fastset.intersection(theoretical_set[ion])
        all_matched.append(matched_tmp)
    matched_approx = sum(len(z) for z in all_matched)
    if matched_approx >= min_matched:
        for matched_tmp in all_matched:
            for fr in matched_tmp:
                isum += spectrum_idict[fr]
        for matched_tmp in all_matched:
            isum *= factorial(len(matched_tmp))
        return matched_approx, isum
    else:
        return 0, 0


@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(True)
cdef float calc_ions_from_neutral_mass(str peptide, float nm, str ion_type, int charge, dict aa_mass, float cterm_mass, float nterm_mass):
    """m/z of the longest fragment of the given series/charge, derived from
    the peptide neutral mass nm via the series offset in ion_shift_dict."""
    cdef float nmi
    if ion_type in 'abc':
        nmi = nm - aa_mass[peptide[-1]] - ion_shift_dict[ion_type] - (cterm_mass - 17.002735)
    else:
        nmi = nm - aa_mass[peptide[0]] - ion_shift_dict[ion_type] - (nterm_mass - 1.007825)
    return (nmi + 1.0072764667700085 * charge) / charge


@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(True)
cdef list get_n_ions(str peptide, float maxmass, int pl, int charge, dict k_aa_mass):
    """N-terminal series: successively strip C-terminal residues from maxmass."""
    cdef int i
    cdef list tmp
    tmp = [maxmass, ]
    # BUGFIX: was xrange, which is undefined under Cython 3 default
    # language_level=3 (requirements pin cython>=3.0a7)
    for i in range(1, pl):
        tmp.append(tmp[-1] - k_aa_mass[peptide[-i-1]]/charge)
    return tmp


@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(True)
cdef list get_c_ions(str peptide, float maxmass, int pl, int charge, dict k_aa_mass):
    """C-terminal series: successively strip N-terminal residues from maxmass."""
    cdef int i
    cdef list tmp
    tmp = [maxmass, ]
    # BUGFIX: was xrange (see get_n_ions)
    for i in range(pl-2, -1, -1):
        tmp.append(tmp[-1] - k_aa_mass[peptide[-(i+2)]]/charge)
    return tmp


@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(True)
cdef tuple ctheor_spectrum(str peptide, double acc_frag, double nterm_mass, double cterm_mass, tuple types,
                           int maxcharge, bint reshape, dict kwargs):
    """Build (peaks, theoretical_set): per-(series, charge) fragment m/z arrays
    and the same values binned by acc_frag for fast set matching."""
    cdef int pl, charge, i, n, i_type, n_types
    cdef bint nterminal
    cdef str ion_type
    cdef float maxmass, nm
    cdef dict peaks, theoretical_set
    cdef dict aa_mass, ion_comp, mass_data
    cdef list theoretical_set_item
    cdef list ions_scaled, marr
    cdef object marr_storage, nm_obj

    aa_mass = kwargs.get("aa_mass")
    if aa_mass is None:
        aa_mass = std_aa_mass
    ion_comp = kwargs.get("ion_comp")
    if ion_comp is None:
        ion_comp = std_ion_comp
    mass_data = kwargs.get("mass_data")
    if mass_data is None:
        mass_data = nist_mass
    # BUGFIX: original assigned kwargs.get("nm") (possibly None) directly into
    # a cdef float, which raises TypeError when 'nm' is absent.
    nm_obj = kwargs.get("nm")
    if nm_obj is None:
        nm = cmass.fast_mass(peptide, **kwargs) + (nterm_mass - 1.007825) + (cterm_mass - 17.002735)
    else:
        nm = PyFloat_AsDouble(nm_obj)
    peaks = {}
    theoretical_set = dict()

    pl = len(peptide) - 1
    n_types = len(types)
    for charge in range(1, maxcharge + 1):
        for i_type in range(n_types):
            ion_type = PyTuple_GetItem(types, i_type)
            nterminal = ion_type[0] in 'abc'
            # BUGFIX: use the defaulted local aa_mass instead of
            # kwargs['aa_mass'] (KeyError when the caller relied on defaults)
            maxmass = calc_ions_from_neutral_mass(peptide, nm, ion_type=ion_type, charge=charge,
                                                  aa_mass=aa_mass, cterm_mass=cterm_mass, nterm_mass=nterm_mass)
            if nterminal:
                marr = get_n_ions(peptide, maxmass, pl, charge, aa_mass)
            else:
                marr = get_c_ions(peptide, maxmass, pl, charge, aa_mass)

            iname = (ion_type, charge)
            ions_scaled = [(x / acc_frag) for x in marr]
            if iname in theoretical_set:
                theoretical_set_item = PyDict_GetItem(theoretical_set, iname)
                theoretical_set_item.extend(ions_scaled)
            else:
                theoretical_set[iname] = ions_scaled

            if reshape:
                # column vector form for vectorized tolerance matching
                marr_storage = np.array(marr)
                marr_storage.sort()
                n = marr_storage.size
                marr_storage = marr_storage.reshape((n, 1))
                peaks[iname] = marr_storage
            else:
                peaks[iname] = sorted(marr)
    return peaks, theoretical_set


def theor_spectrum(peptide, acc_frag, nterm_mass, cterm_mass, types=('b', 'y'), maxcharge=None, reshape=False, **kwargs):
    """Public wrapper: derive maxcharge from predicted peptide charge at pH 2
    when not given, then delegate to the C implementation."""
    if not maxcharge:
        maxcharge = 1 + int(ec.charge(peptide, pH=2))
    return ctheor_spectrum(peptide, acc_frag, nterm_mass, cterm_mass, tuple(types), maxcharge, reshape, kwargs)

-------------------------------------------------------------------------------- /identipy/extras.py: --------------------------------------------------------------------------------
from scipy.stats import percentileofscore, scoreatpercentile
from scipy.optimize import curve_fit
from pyteomics import achrom, auxiliary as aux, parser, mass
from collections import Counter, defaultdict
from .main import *
from .scoring import get_fragment_mass_tol, get_fragment_mass_tol_ppm
import logging
logger = logging.getLogger(__name__)
import numpy as np
from .utils import get_info, get_aa_mass, get_enzyme, calculate_RT, get_title
try:
    from
pyteomics import cmass 13 | except ImportError: 14 | cmass = mass 15 | from scipy.stats import rankdata 16 | from copy import deepcopy 17 | from scipy.optimize import curve_fit 18 | 19 | def FDbinSize(X): 20 | """Calculates the Freedman-Diaconis bin size for 21 | a data set for use in making a histogram 22 | Arguments: 23 | X: 1D Data set 24 | Returns: 25 | h: F-D bin size 26 | """ 27 | X = np.sort(X) 28 | upperQuartile = scoreatpercentile(X, 75) 29 | lowerQuartile = scoreatpercentile(X, 25) 30 | IQR = upperQuartile - lowerQuartile 31 | h = 2. * IQR / len(X) ** (1. / 3.) 32 | return h 33 | 34 | def get_peptides_subset(results): 35 | tmp_dict = dict() 36 | 37 | massdif = np.array([res['candidates'][0][4]['mzdiff']['ppm'] for res in results]) 38 | 39 | for result in results: 40 | r_spectrum = get_title(result['spectrum']) 41 | r_sequence = str(result['candidates'][0][1]) 42 | r_mass_diff_abs = abs(result['candidates'][0][4]['mzdiff']['ppm']) 43 | if r_sequence not in tmp_dict or r_mass_diff_abs < tmp_dict[r_sequence][1]: 44 | tmp_dict[r_sequence] = (r_spectrum, r_mass_diff_abs) 45 | # print(r_spectrum) 46 | 47 | new_results = [] 48 | for result in results: 49 | r_spectrum = get_title(result['spectrum']) 50 | r_sequence = str(result['candidates'][0][1]) 51 | if r_spectrum == tmp_dict[r_sequence][0]: 52 | new_results.append(result) 53 | return new_results 54 | 55 | def get_subset(results, settings, fdr=0.01): 56 | """Filter results to given FDR using top 1 candidates""" 57 | subset = aux.filter(results, key=lambda x: x['e-values'][0], 58 | is_decoy = lambda x: x['candidates'][0][2] == 'd', 59 | fdr=fdr) 60 | return subset 61 | 62 | def optimization(fname, settings): 63 | settings = settings.copy() 64 | settings.set('misc', 'first stage', '') 65 | efc = settings.get('scoring', 'e-values for candidates') 66 | settings.set('scoring', 'e-values for candidates', 1) 67 | left = settings.getfloat('search', 'precursor accuracy left') 68 | right = settings.getfloat('search', 
'precursor accuracy right') 69 | wide = settings.getboolean('optimization', 'increase precursor mass tolerance') 70 | if settings.get('search', 'precursor accuracy unit') != 'ppm': 71 | left *= 1000 72 | right *= 1000 73 | if left < 100 and wide: 74 | settings.set('search', 'precursor accuracy left', 100) 75 | if right < 100 and wide: 76 | settings.set('search', 'precursor accuracy right', 100) 77 | # settings.set('search', 'precursor accuracy unit', 'ppm') 78 | results = list(process_file(fname, settings, initial_run=False)) 79 | filtered = get_subset(results, settings, fdr=0.01) 80 | filtered = get_peptides_subset(filtered) 81 | logger.info('%s PSMs with 1%% FDR.', len(filtered)) 82 | if len(filtered) < 250: 83 | if len(filtered) < 250: 84 | logger.warning('OPTIMIZATION ABORTED') 85 | return settings 86 | else: 87 | functions = [precursor_mass_optimization, fragment_mass_optimization, 88 | missed_cleavages_optimization] 89 | else: 90 | functions = [ 91 | rt_filtering, 92 | # precursor_mass_optimization, 93 | fragment_mass_optimization, 94 | # missed_cleavages_optimization 95 | ] 96 | for func in functions: 97 | # settings = func(filtered, settings, get_subset(results, settings, fdr=100.0)) 98 | settings = func(filtered, settings, [x for x in results if x['candidates'][0][2] == 'd']) 99 | settings.set('scoring', 'e-values for candidates', efc) 100 | settings.set('scoring', 'best peptides', [str(res['candidates'][0][1]) for res in results]) 101 | return settings 102 | 103 | 104 | def charge_optimization(results, settings): 105 | settings = settings.copy() 106 | chargestates = np.array([get_info(res['spectrum'], res, settings)[1] for res in results]) 107 | mincharge = chargestates.min() 108 | maxcharge = chargestates.max() 109 | 110 | for ch in range(mincharge, maxcharge+1): 111 | if float(chargestates[chargestates < ch].size) / chargestates.size < 0.01: 112 | mincharge = ch 113 | for ch in range(maxcharge, mincharge-1, -1): 114 | if float(chargestates[chargestates 
> ch].size) / chargestates.size < 0.01: 115 | maxcharge = ch 116 | logger.info('NEW charges = %s:%s', mincharge, maxcharge) 117 | settings.set('search', 'maximum charge', maxcharge) 118 | settings.set('search', 'minimum charge', mincharge) 119 | return settings 120 | 121 | def calibrate_mass(bwidth, mass_left, mass_right, true_md): 122 | bbins = np.arange(-mass_left, mass_right, bwidth) 123 | H1, b1 = np.histogram(true_md, bins=bbins) 124 | b1 = b1 + bwidth 125 | b1 = b1[:-1] 126 | 127 | popt, pcov = curve_fit(noisygaus, b1, H1, p0=[1, np.median(true_md), 1, 1]) 128 | mass_shift, mass_sigma = popt[1], np.abs(popt[2]) 129 | return mass_shift, mass_sigma, pcov[0][0] 130 | 131 | def noisygaus(x, a, x0, sigma, b): 132 | return a * np.exp(-(x - x0) ** 2 / (2 * sigma ** 2)) + b 133 | 134 | def precursor_mass_optimization(results, settings, unf): 135 | settings_nopime = settings.copy() 136 | settings_nopime.set('search', 'precursor isotope mass error', '0') 137 | settings_nopime.set('search', 'shifts', '0') 138 | # results = get_output(results, settings_nopime) 139 | 140 | settings = settings.copy() 141 | mass_left = settings.getfloat('search', 'precursor accuracy left') 142 | mass_right = settings.getfloat('search', 'precursor accuracy right') 143 | massdif = np.array([res['candidates'][0][4]['mzdiff']['ppm'] for res in results]) 144 | massdif = massdif[(massdif > -mass_left) & (massdif < mass_right)] 145 | if settings.get('search', 'precursor accuracy unit') != 'ppm': 146 | mass_left = mass_left * 1e6 / 400 147 | mass_right = mass_right * 1e6 / 400 148 | logger.info('mass_left, mass_right: %s, %s', mass_left, mass_right) 149 | try: 150 | mass_shift, mass_sigma, covvalue = calibrate_mass(0.1, mass_left, mass_right, massdif) 151 | if np.isinf(covvalue): 152 | mass_shift, mass_sigma, covvalue = calibrate_mass(0.01, mass_left, mass_right, massdif) 153 | logger.info('%s, %s -> %s +- 8 * %s; %s', mass_left, mass_right, mass_shift, mass_sigma, covvalue) 154 | best_par_mt_l = 
mass_shift - 8 * mass_sigma 155 | best_par_mt_r = mass_shift + 8 * mass_sigma 156 | logger.info('SMART MASS TOLERANCE = %s:%s', best_par_mt_l, best_par_mt_r) 157 | except RuntimeError: 158 | error = True 159 | else: 160 | error = False 161 | if not error and np.isinf(covvalue): 162 | error = True 163 | logger.warning('Double error when fitting precursor errors: %s', massdif) 164 | print(percentileofscore(massdif, best_par_mt_r) - percentileofscore(massdif, best_par_mt_l), '!!!') 165 | if error or (percentileofscore(massdif, best_par_mt_r) - percentileofscore(massdif, best_par_mt_l) < 95): 166 | best_par_mt_l = scoreatpercentile(massdif, 0.1) 167 | best_par_mt_r = scoreatpercentile(massdif, 99.9) 168 | logger.warning('Percentage sanity check FAILED. Falling back on percentage boundaries') 169 | else: 170 | best_par_mt_l = max(best_par_mt_l, scoreatpercentile(massdif, 0.1)) 171 | best_par_mt_r = min(best_par_mt_r, scoreatpercentile(massdif, 99.9)) 172 | 173 | best_par_mt_l = -10 174 | best_par_mt_r = 10 175 | logger.info('NEW PARENT MASS TOLERANCE = %s:%s', best_par_mt_l, best_par_mt_r) 176 | settings.set('search', 'precursor accuracy left', -best_par_mt_l) 177 | settings.set('search', 'precursor accuracy right', best_par_mt_r) 178 | settings.set('search', 'precursor accuracy unit', 'ppm') 179 | return settings 180 | 181 | def missed_cleavages_optimization(results, settings, unf): 182 | settings = settings.copy() 183 | missedcleavages = np.array([parser.num_sites(str(res['candidates'][0][1]), get_enzyme(str(settings.get('search', 'enzyme')))) 184 | for res in results]) 185 | best_missedcleavages = missedcleavages.max() 186 | for mc in range(best_missedcleavages, -1, -1): 187 | if float(missedcleavages[missedcleavages > mc].size) / missedcleavages.size < 0.002: 188 | best_missedcleavages = mc 189 | logger.info('NEW miscleavages = %s', best_missedcleavages) 190 | settings.set('search', 'number of missed cleavages', best_missedcleavages) 191 | return settings 192 | 193 
def fragment_mass_optimization(results, settings, results_unf):
    """Estimate an optimal fragment mass tolerance (ppm) from current PSMs.

    Collects fragment mass errors for the top candidate of every PSM in
    *results* and takes 4x the 68th percentile (~one sigma) of the error
    distribution as the new tolerance.

    Parameters
    ----------
    results : iterable of dict
        PSMs with 'spectrum' and 'candidates' entries.
    settings : ConfigParser-like
        Search settings; a modified copy is returned, the original is untouched.
    results_unf : unused
        Kept for interface compatibility with the other optimization hooks.

    Returns
    -------
    A copy of *settings* with 'product accuracy ppm' and
    'product accuracy unit' updated.
    """
    settings = settings.copy()

    # aa_mass may already be cached in settings by an earlier stage
    if settings.has_option('misc', 'aa_mass'):
        aa_mass = settings.get('misc', 'aa_mass')
    else:
        aa_mass = get_aa_mass(settings)

    fragmassdif = []
    for res in results:
        neutral_mass, charge_state, RT, comp_voltage = get_info(res['spectrum'], res, settings, aa_mass)
        tres = get_fragment_mass_tol(res['spectrum'], str(res['candidates'][0][1]), settings, charge_state)
        fragmassdif.extend(tres['fmt'])

    fragmassdif = np.array(fragmassdif)

    # 68th percentile approximates one sigma; x4 gives a comfortably wide window
    best_frag_mt = scoreatpercentile(fragmassdif, 68) * 4

    logger.info('NEW FRAGMENT MASS TOLERANCE ppm = %s', best_frag_mt)
    settings.set('search', 'product accuracy ppm', best_frag_mt)
    settings.set('search', 'product accuracy unit', 'ppm')

    return settings


def rt_filtering(results, settings, unf):
    """Train an additive retention-time model on current PSMs and install a
    scoring condition rejecting candidates with implausible RT deviation.

    Returns a modified copy of *settings*; if all experimental RTs are zero
    (RT information missing), the settings are returned unchanged.
    """
    settings = settings.copy()
    if settings.has_option('misc', 'legend'):
        legend = settings.get('misc', 'legend')
    else:
        legend = None
    RTexp, seqs = zip(*[(utils.get_RT(res['spectrum']), res['candidates'][0][1]) for res in results])
    if legend is not None:
        stdl = set(parser.std_labels)
        newseqs = []
        for s in seqs:
            if parser.fast_valid(s):
                newseqs.append(list(s))
            else:
                seq = []
                # BUGFIX: the c-term flag was previously named 'c' and was
                # clobbered by the loop variable 'for c in s', so std_cterm
                # was never appended; use distinct flag names.
                nterm_found, cterm_found = False, False
                for ch in s:
                    if ch in stdl:
                        seq.append(ch)
                    else:
                        mod, aa, term = legend[ch]
                        if aa == '-':
                            if term == '[':
                                seq.append(mod + '-')
                                nterm_found = True
                            else:
                                seq.append('-' + mod)
                                cterm_found = True
                        else:
                            seq.append(mod + aa)
                if not nterm_found:
                    seq.append(parser.std_nterm)
                if not cterm_found:
                    seq.append(parser.std_cterm)
                newseqs.append(seq)
        seqs = newseqs
    RTexp = [float(x) for x in RTexp]
    if np.allclose(RTexp, 0):
        logger.warning('RT is missing. Skipping RT optimization.')
        return settings
    RC_def = achrom.RCs_gilar_rp
    xdict = {}
    for key, val in RC_def['aa'].items():
        xdict[key] = [val, None]
    RC_dict = utils.get_RCs_vary_lcp(seqs, RTexp)
    RC_dict_new = dict()
    for key, val in RC_dict['aa'].items():
        xdict.setdefault(key, [val, None])[1] = val
    # linear map from default RCs to trained RCs, used to extrapolate
    # coefficients for residues absent from the training data
    a, b, _, _ = aux.linear_regression(
        [x[0] for x in xdict.values() if x[1] is not None],
        [x[1] for x in xdict.values() if x[1] is not None])
    for key, x in xdict.items():
        if x[1] is None:
            x[1] = x[0] * a + b
        RC_dict_new[key] = x[1]
    if legend is not None:
        # map modified-residue labels to the RC of the underlying residue
        for k, v in legend.items():
            if len(k) == 1:
                continue
            if k[-1] in '[]':
                if k[-2] == '-':
                    kk = ('-' + k[1:-1]) if k[-1] == ']' else (k[:-1])
                else:
                    kk = k[:-1]
            else:
                kk = k
            logger.debug('%s -> %s', k, kk)
            if kk in RC_dict_new:
                RC_dict_new[v] = RC_dict_new[kk]
            else:
                # guard: kkk could previously be referenced unbound if no
                # branch below matched
                kkk = None
                if kk[-1].isupper():
                    kkk = kk[-1]
                elif kk[-1] == '-':
                    kkk = parser.std_nterm
                elif kk[0] == '-':
                    kkk = parser.std_cterm
                RC_dict_new[v] = RC_dict_new.get(kkk, 0)
                logger.info('No RC for %s, using %s or 0: %s', kk, kkk, RC_dict_new[v])

    RC_dict['aa'] = RC_dict_new

    logger.debug('RC dict: %s', RC_dict)
    rtexp = np.array([np.mean(x) for x in RTexp])
    rttheor = np.array([calculate_RT(pep, RC_dict, raise_no_mod=False)
                        for pep in seqs])
    deltaRT = rtexp - rttheor
    logger.debug('Linear regression: %s', aux.linear_regression(rtexp, rttheor))
    # keep the central 99.9% of the deltaRT distribution
    best_RT_l = scoreatpercentile(deltaRT, 0.05)
    best_RT_r = scoreatpercentile(deltaRT, 99.95)

    def condition(spectrum, cand, _, stored_value=False):
        # memoize the predicted RT for this candidate via stored_value
        if not stored_value:
            stored_value = calculate_RT(cand, RC_dict)
        rtd = spectrum['RT'] - stored_value
        return best_RT_l <= rtd <= best_RT_r, stored_value
    settings.set('scoring', 'condition', condition)
    return settings


def calibrate_mass(bwidth, mass_left, mass_right, true_md):
    """Fit a Gaussian + constant background to the mass-error histogram.

    Parameters
    ----------
    bwidth : float
        Histogram bin width.
    mass_left, mass_right : float
        Window boundaries; bins span [-mass_left, mass_right).
    true_md : array-like
        Observed mass differences.

    Returns
    -------
    (mass_shift, mass_sigma, covariance[0][0])
    """
    bbins = np.arange(-mass_left, mass_right, bwidth)
    H1, b1 = np.histogram(true_md, bins=bbins)
    b1 = b1 + bwidth
    b1 = b1[:-1]

    popt, pcov = curve_fit(noisygaus, b1, H1, p0=[1, np.median(true_md), 1, 1])
    mass_shift, mass_sigma = popt[1], np.abs(popt[2])
    return mass_shift, mass_sigma, pcov[0][0]

def noisygaus(x, a, x0, sigma, b):
    """Gaussian of amplitude *a*, center *x0*, width *sigma* on a constant
    background *b*; model function for calibrate_mass."""
    return a * np.exp(-(x - x0) ** 2 / (2 * sigma ** 2)) + b
def get_label(modmass, labels):
    """Return a short letter label ('aaa', 'aab', ...) for a modification mass.

    *labels* maps already-seen masses to labels and carries the counters
    'i', 'j', 'k' of a three-digit base-26 counter.

    Returns (label, labels, flag) where flag is 1 when a new label was
    generated and 0 when *modmass* was already known.
    """
    abt = string.ascii_lowercase
    abt_l = len(abt) - 1
    if modmass in labels:
        return labels[modmass], labels, 0
    labels[modmass] = abt[labels['i']] + abt[labels['j']] + abt[labels['k']]
    # advance the counter with carry: k -> j -> i
    labels['k'] += 1
    if labels['k'] > abt_l:
        labels['k'] = 0
        labels['j'] += 1
        if labels['j'] > abt_l:
            labels['j'] = 0
            labels['i'] += 1
    return labels[modmass], labels, 1


def process_mods(settings, spec, name, labels):
    """Parse a 'mass@aa,mass@aa' modification spec from the command line and
    store the equivalent labelled entries in the *settings* section
    'modifications' under key *name*.

    '[' / ']' target residues denote protein N-/C-terminal modifications.
    """
    mods_array = []
    if spec:
        for mod in spec.split(','):
            modmass, modaa = mod.split('@')
            lbl, labels, flag = get_label(modmass, labels)
            if modaa == '[':
                ntermlabel, modaa, ctermlabel = '-', '', ''
            elif modaa == ']':
                ntermlabel, modaa, ctermlabel = '', '', '-'
            else:
                ntermlabel, ctermlabel = '', ''
            mods_array.append(ctermlabel + lbl + modaa + ntermlabel)
            if flag:
                settings.set('modifications', lbl, modmass)
    # an explicitly given empty spec ('') still overrides the config value
    if mods_array or spec is not None:
        settings.set('modifications', name, ','.join(mods_array))


def _update(settings, section, name, value):
    """Set settings[section][name] = value, but only when value is not None
    (i.e. the corresponding command-line option was given)."""
    if value is not None:
        settings.set(section, name, value)


def run():
    """Command-line entry point: parse arguments, build settings and run the
    search (optionally preceded by feature detection / demultiplexing) on
    every input file."""
    parser = argparse.ArgumentParser(
        description='Search proteins using LC-MS/MS spectra',
        epilog='''

    Example usage
    -------------
    $ identipy input.mgf -db human.fasta
    -------------
    ''',
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('file', help='input mzML or MGF file with MS/MS spectra', nargs='+')
    parser.add_argument('-db', help='path to protein FASTA file', metavar='FASTA')
    parser.add_argument('-cfg', help='path to file with parameters', metavar='CONFIG_FILE')
    parser.add_argument('-out', '-o', help='output path', metavar='PATH')
    parser.add_argument('-of', help='output format', metavar='FORMAT')
    parser.add_argument('-sep', help='output column separator (for table format)')
    parser.add_argument('-at', help='Use auto-tuning of search parameters', action='store_true')
    parser.add_argument('-nopwide', help='Do not increase initial precursor mass accuracy for auto-tuning', action='store_true')
    parser.add_argument('-punit', help='precursor mass tolerance unit', metavar='UNIT', choices=['ppm', 'Da'])
    parser.add_argument('-ptol', help='precursor mass tolerance', type=float, metavar='VALUE')
    parser.add_argument('-lptol', help='*left precursor mass tolerance', type=float, metavar='VALUE')
    parser.add_argument('-rptol', help='*right precursor mass tolerance', type=float, metavar='VALUE')
    parser.add_argument('-funit', help='fragment mass tolerance unit', metavar='UNIT', choices=['ppm', 'Da'])
    parser.add_argument('-ftol', help='fragment mass tolerance', type=float, metavar='VALUE')
    parser.add_argument('-fminmz', help='fragment min m/z', type=float, metavar='VALUE')
    parser.add_argument('-lmin', help='min length of peptides', type=int, metavar='N')
    parser.add_argument('-lmax', help='max length of peptides', type=int, metavar='N')
    parser.add_argument('-massmin', help='min mass of peptides', type=float, metavar='VALUE')
    parser.add_argument('-massmax', help='max mass of peptides', type=float, metavar='VALUE')
    parser.add_argument('-e', help='cleavage rule in quotes!. X!Tandem style for cleavage rules', metavar='RULE')
    parser.add_argument('-mc', help='number of missed cleavages', type=int, metavar='N')
    parser.add_argument('-semi', help='include semitryptic peptides', action='store_true')
    parser.add_argument('-noclip', help='Disable clipping of N-terminal methionine', action='store_false', dest='clip_M')
    parser.add_argument('-cmin', help='min precursor charge', type=int, metavar='N')
    parser.add_argument('-cmax', help='max precursor charge', type=int, metavar='N')
    parser.add_argument('-cumin', help='min unknown precursor charge', type=int, metavar='N')
    parser.add_argument('-cumax', help='max unknown precursor charge', type=int, metavar='N')
    parser.add_argument('-ime', help='precursor isotope mass error. The parent ion\
    mass tolerance is expanded by opening up multiple tolerance windows centered\
    on the given number of 13C isotope peaks for a peptide.', type=int, metavar='N')
    parser.add_argument('-shifts', help='shifts. example: 0,16.000,23.000,12')
    parser.add_argument('-snp', help='1 means make SNP changes for ALL peptides', type=int)
    parser.add_argument('-rapid', help='leave only 2000 random spectra for processing', action='store_true')
    parser.add_argument('-mm', help='number of minimum matched ions', type=int, metavar='N')
    parser.add_argument('-ad', help='add decoy', action='store_true')
    parser.add_argument('-prefix', help='decoy prefix')
    parser.add_argument('-infix', help='decoy infix')
    parser.add_argument('-method', help='decoy method; reverse or shuffle', choices=['reverse', 'shuffle'])
    parser.add_argument('-nodeis', help='do not use MS/MS deisotoping', action='store_true')
    parser.add_argument('-deistol', help='deisotope mass accuracy', type=float)
    parser.add_argument('-score', help='used scoring function', choices=['RNHS2', 'RNHS', 'hyperscore', 'morpheusscore'])
    parser.add_argument('-minp', help='minumum peaks in MS/MS spectra', type=int, metavar='N')
    parser.add_argument('-maxp', help='maximum peaks in MS/MS spectra', type=int, metavar='N')
    parser.add_argument('-dyn', help='dynamic range', type=float)
    parser.add_argument('-mfc', help='maximum fragment charge', type=int, metavar='N')
    parser.add_argument('-nproc', help='number of processes. 0 means auto', type=int, metavar='N')
    parser.add_argument('-maxmods', help='maximum variable mods per sequence', type=int, metavar='N')
    parser.add_argument('-ncleave', help='protein nterm cleavage', type=float)
    parser.add_argument('-ccleave', help='protein cterm cleavage', type=float)
    parser.add_argument('-fmods', help='fixed modifications. Format: mass1@aminoacid1,mass2@aminoacid2')
    parser.add_argument('-vmods', help='variable modifications. Format: mass1@aminoacid1,mass2@aminoacid2')
    parser.add_argument('-pmods', help='variable protein terminal modifications')
    parser.add_argument('-tags', help='Add quantitation tags to the pepXML output. Can be tmt10plex, tmt6plex, tmt11plex or custom format label1:mass1,label2:mass2...')
    parser.add_argument('-debug', help='Print debugging messages', action='store_true')
    parser.add_argument('-dino', help='path to Dinosaur JAR file or Biosaur executable. Used for chimeric spectrum processing and MS1 Intensity calculation', default=False)
    parser.add_argument('-dinoargs', help='extra arguments to Dinosaur or Biosaur.', default='')
    parser.add_argument('-sd', '-skipdino', action='store_true', help='Skip feature detection if a feature file is found.')
    parser.add_argument('-demixing', help='Use demixing', action='store_true')
    parser.add_argument('-pif', help='Calculate PIF', action='store_true')

    args = vars(parser.parse_args())
    if args['debug']:
        logging.getLogger('identipy').setLevel(logging.DEBUG)

    if args['cfg']:
        settings = main.settings(args['cfg'])
    else:
        settings = main.settings()

    labels = {'i': 0, 'j': 0, 'k': 0}
    process_mods(settings, args['fmods'], 'fixed', labels)
    process_mods(settings, args['vmods'], 'variable', labels)
    process_mods(settings, args['pmods'], 'protein variable', labels)

    _update(settings, 'input', 'database', args['db'])
    _update(settings, 'search', 'precursor accuracy unit', args['punit'])
    _update(settings, 'search', 'precursor accuracy left', (args['ptol'] if not args['lptol'] else args['lptol']))
    _update(settings, 'search', 'precursor accuracy right', (args['ptol'] if not args['rptol'] else args['rptol']))
    _update(settings, 'search', 'product accuracy unit', args['funit'])
    _update(settings, 'search', 'product accuracy', args['ftol'])
    _update(settings, 'search', 'product minimum m/z', args['fminmz'])
    _update(settings, 'search', 'peptide maximum length', args['lmax'])
    _update(settings, 'search', 'peptide minimum length', args['lmin'])
    _update(settings, 'search', 'peptide maximum mass', args['massmax'])
    _update(settings, 'search', 'peptide minimum mass', args['massmin'])
    _update(settings, 'search', 'enzyme', args['e'])
    _update(settings, 'search', 'number of missed cleavages', args['mc'])
    _update(settings, 'search', 'semitryptic', args['semi'])
    _update(settings, 'search', 'clip N-terminal methionine', str(args['clip_M']))
    _update(settings, 'search', 'maximum charge', args['cmax'])
    _update(settings, 'search', 'minimum charge', args['cmin'])
    _update(settings, 'search', 'maximum unknown charge', args['cumax'])
    _update(settings, 'search', 'minimum unknown charge', args['cumin'])
    _update(settings, 'search', 'precursor isotope mass error', args['ime'])
    _update(settings, 'search', 'shifts', args['shifts'])
    _update(settings, 'search', 'snp', args['snp'])
    _update(settings, 'output', 'minimum matched', args['mm'])
    if args['ad']:
        _update(settings, 'input', 'add decoy', 'yes')
    if args['rapid']:
        _update(settings, 'search', 'rapid_check', 1)
    _update(settings, 'input', 'decoy prefix', args['prefix'])
    _update(settings, 'input', 'decoy infix', args['infix'])
    _update(settings, 'input', 'decoy method', args['method'])
    if args['nodeis']:
        _update(settings, 'input', 'deisotope', 'no')
    _update(settings, 'input', 'deisotoping mass tolerance', args['deistol'])
    if args['score']:
        _update(settings, 'scoring', 'score', 'identipy.scoring.' + args['score'])
    _update(settings, 'scoring', 'minimum peaks', args['minp'])
    _update(settings, 'scoring', 'maximum peaks', args['maxp'])
    _update(settings, 'scoring', 'dynamic range', args['dyn'])
    _update(settings, 'scoring', 'maximum fragment charge', args['mfc'])
    _update(settings, 'performance', 'processes', args['nproc'])
    _update(settings, 'modifications', 'maximum variable mods', args['maxmods'])
    _update(settings, 'modifications', 'protein nterm cleavage', args['ncleave'])
    _update(settings, 'modifications', 'protein cterm cleavage', args['ccleave'])
    _update(settings, 'output', 'path', args['out'])
    _update(settings, 'output', 'format', args['of'])
    _update(settings, 'output', 'separator', args['sep'])
    _update(settings, 'output', 'tags', args['tags'])
    if args['at']:
        ao_setting = 'identipy.extras.optimization'
        if args['nopwide']:
            _update(settings, 'optimization', 'increase precursor mass tolerance', 'no')
    else:
        ao_setting = None
    _update(settings, 'misc', 'first stage', ao_setting)

    dino_path = args['dino']
    demixing = args['demixing']
    calc_PIF = args['pif']
    logger.debug('Args: %s', args)

    for inputfile in args['file']:
        # each file gets a fresh copy so per-file optimization cannot leak
        csettings = copy.deepcopy(settings)

        if dino_path or calc_PIF:
            logger.info('Starting mzML analysis...')
            if os.path.splitext(inputfile)[1].lower() != '.mzml':
                if dino_path:
                    logger.error('Only mzML supported for Dinosaur!')
                elif calc_PIF:
                    logger.error('mzML required for PIF calculation!')
            else:
                try:
                    if dino_path:
                        path_to_features = os.path.splitext(inputfile)[0] + os.extsep + 'features' + os.extsep + 'tsv'
                        if not args['sd'] or not os.path.exists(path_to_features):
                            if dino_path.endswith('.jar'):
                                advpath = '--advParams=' + os.path.join(os.path.dirname(os.path.realpath(__file__)), 'adv.txt')
                                logger.info('Starting Dinosaur...')
                                # BUGFIX: args['dinoargs'] is a string; concatenating
                                # it to a list raised TypeError. Split it like the
                                # other branches do.
                                subprocess.run(['java', '-Djava.awt.headless=true', '-jar', os.path.realpath(dino_path), advpath, '--concurrency=12', inputfile] + shlex.split(args['dinoargs']))
                            elif 'dinosaur' in dino_path:
                                advpath = '--advParams=' + os.path.join(os.path.dirname(os.path.realpath(__file__)), 'adv.txt')
                                logger.info('Starting Dinosaur...')
                                subprocess.run([os.path.realpath(dino_path), advpath, '--concurrency=12', inputfile] + shlex.split(args['dinoargs']))
                            elif 'biosaur2' in dino_path:
                                logger.info('Starting biosaur2...')
                                cmd = [os.path.realpath(dino_path), inputfile, '-o', path_to_features] + shlex.split(args['dinoargs'])
                                logger.debug('Running command: %s', cmd)
                                subprocess.run(cmd)
                            else:
                                logger.info('Starting Biosaur...')
                                subprocess.run([os.path.realpath(dino_path), inputfile, '-out', path_to_features] + shlex.split(args['dinoargs']))
                        if demixing:
                            logger.info('Starting demultiplexing...')
                    else:
                        path_to_features = False
                    path_to_mgf = utils.demix_chimeric(path_to_features, inputfile, demixing, calc_PIF)
                    logger.info('MGF was created.')
                    if demixing:
                        logger.info('Demultiplexing has finished.')
                    utils.write_output(path_to_mgf, csettings, main.process_file(path_to_mgf, csettings))
                except Exception as e:
                    logger.error(e)
                    # NOTE(review): break aborts all remaining input files on the
                    # first failure — confirm this is intended (vs. continue)
                    break

        else:
            utils.write_output(inputfile, csettings, main.process_file(inputfile, csettings))


if __name__ == '__main__':
    run()
def prepare_peptide_processor(fname, settings):
    """Read spectra from *fname*, preprocess them and distribute them over
    per-process buckets sorted by neutral mass.

    Returns
    -------
    (kwargs, global_data) where *kwargs* holds scoring callables and
    tolerances shared by all workers, and *global_data* is a list with one
    dict of parallel arrays (spectra, titles, nmasses, charges, effcharges,
    title->spectrum map) per worker process.
    """
    n_proc = utils.get_nprocesses(settings)
    global_data = [
        {
            'spectra': [],
            'titles': [],
            'nmasses': [],
            'nmasses_set': set(),
            't2s': {},
            'charges': [],
            'effcharges': [],
            'fulls_global': {},
        }
        for _ in range(n_proc)
    ]
    logger.debug('global data: %s', len(global_data))

    try:
        fast_first_stage = settings.getint('misc', 'fast first stage')
    except Exception:  # option missing or malformed: disable fast first stage
        fast_first_stage = 0

    # maximum effective fragment charge for each precursor charge
    maxcharges = {}
    fcharge = settings.getint('scoring', 'maximum fragment charge')
    ch_range = range(settings.getint('search', 'minimum charge'),
                     1 + settings.getint('search', 'maximum charge'))
    for c in ch_range:
        maxcharges[c] = max(1, min(fcharge, c - 1) if fcharge else c - 1)

    params = {}
    params['maxpeaks'] = settings.getint('scoring', 'maximum peaks')
    params['minpeaks'] = settings.getint('scoring', 'minimum peaks')
    params['dynrange'] = settings.getfloat('scoring', 'dynamic range')
    params['acc'] = settings.getfloat('search', 'product accuracy')
    params['min_mz'] = settings.getfloat('search', 'product minimum m/z')
    params.update(utils._charge_params(settings))
    params['dacc'] = settings.getfloat('input', 'deisotoping mass tolerance')
    params['deisotope'] = settings.getboolean('input', 'deisotope')
    params['tags'] = utils.get_tags(settings.get('output', 'tags'))
    rapid_check = settings.getint('search', 'rapid_check')

    ptol_unit = settings.get('search', 'precursor accuracy unit')
    lptol = settings.getfloat('search', 'precursor accuracy left')
    rptol = settings.getfloat('search', 'precursor accuracy right')
    # the Da-binned spectral prefilter is currently disabled
    prec_acc_Da = False

    logger.info('Reading spectra ...')
    if not rapid_check:
        tmp_spec = utils.iterate_spectra(fname)
    else:
        # rapid mode: sample at most 2000 random spectra
        tmp_spec = list(utils.iterate_spectra(fname))
        if len(tmp_spec) >= 2000:
            tmp_spec = random.sample(tmp_spec, 2000)

    num_spectra = 0
    tmp_spec2 = []
    nmasses_tmp = []
    global_data_index_map = {}

    for spec in tmp_spec:
        ps = utils.preprocess_spectrum(spec, params)
        if ps is not None:
            tmp_spec2.append(ps)
            for m, c in utils.neutral_masses(ps, params):
                nmasses_tmp.append(m)

    nmasses_tmp = np.array(nmasses_tmp)
    idx_t = np.argsort(nmasses_tmp)
    max_nmass = nmasses_tmp[idx_t[-1]]
    # assign contiguous mass-sorted slices of (nearly) equal size to workers
    max_l = int(len(nmasses_tmp) / n_proc) + 1
    for idx, k in enumerate(idx_t):
        global_data_index_map[k] = idx // max_l
    logger.debug('nproc: %d, nmasses: %d, max_l: %d, maximum index: %d',
                 n_proc, nmasses_tmp.size, max_l, max(global_data_index_map.values()))

    for ps in tmp_spec2:
        ttl = utils.get_title(ps)
        for m, c in utils.neutral_masses(ps, params):
            global_data_index = global_data_index_map[num_spectra]
            effc = maxcharges[c]
            ps.setdefault('nm', {})[c] = m

            gd = global_data[global_data_index]
            gd['t2s'][ttl] = ps
            gd['nmasses'].append(m)
            gd['spectra'].append(ps)
            gd['titles'].append(ttl)
            gd['charges'].append(c)
            gd['effcharges'].append(effc)

            num_spectra += 1
    logger.info('%s spectra pass quality criteria.', num_spectra)

    # widest possible precursor window in Da, used for integer binning
    if ptol_unit != 'Da':
        max_prec_acc_Da = max_nmass * 1e-6 * max(abs(lptol), abs(rptol))
    else:
        max_prec_acc_Da = max(abs(lptol), abs(rptol))

    for global_data_index in range(n_proc):
        gd = global_data[global_data_index]
        i = np.argsort(gd['nmasses'])
        gd['nmasses'] = np.array(gd['nmasses'])[i]
        gd['spectra'] = np.array(gd['spectra'])[i]
        gd['titles'] = np.array(gd['titles'])[i]
        gd['charges'] = np.array(gd['charges'])[i]
        gd['effcharges'] = np.array(gd['effcharges'])[i]

        # integer mass bins (plus/minus one bin) for a fast membership prefilter
        tmp = (gd['nmasses'] / max_prec_acc_Da).astype(int)
        gd['nmasses_set'].update(tmp)
        gd['nmasses_set'].update(tmp + 1)
        gd['nmasses_set'].update(tmp - 1)

        if prec_acc_Da:
            # optional Da-binned index: bin -> fragment m/z key -> spectrum idx
            nmasses_conv = (gd['nmasses'] / prec_acc_Da).astype(int)
            tmp_dict = {}
            for idx, nm in enumerate(nmasses_conv):
                for key in (nm - 1, nm, nm + 1):
                    tmp_dict.setdefault(key, {})
                for spval in gd['spectra'][idx]['idict']:
                    for key in (nm - 1, nm, nm + 1):
                        tmp_dict[key].setdefault(spval, []).append(idx)
            del nmasses_conv
            gd['nmasses_set'] = tmp_dict

    utils.set_mod_dict(settings)

    aa_mass = utils.get_aa_mass(settings)
    score = utils.import_(settings.get('scoring', 'score'))
    # BUGFIX: score_fast_basic could be left undefined when the import below
    # failed, yet it is unconditionally returned; initialize both to False.
    score_fast = False
    score_fast_basic = False
    try:
        score_fast_name = settings.get('scoring', 'score') + '_fast'
        logger.debug('Fast score name: %s', score_fast_name)
        if score_fast_name in {'identipy.scoring.RNHS_fast', 'RNHS_fast'}:
            try:
                from .cutils import RNHS_fast as score_fast
                from .cutils import RNHS_fast_basic as score_fast_basic
            except ImportError as e:
                logger.warning('Could not import from cutils: %s', e.args)
                score_fast = utils.import_(settings.get('scoring', 'score') + '_fast')
                score_fast_basic = utils.import_(settings.get('scoring', 'score') + '_fast_basic')
        else:
            score_fast = utils.import_(settings.get('scoring', 'score') + '_fast')
            score_fast_basic = utils.import_(settings.get('scoring', 'score') + '_fast_basic')
    except Exception as e:
        score_fast = False
        logger.debug('No fast score imported: %s', e)

    acc_l = settings.getfloat('search', 'precursor accuracy left')
    acc_r = settings.getfloat('search', 'precursor accuracy right')
    acc_frag = settings.getfloat('search', 'product accuracy')
    frag_unit = settings.get('search', 'product accuracy unit')
    if frag_unit == 'ppm':
        acc_frag_ppm = settings.getfloat('search', 'product accuracy ppm')
    else:
        acc_frag_ppm = False
    unit = settings.get('search', 'precursor accuracy unit')
    rel = utils.relative(unit)

    if settings.has_option('scoring', 'condition'):
        cond = settings.get('scoring', 'condition')
    else:
        cond = None
    if isinstance(cond, str) and cond.strip():
        cond = utils.import_(cond)

    return {'rel': rel, 'aa_mass': aa_mass,
            'acc_l': acc_l, 'acc_r': acc_r, 'acc_frag': acc_frag, 'acc_frag_ppm': acc_frag_ppm,
            'unit': unit,
            'fast first stage': fast_first_stage,
            'sapime': utils.get_shifts_and_pime(settings),
            'cond': cond, 'score': score, 'score_fast': score_fast, 'score_fast_basic': score_fast_basic,
            'settings': settings, 'max_v': num_spectra, 'prec_acc_Da': prec_acc_Da, 'max_prec_acc_Da': max_prec_acc_Da}, global_data


def peptide_processor_iter_isoforms(peptide, best_res, global_data_local, **kwargs):
    """Isoform-iteration API wrapper: run peptide_processor and wrap a hit in
    a one-element list (None is propagated for misses)."""
    res = peptide_processor(peptide, best_res, global_data_local, **kwargs)
    if res:
        return [res, ]
def peptide_processor(peptide, best_res, global_data_local, **kwargs):
    """Match one candidate peptide against all spectra in this worker's bucket.

    Parameters
    ----------
    peptide : tuple (seqm, aachange_pos, snp_label, m)
        Modified sequence, SNP position, SNP label and neutral mass.
    best_res : dict
        Best negated score seen so far per spectrum title; candidates that
        cannot beat it are skipped early.
    global_data_local : dict
        This worker's parallel arrays built by prepare_peptide_processor.

    Returns
    -------
    (seqm, m, snp_label, results) when at least one spectrum matched, where
    each result is (score, title, precursor charge, score_info); otherwise
    None (implicitly).
    """
    spectra = global_data_local['spectra']
    titles = global_data_local['titles']
    nmasses = global_data_local['nmasses']
    nmasses_set = global_data_local['nmasses_set']
    charges = global_data_local['charges']
    effcharges = global_data_local['effcharges']
    seqm, aachange_pos, snp_label, m = peptide

    max_prec_acc_Da = kwargs.get('max_prec_acc_Da')
    nterm_mass = kwargs.get('nterm_mass')
    cterm_mass = kwargs.get('cterm_mass')
    settings = kwargs['settings']
    min_matched = kwargs['min_matched']

    # precursor tolerance in Da (ppm tolerances scale with the peptide mass)
    if kwargs['rel']:
        dm_l = kwargs['acc_l'] * m / 1.0e6
        dm_r = kwargs['acc_r'] * m / 1.0e6
    else:
        dm_l = kwargs['acc_l']
        dm_r = kwargs['acc_r']

    # indices of spectra whose neutral mass matches m for any shift/PIME
    # window; the integer-bin set check avoids useless binary searches
    idx = set()
    for shift in kwargs['sapime']:
        if int((m + shift) / max_prec_acc_Da) in nmasses_set:
            start = nmasses.searchsorted(m + shift - dm_l)
            end = nmasses.searchsorted(m + shift + dm_r, side='right')
            if end - start:
                idx.update(range(start, end))

    if kwargs['cond']:
        # user-supplied condition (e.g. RT filter); stored_value memoizes
        # per-peptide work across spectra
        stored_value = False
        idx2 = set()
        for i in idx:
            cond_val, stored_value = kwargs['cond'](spectra[i], seqm, settings, stored_value)
            if cond_val:
                idx2.add(i)
        idx = idx2

    if not idx:
        return

    # theoretical spectra per effective fragment charge; reshaping for the
    # full scorer is deferred until actually needed
    theor = {}
    theoretical_set = {}
    reshaped = {}
    for c in {effcharges[i] for i in idx}:
        theor[c], theoretical_set[c] = theor_spectrum(
            seqm, maxcharge=c, aa_mass=kwargs['aa_mass'], reshape=False,
            acc_frag=kwargs['acc_frag'], nterm_mass=nterm_mass,
            cterm_mass=cterm_mass, nm=m)
        reshaped[c] = False

    results = []
    for i in idx:
        fc = effcharges[i]
        s = spectra[i]
        st = titles[i]
        if kwargs['score_fast']:
            # cheap prefilter: matched-fragment count and quick score bound
            hf = kwargs['score_fast_basic'](s['fastset'], s['idict'], theoretical_set[fc], min_matched)
            if not hf[0] or -hf[1] > best_res.get(st, 0):
                continue
            if kwargs['fast first stage']:
                # first stage keeps the fast score and a stub info dict
                sc = hf[1]
                score = {'match': [], 'sumI': 1, 'dist': [], 'total_matched': 999, 'score_std': 0}
                if -sc <= best_res.get(st, 0) and score.pop('total_matched') >= min_matched:
                    results.append((sc, st, charges[i], score))
                continue
        # full scoring path (shared by fast-prefiltered and non-fast modes)
        if not reshaped[fc]:
            theor[fc] = reshape_theor_spectrum(theor[fc])
            reshaped[fc] = True
        score = kwargs['score'](s, theor[fc], kwargs['acc_frag'], kwargs['acc_frag_ppm'], position=aachange_pos)  # FIXME (?)
        sc = score.pop('score')
        if -sc <= best_res.get(st, 0) and score.pop('total_matched') >= min_matched:
            results.append((sc, st, charges[i], score))

    if results:
        return seqm, m, snp_label, results
def get_fragment_mass_tol(spectrum, peptide, settings, charge_state):
    """Collect fragment-ion mass deviations of *peptide* against *spectrum*.

    Matches the theoretical fragments (within the configured 'product
    accuracy', in Da) to spectrum peaks and returns a dict with:
      'fmt'         -- matched deviations in ppm,
      'fmt_neutral' -- the same deviations in Da,
      'bions'/'yions' -- boolean match masks for singly-charged b/y ions.
    Used downstream to derive an optimal fragment mass tolerance.
    """
    acc = settings.getfloat('search', 'product accuracy')
    int_array = spectrum['intensity array']
    int_array = int_array / int_array.max() * 100

    # Highest fragment charge: bounded by the config cap (if set) and by
    # precursor charge - 1, but never below 1.
    fcharge = settings.getint('scoring', 'maximum fragment charge')
    maxfrag_charge = max(1, min(fcharge, charge_state-1) if fcharge else charge_state-1)

    cterm_mass = settings.getfloat('modifications', 'protein cterm cleavage')
    nterm_mass = settings.getfloat('modifications', 'protein nterm cleavage')
    m = custom_mass(peptide, aa_mass=get_aa_mass(settings), nterm_mass = nterm_mass, cterm_mass = cterm_mass)

    theor, _ = theor_spectrum(peptide, maxcharge=maxfrag_charge, reshape=True, aa_mass=get_aa_mass(settings), acc_frag=acc,
                              nterm_mass = nterm_mass, cterm_mass=cterm_mass, nm=m)
    # Cache a 1-D KD-tree over the peak m/z values on the spectrum itself.
    if '__KDTree' not in spectrum:
        spectrum['__KDTree'] = cKDTree(spectrum['m/z array'].reshape((spectrum['m/z array'].size, 1)))

    dist_total, int_array_total = np.array([]), np.array([])
    dist_total_tmp = np.array([])
    match2 = {}
    for ion, fragments in theor.items():
        n = fragments.size
        # Nearest peak per theoretical fragment; unmatched -> dist == inf.
        dist, ind = spectrum['__KDTree'].query(fragments.reshape((n, 1)), distance_upper_bound=acc)
        mask = (dist != np.inf)

        logger.debug('%s %s %s', spectrum['intensity array'].size, ind.size, ind[mask])
        int_array_total = np.append(int_array_total, spectrum['intensity array'][ind[mask]])

        dist_total = np.append(dist_total, dist[mask] / spectrum['m/z array'][ind[mask]] * 1e6)  # ppm
        dist_total_tmp = np.append(dist_total_tmp, dist[mask])  # Da
        match2[ion] = mask

    yions = match2[('y', 1)]
    bions = match2[('b', 1)]
    new_params = {}
    if dist_total.size:
        new_params['fmt'] = dist_total
        new_params['fmt_neutral'] = dist_total_tmp
        new_params['bions'] = bions
        new_params['yions'] = yions
    else:
        # Nothing matched: return empty containers so callers can still
        # index the keys.
        new_params['fmt'] = []
        new_params['fmt_neutral'] = []
        new_params['bions'] = []
        new_params['yions'] = []
    return new_params
def get_fragment_mass_tol_ppm(spectrum, peptide, settings, charge_state, acc_ppm):
    """ppm-tolerance variant of get_fragment_mass_tol.

    Same output dict ('fmt' in ppm, 'fmt_neutral' in Da, 'bions'/'yions'
    masks), but matches are accepted within `acc_ppm` ppm; the Da search
    window is derived from acc_ppm at m/z 1500.
    """
    # Da window wide enough to hold acc_ppm at the top of the m/z range.
    acc = acc_ppm * 1500 * 1e-6
    int_array = spectrum['intensity array']
    int_array = int_array / int_array.max() * 100

    # Highest fragment charge: bounded by the config cap (if set) and by
    # precursor charge - 1, but never below 1.
    fcharge = settings.getint('scoring', 'maximum fragment charge')
    maxfrag_charge = max(1, min(fcharge, charge_state-1) if fcharge else charge_state-1)

    cterm_mass = settings.getfloat('modifications', 'protein cterm cleavage')
    nterm_mass = settings.getfloat('modifications', 'protein nterm cleavage')
    m = custom_mass(peptide, aa_mass=get_aa_mass(settings), nterm_mass = nterm_mass, cterm_mass = cterm_mass)
    theor, _ = theor_spectrum(peptide, maxcharge=maxfrag_charge, reshape=True, aa_mass=get_aa_mass(settings), acc_frag=acc,
                              nterm_mass = nterm_mass, cterm_mass=cterm_mass, nm=m)
    if '__KDTree' not in spectrum:
        spectrum['__KDTree'] = cKDTree(spectrum['m/z array'].reshape((spectrum['m/z array'].size, 1)))

    dist_total, int_array_total = np.array([]), np.array([])
    dist_total_tmp = np.array([])
    match2 = {}
    for ion, fragments in theor.items():
        n = fragments.size
        dist, ind = spectrum['__KDTree'].query(fragments.reshape((n, 1)), distance_upper_bound=acc)
        mask = (dist != np.inf)

        # Unmatched queries get ind == n; clip before indexing, then drop
        # everything whose relative deviation exceeds acc_ppm.
        ind = ind.clip(max=spectrum['m/z array'].size-1)
        nacc = np.where(dist / spectrum['m/z array'][ind] * 1e6 > acc_ppm)[0]
        mask[nacc] = False

        logger.debug('%s %s %s', spectrum['intensity array'].size, ind.size, ind[mask])
        int_array_total = np.append(int_array_total, spectrum['intensity array'][ind[mask]])

        dist_total = np.append(dist_total, dist[mask] / spectrum['m/z array'][ind[mask]] * 1e6)  # ppm
        dist_total_tmp = np.append(dist_total_tmp, dist[mask])  # Da
        match2[ion] = mask

    yions = match2[('y', 1)]
    bions = match2[('b', 1)]
    new_params = {}
    if dist_total.size:
        new_params['fmt'] = dist_total
        new_params['fmt_neutral'] = dist_total_tmp
        new_params['bions'] = bions
        new_params['yions'] = yions
    else:
        new_params['fmt'] = []
        new_params['fmt_neutral'] = []
        new_params['bions'] = []
        new_params['yions'] = []
    return new_params
def morpheusscore_fast(spectrum_fastset, spectrum_idict, theoretical_set, min_matched):
    """Cheap Morpheus-style pre-score on binned fragment masses.

    spectrum_fastset -- set of binned experimental fragment keys
    spectrum_idict   -- binned key -> normalized intensity
    theoretical_set  -- {'b': ..., 'y': ...} binned theoretical fragments
    Returns (matched_count, matched_count + matched_intensity_sum),
    or (0, 0) when fewer than min_matched fragments match.
    """
    hits_b = spectrum_fastset.intersection(theoretical_set['b'])
    hits_y = spectrum_fastset.intersection(theoretical_set['y'])
    n_hits = len(hits_b) + len(hits_y)
    if n_hits < min_matched:
        return 0, 0
    intensity_sum = sum(spectrum_idict[fr] for fr in hits_b)
    intensity_sum += sum(spectrum_idict[fr] for fr in hits_y)
    return n_hits, n_hits + intensity_sum
def morpheusscore(spectrum, theoretical, acc, acc_ppm=False, position=False):
    """Morpheus-like score: matched-peak count plus matched intensity fraction.

    spectrum    -- dict with 'm/z array', 'intensity array', 'Isum';
                   '__KDTree' and 'norm' are cached on it here
    theoretical -- {(ion_type, charge): fragment m/z array}
    acc/acc_ppm -- absolute (Da) and optional relative (ppm) tolerance
    position    -- optional 1-based substitution position that must be
                   supported by matched ions, else a zero result is returned
    """
    def _empty():
        # Result returned whenever nothing acceptable matched.
        return {'score': 0, 'match': None, 'sumI': 0, 'dist': [],
                'total_matched': 0, 'score_std': 0}

    if 'norm' not in spectrum:
        spectrum['norm'] = spectrum['Isum']
    mz = spectrum['m/z array']
    if '__KDTree' not in spectrum:
        spectrum['__KDTree'] = cKDTree(mz.reshape((mz.size, 1)))

    tree = spectrum['__KDTree']
    intensities = spectrum['intensity array']
    norm = spectrum['norm']

    score = 0
    total_matched = 0
    sumI = 0
    deviations = []
    match, match2 = {}, {}
    for ion, fragments in theoretical.items():
        dist, ind = tree.query(fragments, distance_upper_bound=acc)
        found = (dist != np.inf)
        if acc_ppm:
            # keep only matches within the relative tolerance
            within = (dist[found] / mz[ind[found]] * 1e6 <= acc_ppm)
        else:
            within = np.ones_like(dist[found], dtype=bool)
        nmatched = within.sum()
        if nmatched:
            total_matched += nmatched
            ion_intensity = intensities[ind[found][within]].sum()
            sumI += ion_intensity
            score += ion_intensity / norm
            deviations.extend(dist[found][within])
        match[ion] = within
        match2[ion] = found

    if not total_matched:
        return _empty()

    if position:
        yions = match2[('y', 1)]
        bions = match2[('b', 1)]
        plen = len(yions) + 1
        if position == 1:
            if not bions[0]:
                return _empty()
        elif position == plen:
            if not yions[0]:
                return _empty()
        elif not (yions[plen - position] and yions[plen - position - 1]) or (bions[position - 1] and bions[position - 2]):
            return _empty()

    score += total_matched
    sumI = np.log10(sumI)
    return {'score': score, 'match': match, 'sumI': sumI, 'dist': deviations,
            'total_matched': total_matched, 'score_std': 0}
def hyperscore_fast(spectrum_fastset, spectrum_idict, theoretical_set, min_matched):
    """Cheap hyperscore-style pre-score on binned fragment masses.

    Returns (matched_count, b!-and-y!-weighted intensity score), or (0, 0)
    when fewer than min_matched fragments match.
    """
    hits_b = spectrum_fastset.intersection(theoretical_set['b'])
    hits_y = spectrum_fastset.intersection(theoretical_set['y'])
    n_b = len(hits_b)
    n_y = len(hits_y)
    n_hits = n_b + n_y
    if n_hits < min_matched:
        return 0, 0
    intensity_sum = sum(spectrum_idict[fr] for fr in hits_b)
    intensity_sum += sum(spectrum_idict[fr] for fr in hits_y)
    # factorial weighting per ion series, as in the hyperscore family
    return n_hits, factorial(n_b) * 100 * intensity_sum * (n_b + n_y) * factorial(n_y)
def hyperscore(spectrum, theoretical, acc, acc_ppm=False, position=False):
    """X!Tandem-like hyperscore for one candidate peptide.

    spectrum    -- dict with 'm/z array' and 'intensity array';
                   '__KDTree' and 'norm' are cached on it here
    theoretical -- {(ion_type, charge): fragment m/z array}
    acc/acc_ppm -- absolute (Da) and optional relative (ppm) tolerance
    position    -- optional 1-based substitution position that must be
                   supported by matched ions, else a zero result is returned
    Returns a dict with 'score', 'match', 'sumI', 'dist', 'total_matched',
    'score_std'.
    """
    if 'norm' not in spectrum:
        spectrum['norm'] = spectrum['intensity array'].max() / 100.
    mz_array = spectrum['m/z array']
    score = 0
    mult = []
    match = {}
    match2 = {}
    total_matched = 0
    sumI = 0
    if '__KDTree' not in spectrum:
        spectrum['__KDTree'] = cKDTree(mz_array.reshape((mz_array.size, 1)))

    dist_all = []
    for ion, fragments in theoretical.items():
        dist, ind = spectrum['__KDTree'].query(fragments, distance_upper_bound=acc)
        mask1 = (dist != np.inf)
        if acc_ppm:
            # Unmatched queries get ind == n; clip before indexing, then
            # drop matches outside the relative tolerance.
            ind = ind.clip(max=mz_array.size-1)
            nacc = np.where(dist / mz_array[ind] * 1e6 > acc_ppm)[0]
            mask2 = mask1.copy()
            mask2[nacc] = False
        else:
            # BUGFIX: was np.ones_like(dist[mask1], dtype=bool), a boolean
            # mask of *match-count* length that was then used to index the
            # full-length ind/dist arrays below -- an IndexError whenever
            # any fragment was unmatched. Use the full-length match mask
            # (same convention as RNHS).
            mask2 = mask1
        nmatched = mask2.sum()
        if nmatched:
            total_matched += nmatched
            mult.append(factorial(nmatched))
            sumi = spectrum['intensity array'][ind[mask2]].sum()
            sumI += sumi
            score += sumi / spectrum['norm']
            dist_all.extend(dist[mask2])
        match[ion] = mask2
        match2[ion] = mask1
    if not total_matched:
        return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0}
    if position:
        yions = match2[('y', 1)]
        bions = match2[('b', 1)]
        plen = len(yions) + 1
        if position == 1:
            if not bions[0]:
                return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0}
        elif position == plen:
            if not yions[0]:
                return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0}
        else:
            if not (yions[plen - position] and yions[plen - position - 1]) or (bions[position - 1] and bions[position - 2]):
                return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0}

    # hyperscore: intensity fraction multiplied by per-series factorials
    for m in mult:
        score *= m
    sumI = np.log10(sumI)

    return {'score': score, 'score_std': 0, 'match': match, 'sumI': sumI, 'dist': dist_all, 'total_matched': total_matched}
def RNHS_ultrafast(spectrum_idict, theoretical_set, min_matched, nm, best_res, allowed_idx, max_v, prec_acc_Da):
    """Ultra-fast pre-filter: which spectra can this peptide still beat?

    spectrum_idict  -- precursor-mass bin -> {binned fragment: [spectrum idx, ...]}
    theoretical_set -- {'b': ..., 'y': ...} binned theoretical fragments
    nm/prec_acc_Da  -- neutral mass and precursor bin width (Da)
    best_res        -- spectrum idx -> current best (negated) score
    allowed_idx     -- candidate spectrum indices to consider
    max_v           -- unused here, kept for interface compatibility
    Returns a set of spectrum indices worth full scoring, or None when the
    precursor bin is empty / too few fragments match overall.
    """
    bucket = spectrum_idict.get(int(nm / prec_acc_Da), None)
    if not bucket:
        return None

    total = 0
    hits_b = dict()
    hits_y = dict()

    # Count, per spectrum index, how many b and y fragments hit it.
    for ion in theoretical_set['b']:
        if ion in bucket:
            for idx in bucket[ion]:
                hits_b[idx] = hits_b.get(idx, 0) + 1
                total += 1

    for ion in theoretical_set['y']:
        if ion in bucket:
            for idx in bucket[ion]:
                hits_y[idx] = hits_y.get(idx, 0) + 1
                total += 1

    if total < min_matched:
        return None

    candidates = set()
    for idx in allowed_idx:
        num_b = hits_b.get(idx, 0)
        num_y = hits_y.get(idx, 0)
        if num_b + num_y < min_matched:
            continue
        # Keep the spectrum if it has no score yet, or if the (negated)
        # factorial-product bound can still beat the current best.
        current_best = best_res.get(idx, 0)
        if not current_best or -factorial(num_b) * factorial(num_y) <= current_best:
            candidates.add(idx)
    return candidates
def RNHS_fast(spectrum_fastset, spectrum_idict, theoretical_set, min_matched):
    """Fast RNHS approximation on binned fragment masses.

    spectrum_fastset is unused (kept for interface compatibility with the
    other *_fast scorers); matching is done against spectrum_idict keys.
    Returns (matched_count, score), or (0, 0) below min_matched.
    """
    def _series_score(ions):
        # Count matches and accumulate intensity for one ion series,
        # weighting the intensity sum by the factorial of the match count.
        n = 0
        total = 0
        for ion in ions:
            if ion in spectrum_idict:
                n += 1
                total += spectrum_idict[ion]
        return n, total * factorial(n)

    n_b, score_b = _series_score(theoretical_set['b'])
    n_y, score_y = _series_score(theoretical_set['y'])
    nmatched = n_b + n_y
    if nmatched < min_matched:
        return 0, 0
    return nmatched, score_b + score_y
def RNHS(spectrum, theoretical, acc, acc_ppm=False, position=False):
    """RNHS score for one candidate peptide.

    Matches theoretical fragments to spectrum peaks within `acc` (Da) or
    `acc_ppm` (ppm) and combines the matched-intensity fraction with the
    factorials of per-series match counts.

    spectrum    -- dict with 'm/z array', 'intensity array', 'Isum';
                   '__KDTree' and 'norm' are cached on it here
    theoretical -- {(ion_type, charge): fragment m/z array}
    position    -- optional 1-based substitution position (SNP search)
    Returns a dict with 'score' plus diagnostics; all-zero on no match.
    """
    if 'norm' not in spectrum:
        spectrum['norm'] = spectrum['Isum']
    mz_array = spectrum['m/z array']
    score = 0
    mult = []
    match = {}
    match2 = {}
    total_matched = 0
    sumI = 0
    if '__KDTree' not in spectrum:
        spectrum['__KDTree'] = cKDTree(mz_array.reshape((mz_array.size, 1)))

    dist_all = []
    for ion, fragments in theoretical.items():
        # nearest peak per theoretical fragment; unmatched -> dist == inf
        dist, ind = spectrum['__KDTree'].query(fragments, distance_upper_bound=acc)
        mask1 = (dist != np.inf)
        if acc_ppm:
            # unmatched queries get ind == n; clip before indexing, then
            # drop matches outside the relative tolerance
            ind = ind.clip(max=mz_array.size-1)
            nacc = np.where(dist / mz_array[ind] * 1e6 > acc_ppm)[0]
            mask2 = mask1.copy()
            mask2[nacc] = False
        else:
            mask2 = mask1
        nmatched = mask2.sum()
        if nmatched:
            total_matched += nmatched
            mult.append(factorial(nmatched))
            sumi = spectrum['intensity array'][ind[mask2]].sum()
            sumI += sumi
            score += sumi
            dist_all.extend(dist[mask2])
        match[ion] = mask2
        match2[ion] = mask2

    # normalize the accumulated intensity by the spectrum total
    score = score / spectrum['norm']

    if not total_matched:
        return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0, 'IPGF': 0, 'IPGF2': 0, 'RNHS': 0}
    if position:
        yions = match2[('y', 1)]
        bions = match2[('b', 1)]
        plen = len(yions)
        if position > plen + 1:
            # substitution position outside the peptide
            return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0, 'IPGF': 0, 'IPGF2': 0, 'RNHS': 0}
        if position == 1:
            if not bions[0]:
                return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0, 'IPGF': 0, 'IPGF2': 0, 'RNHS': 0}
        elif position == plen + 1:
            if not yions[0]:
                return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0, 'IPGF': 0, 'IPGF2': 0, 'RNHS': 0}
        else:
            if not (yions[plen - position + 1] and yions[plen - position]):
                return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0, 'IPGF': 0, 'IPGF2': 0, 'RNHS': 0}

            # NOTE(review): this unconditional return zeroes every
            # non-terminal `position` (the sibling scorers and RNHS2 do not
            # have it) -- looks like leftover/debug code; confirm whether
            # it is intended before relying on positional RNHS scores.
            return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0, 'IPGF': 0, 'IPGF2': 0, 'RNHS': 0}


    for m in mult:
        score *= m

    sumI = np.log10(sumI)

    outscore = score

    return {'score': outscore, 'match': match, 'sumI': sumI, 'dist': dist_all, 'total_matched': total_matched,
            'score_std': 0, 'RNHS': score}
def rank_cor(theoretical_list, experimental_list):
    """Spearman-style rank correlation of two equally ranked lists.

    Returns 0 when fewer than two elements are given.
    """
    n = len(theoretical_list)
    if n <= 1:
        return 0
    top = 6 * sum((float(z1 - z2))**2 for z1, z2 in zip(theoretical_list, experimental_list))
    bottom = n * (n**2 - 1)
    return 1 - top/bottom

import math
def cos_correlation(theoretical_list, experimental_list):
    """Cosine similarity of two intensity vectors.

    Returns 0 for vectors of length <= 1 or when either vector is all-zero.
    """
    top = 0
    if len(theoretical_list) <= 1:
        return 0
    bottom = math.sqrt(sum([numb * numb for numb in theoretical_list])) * \
        math.sqrt(sum([numb * numb for numb in experimental_list]))
    if not bottom:
        return 0

    for i1, i2 in zip(theoretical_list, experimental_list):
        top += i1 * i2

    return top / bottom

def RNHS2_ultrafast(spectrum_idict, theoretical_set, min_matched, nm, best_res, allowed_idx, max_v=None, prec_acc_Da=None):
    # BUGFIX: this wrapper previously forwarded only 6 arguments while
    # RNHS_ultrafast requires 8 (max_v, prec_acc_Da were missing), so every
    # call raised TypeError. Accept and forward them, with defaults for
    # backward compatibility with any shorter call sites.
    return RNHS_ultrafast(spectrum_idict, theoretical_set, min_matched, nm, best_res, allowed_idx, max_v, prec_acc_Da)

def RNHS2_fast(spectrum_fastset, spectrum_idict, theoretical_set, min_matched):
    # thin alias: RNHS2 uses the same fast pre-score as RNHS
    return RNHS_fast(spectrum_fastset, spectrum_idict, theoretical_set, min_matched)
def RNHS2(spectrum, theoretical, acc, acc_ppm=False, position=False):
    """Multi-tolerance RNHS variant.

    Scores the spectrum at 10 progressively tighter fragment tolerances
    (acc/21, acc/19, ..., acc/3; likewise for acc_ppm when given) and
    returns the mean as 'score', the spread as 'score_std', and the
    loosest-tolerance score as 'bions_score_neg'. The diagnostic fields
    ('match', 'sumI', 'dist', 'total_matched') keep the values of the last
    (tightest) iteration.
    """
    mz_array = copy(spectrum['m/z array'])
    # Build the KD-tree on demand like the other scorers do, instead of
    # failing with KeyError when the caller has not cached it yet.
    if '__KDTree' not in spectrum:
        spectrum['__KDTree'] = cKDTree(spectrum['m/z array'].reshape((spectrum['m/z array'].size, 1)))
    KDT = copy(spectrum['__KDTree'])
    s_ia = copy(spectrum['intensity array'])
    s_is = copy(spectrum['Isum'])

    # Query once per ion series at the loosest tolerance; tighter
    # tolerances below only re-filter these cached results.
    query_dict = {}
    for ion, fragments in theoretical.items():
        query_dict[ion] = KDT.query(fragments, distance_upper_bound=acc)

    score_tmp = []
    if not acc_ppm:
        acc_ppm = 0
    for i in range(21, 1, -2):
        accc = acc / i
        accc_ppm = acc_ppm / i
        score = 0
        mult = []
        match = {}
        match2 = {}
        total_matched = 0
        sumI = 0
        dist_all = []
        for ion, fragments in theoretical.items():
            dist, ind = query_dict[ion]
            mask1 = (dist != np.inf)
            if acc_ppm:
                # unmatched queries get ind == n; clip before indexing
                ind = ind.clip(max=mz_array.size-1)
                nacc = np.where(dist / mz_array[ind] * 1e6 > accc_ppm)[0]
                mask2 = mask1.copy()
                mask2[nacc] = False
            else:
                nacc = np.where(dist > accc)[0]
                mask2 = mask1.copy()
                mask2[nacc] = False
            nmatched = mask2.sum()
            if nmatched:
                total_matched += nmatched
                mult.append(factorial(nmatched))
                sumi = s_ia[ind[mask2]].sum()
                sumI += sumi
                score += sumi
                dist_all.extend(dist[mask2])
            match[ion] = mask2
            match2[ion] = mask2
        score = score / s_is
        if total_matched:
            for m in mult:
                score *= m
            sumI = np.log10(sumI)
        score_tmp.append(score)
    # total_matched here is from the tightest tolerance; nothing matched
    # there means no usable identification.
    if not total_matched:
        return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0}
    if position:
        yions = match2[('y', 1)]
        bions = match2[('b', 1)]
        plen = len(yions)
        if position > plen + 1:
            # substitution position outside the peptide
            return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0}
        if position == 1:
            if not bions[0]:
                return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0}
        elif position == plen + 1:
            if not yions[0]:
                return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0}
        else:
            if not (yions[plen - position + 1] and yions[plen - position]):
                return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0}

    score_std = np.std(score_tmp)
    bions_score_neg = score_tmp[0]
    # BUGFIX: removed a stray debug print(score_tmp[0], np.mean(score_tmp))
    # that wrote to stdout for every scored PSM.
    score_tmp = np.mean(score_tmp)
    return {'score': score_tmp, 'score_std': score_std, 'match': match, 'sumI': sumI, 'dist': dist_all, 'total_matched': total_matched,
            'yions_score': 0, 'bions_score': 0, 'yions_score_neg': 0, 'bions_score_neg': bions_score_neg}
0} 557 | else: 558 | if not (yions[plen - position + 1] and yions[plen - position]): 559 | return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0} 560 | 561 | score_std = np.std(score_tmp)# / np.mean(score_tmp) 562 | bions_score_neg = score_tmp[0] 563 | print(score_tmp[0], np.mean(score_tmp)) 564 | score_tmp = np.mean(score_tmp) 565 | return {'score': score_tmp, 'score_std': score_std, 'match': match, 'sumI': sumI, 'dist': dist_all, 'total_matched': total_matched, 566 | 'yions_score': 0, 'bions_score': 0, 'yions_score_neg': 0, 'bions_score_neg': bions_score_neg} 567 | -------------------------------------------------------------------------------- /identipy/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pyteomics import mass, electrochem as ec, auxiliary as aux, fasta, mzml, parser, mgf 3 | import pandas as pd 4 | from itertools import combinations, islice 5 | from collections import defaultdict, Counter 6 | import numpy as np 7 | from multiprocessing import Queue, Process, cpu_count 8 | import string 9 | from copy import copy 10 | try: 11 | from ConfigParser import RawConfigParser 12 | except ImportError: 13 | from configparser import RawConfigParser 14 | import tempfile 15 | import os 16 | import platform 17 | import logging 18 | import itertools as it 19 | try: 20 | from lxml import etree 21 | except ImportError: 22 | etree = None 23 | from time import strftime 24 | from os import path 25 | logger = logging.getLogger(__name__) 26 | 27 | try: 28 | from pyteomics import cmass 29 | except ImportError: 30 | logger.warning('pyteomics.cythonize not found. It is highly recommended for good performance.') 31 | cmass = mass 32 | try: 33 | from . import cparser 34 | except ImportError: 35 | from . 
# Reporter-ion labels and masses for the built-in isobaric tag sets.
# Keys are the names accepted by get_tags(); values map label -> mass (Da).
default_tags = {
    'tmt10plex': {
        'tmt_126': 126.1277261,
        'tmt_127N': 127.1247610,
        'tmt_128C': 128.1344357,
        'tmt_129N': 129.1314706,
        'tmt_130C': 130.1411453,
        'tmt_131': 131.1381802,
        'tmt_127C': 127.1310809,
        'tmt_128N': 128.1281158,
        'tmt_129C': 129.1377905,
        'tmt_130N': 130.1348254
    },
    'tmt11plex': {
        'tmt_126': 126.1277261,
        'tmt_127N': 127.1247610,
        'tmt_127C': 127.1310809,
        'tmt_128N': 128.1281158,
        'tmt_128C': 128.1344357,
        'tmt_129N': 129.1314706,
        'tmt_129C': 129.1377905,
        'tmt_130N': 130.1348254,
        'tmt_130C': 130.1411453,
        'tmt_131': 131.1381802,
        'tmt_131C': 131.144999
    },
    'tmt_pro': {
        'tmt_126': 126.1277261,
        'tmt_127N': 127.1247610,
        'tmt_127C': 127.1310809,
        'tmt_128N': 128.1281158,
        'tmt_128C': 128.1344357,
        'tmt_129N': 129.1314706,
        'tmt_129C': 129.1377905,
        'tmt_130N': 130.1348254,
        'tmt_130C': 130.1411453,
        'tmt_131N': 131.1381802,
        'tmt_131C': 131.1445,
        'tmt_132N': 132.14153,
        'tmt_132C': 132.14785,
        'tmt_133N': 133.14489,
        'tmt_133C': 133.15121,
        'tmt_134N': 134.14824,
    },
    'tmt6plex': {
        'tmt_126': 126.1277261,
        'tmt_127N': 127.1247610,
        'tmt_128C': 128.1344357,
        'tmt_129N': 129.1314706,
        'tmt_130C': 130.1411453,
        'tmt_131': 131.1381802,
    }
}
# TMTpro is marketed as "16plex"; accept both names.
default_tags['tmt16plex'] = default_tags['tmt_pro']


def get_tags(tags):
    """Resolve an isobaric-tag specification to a label -> mass dict.

    `tags` may be the name of a built-in set (see default_tags), or a
    custom 'label:mass,label:mass' string. Falsy input is returned as-is.
    """
    logger.debug('Tags: %s', tags)
    if tags:
        if tags in default_tags:
            return default_tags[tags]
        else:
            ctags = dict()
            for tag in str(tags).split(','):
                # BUGFIX: each comma-separated item is exactly one
                # 'label:mass' pair. The old code iterated over the split
                # result and tried to unpack each *string* into two names,
                # raising ValueError for any custom tag specification.
                lbl, mss = tag.split(':')
                ctags[lbl] = float(mss)
            return ctags
    else:
        return tags
def get_child_for_mods(mods_str, settings, fixed=True, protein=False):
    """Yield pepXML modification elements for a modification spec string.

    mods_str: comma/semicolon-separated modification labels. A '-' at the
        start ("-label") marks a C-terminal modification, at the end
        ("label-") an N-terminal one; '[' / ']' on a residue mod mark
        N-/C-terminal specificity. Mass shifts are looked up in the
        'modifications' section of `settings` under the modification label.
    fixed: True for fixed modifications, False for variable ones.
    protein: if True, bracket specificity refers to the protein terminus
        rather than the peptide terminus.

    Yields lxml 'aminoacid_modification' elements for residue mods and
    'terminal_modification' elements for terminal mods.
    """
    if mods_str:
        for mod in re.split(r'[,;]\s*', mods_str):
            term = False
            if '-' not in mod:
                # Residue modification.
                child_mod = etree.Element('aminoacid_modification')

                t = None
                if '[' in mod:
                    t = 'n'
                elif ']' in mod:
                    t = 'c'
                if t:
                    child_mod.set('protein_terminus' if protein else 'peptide_terminus', t)
                    mod = mod.replace('[', '').replace(']', '')
                mod_label, mod_aa = parser._split_label(mod)
                mod_mass = mass.std_aa_mass.get(mod_aa, 0)
                mod_massdiff = settings.getfloat('modifications', mod_label)

                child_mod.set('aminoacid', mod_aa)
                child_mod.set('massdiff', str(mod_massdiff))
                child_mod.set('mass', str(mod_mass+mod_massdiff))
                child_mod.set('variable', 'Y' if not fixed else 'N')
                yield child_mod
            elif mod[0] == '-':
                # "-label": C-terminal modification.
                term = 'c'
                mod_label = mod[1:]
                mod_term_mass = settings.getfloat('modifications', 'protein cterm cleavage')
            elif mod[-1] == '-':
                # "label-": N-terminal modification.
                term = 'n'
                mod_label = mod[:-1]
                mod_term_mass = settings.getfloat('modifications', 'protein nterm cleavage')

            if term:
                mod_massdiff = settings.getfloat('modifications', mod_label)
                child_mod = etree.Element('terminal_modification')
                child_mod.set('terminus', term)
                child_mod.set('massdiff', str(mod_massdiff))
                # NOTE(review): for fixed terminal mods the reported 'mass'
                # excludes the massdiff itself -- confirm this matches the
                # intended pepXML convention.
                child_mod.set('mass', str((mod_massdiff if not fixed else 0)+mod_term_mass))
                child_mod.set('variable', 'Y' if not fixed else 'N')
                yield child_mod


def custom_mass(sequence, nterm_mass, cterm_mass, **kwargs):
    # Peptide mass with configurable terminal groups: replace the standard
    # H- (1.007825) and -OH (17.002735) contributions assumed by fast_mass
    # with the configured N-/C-terminal masses.
    return cmass.fast_mass(sequence, **kwargs) + (nterm_mass - 1.007825) + (cterm_mass - 17.002735)


def get_RCs(sequences, RTs, lcp=-0.21, term_aa=False, **kwargs):
    # Fit additive retention coefficients (RCs) to peptides and their
    # retention times by multilinear regression (adapted from
    # pyteomics.achrom). `sequences` is expected to be Counter-like dicts
    # (residue -> count); see get_RCs_vary_lcp, which passes them in.
    peptide_lengths = kwargs.get('lengths', np.log([len(peptide) for peptide in sequences]))
    peptide_dicts = sequences#[Counter(peptide) for
peptide in sequences] 165 | 166 | detected_amino_acids = {aa for peptide_dict in peptide_dicts 167 | for aa in peptide_dict} 168 | 169 | # Determine retention coefficients using multidimensional linear 170 | # regression. 171 | composition_array = [] 172 | for idx, pdict in enumerate(peptide_dicts): 173 | loglen = peptide_lengths[idx]#np.log(parser.length(pdict)) 174 | composition_array.append([pdict.get(aa, 0.) * (1. + lcp * loglen) 175 | for aa in detected_amino_acids] + [1.]) 176 | 177 | # Add normalizing conditions for terminal retention coefficients. The 178 | # condition we are using here is quite arbitrary. It implies that the sum 179 | # of N- or C-terminal RCs minus the sum of corresponding internal RCs must 180 | # be equal to zero. 181 | if term_aa: 182 | for term_label in ['nterm', 'cterm']: 183 | normalizing_peptide = [] 184 | for aa in detected_amino_acids: 185 | if aa.startswith(term_label): 186 | normalizing_peptide.append(1.0) 187 | elif (term_label+aa) in detected_amino_acids: 188 | normalizing_peptide.append(-1.0) 189 | else: 190 | normalizing_peptide.append(0.0) 191 | normalizing_peptide.append(0.0) 192 | composition_array.append(normalizing_peptide) 193 | RTs.append(0.0) 194 | 195 | # Use least square linear regression. 196 | RCs, res, rank, s = np.linalg.lstsq(np.array(composition_array), np.array(RTs)) 197 | 198 | # Remove normalizing elements from the RTs vector. 199 | if term_aa: 200 | for term_label in ['nterm', 'cterm']: 201 | RTs.pop() 202 | 203 | # Form output. 204 | RC_dict = {} 205 | RC_dict['aa'] = dict( 206 | zip(list(detected_amino_acids), 207 | RCs[:len(detected_amino_acids)])) 208 | RC_dict['aa'][parser.std_nterm] = 0.0 209 | RC_dict['aa'][parser.std_cterm] = 0.0 210 | RC_dict['const'] = RCs[len(detected_amino_acids)] 211 | RC_dict['lcp'] = lcp 212 | 213 | # Find remaining terminal RCs. 214 | if term_aa: 215 | for term_label in ['nterm', 'cterm']: 216 | # Check if there are terminal RCs remaining undefined. 
217 | undefined_term_RCs = [aa for aa in RC_dict['aa'] 218 | if aa[1:5] != 'term' and term_label + aa not in RC_dict['aa']] 219 | if not undefined_term_RCs: 220 | continue 221 | 222 | # Find a linear relationship between internal and terminal RCs. 223 | defined_term_RCs = [aa for aa in RC_dict['aa'] 224 | if aa[1:5] != 'term' and term_label + aa in RC_dict['aa']] 225 | 226 | a, b, r, stderr = aux.linear_regression( 227 | [RC_dict['aa'][aa] for aa in defined_term_RCs], 228 | [RC_dict['aa'][term_label+aa] for aa in defined_term_RCs]) 229 | 230 | # Define missing terminal RCs using this linear equation. 231 | for aa in undefined_term_RCs: 232 | RC_dict['aa'][term_label + aa] = a * RC_dict['aa'][aa] + b 233 | 234 | return RC_dict 235 | 236 | 237 | def get_RCs_vary_lcp(sequences, RTs, term_aa=False, lcp_range=(-1.0, 1.0), **kwargs): 238 | 239 | labels = kwargs.get('labels') 240 | 241 | best_r = -1.1 242 | best_RC_dict = {} 243 | lcp_accuracy = kwargs.get('lcp_accuracy', 0.1) 244 | 245 | min_lcp = lcp_range[0] 246 | max_lcp = lcp_range[1] 247 | step = (max_lcp - min_lcp) / 10.0 248 | peptide_lengths = np.log([len(peptide) for peptide in sequences]) 249 | peptide_dicts = [Counter(peptide) for peptide in sequences] 250 | while step > lcp_accuracy: 251 | lcp_grid = np.arange(min_lcp, max_lcp, 252 | (max_lcp - min_lcp) / 10.0) 253 | for lcp in lcp_grid: 254 | RC_dict = get_RCs(peptide_dicts, RTs, lcp, term_aa, labels=labels, lengths=peptide_lengths) 255 | regression_coeffs = aux.linear_regression( 256 | RTs, 257 | [calculate_RT(peptide, RC_dict) for peptide in peptide_dicts]) 258 | if regression_coeffs[2] > best_r: 259 | best_r = regression_coeffs[2] 260 | best_RC_dict = dict(RC_dict) 261 | min_lcp = best_RC_dict['lcp'] - step 262 | max_lcp = best_RC_dict['lcp'] + step 263 | step = (max_lcp - min_lcp) / 10.0 264 | 265 | return best_RC_dict 266 | 267 | 268 | def calculate_RT(peptide, RC_dict, raise_no_mod=True): 269 | plen = len(peptide) 270 | peptide_dict = peptide 271 | RT 
= 0.0 272 | for aa in peptide_dict: 273 | if aa not in RC_dict['aa']: 274 | if len(aa) == 1: 275 | raise aux.PyteomicsError('No RC for residue "{}".'.format(aa)) 276 | if (not raise_no_mod) and aa[-1] in RC_dict['aa']: 277 | RT += RC_dict['aa'][aa[-1]] 278 | else: 279 | raise aux.PyteomicsError( 280 | 'Residue "{0}" not found in RC_dict. '.format(aa) + 281 | 'Set raise_no_mod=False to ignore this error ' + 282 | 'and use the RC for "{0}"" instead.'.format(aa[-1])) 283 | else: 284 | RT += RC_dict['aa'][aa] 285 | 286 | length_correction_term = ( 287 | 1.0 + RC_dict.get('lcp', 0) * np.log(plen)) 288 | RT *= length_correction_term 289 | 290 | RT += RC_dict.get('const', 0) 291 | 292 | return RT 293 | 294 | 295 | _modchars = set(string.ascii_lowercase + string.digits) 296 | 297 | 298 | def custom_split_label(mod): 299 | j = 0 300 | while mod[j] in _modchars: 301 | j += 1 302 | if j == 0: 303 | return mod[1:], '-', ']' 304 | if len(mod[j:]) > 1 and '[' in mod: 305 | return mod[:j], mod[j:].replace('[', ''), '[' 306 | elif len(mod[j:]) > 1 and ']' in mod: 307 | return mod[:j], mod[j:].replace(']', ''), ']' 308 | elif len(mod[j:]) == 1: 309 | if mod.startswith('-'): 310 | return mod[:j], '-', ']' 311 | elif mod.endswith('-'): 312 | return mod[:j], '-', '[' 313 | else: 314 | return mod[:j], mod[j:], '' 315 | 316 | 317 | class MS2OnlyMzML(mzml.MzML): 318 | _default_iter_path = '//spectrum[./*[local-name()="cvParam" and @name="ms level" and @value="2"]]' 319 | _use_index = False 320 | _iterative = False 321 | 322 | 323 | def iterate_spectra(fname): 324 | ftype = fname.rsplit('.', 1)[-1].lower() 325 | if ftype == 'mgf': 326 | with mgf.read(fname, read_charges=False, use_index=False) as f: 327 | for x in f: 328 | yield x 329 | elif ftype == 'mzml': 330 | for x in MS2OnlyMzML(source=fname): 331 | yield x 332 | # with mzml.read(fname, use_index=False) as f: 333 | # for x in f: 334 | # if x['ms level'] > 1: 335 | # yield x 336 | else: 337 | raise ValueError('Unrecognized file type: 
{}'.format(ftype))


def get_nprocesses(settings):
    """Return the number of worker processes to use.

    Always 1 on Windows; otherwise the configured
    'performance'/'processes' value, where 0 means autodetect via
    cpu_count() (falling back to 1 if that is unavailable).
    """
    if platform.system() == 'Windows':
        return 1
    n = settings.getint('performance', 'processes')
    if n == 0:
        try:
            n = cpu_count()
        except NotImplementedError:
            n = 1
    return n


def iterate_and_preprocess(fname, params, settings):
    # Stream spectra from `fname` and feed them through preprocess_spectrum
    # on `n` workers.
    # NOTE(review): multimap is called here as multimap(n, func, it,
    # kwargs=params); confirm this matches the signature of the multimap
    # actually in scope (the candidate-search multimap defined later in this
    # file takes a positional `global_data` argument).
    it = iterate_spectra(fname)
    n = get_nprocesses(settings)
    return multimap(n, preprocess_spectrum, it, kwargs=params)


def is_decoy_function(settings):
    """Return a predicate deciding whether a protein description is a decoy.

    Uses 'decoy infix' (substring match) when set, otherwise 'decoy prefix'
    (startswith). When neither label is configured, logs an error and
    implicitly returns None -- callers will fail on the first call.
    """
    prefix = settings.get('input', 'decoy prefix').strip()
    infix = settings.get('input', 'decoy infix').strip()
    if infix:
        return lambda d: infix in d
    if prefix:
        return lambda d: d.startswith(prefix)
    logger.error('No decoy label specified. One of "decoy prefix" or "decoy infix" is needed.')


def peptide_gen(settings, clear_seen_peptides=False):
    """Yield (peptide, terminal-flags) pairs digested from the database.

    Terminal flags: 'n' when the peptide starts its protein, 'c' when it
    ends it ('nc' when both). Optionally clears the module-level
    seen_target/seen_decoy caches before starting.
    """
    if clear_seen_peptides:
        seen_target.clear()
        seen_decoy.clear()
    isdecoy = is_decoy_function(settings)
    enzyme = get_enzyme(settings.get('search', 'enzyme'))
    logger.debug('Using cleavage rule: %s', enzyme)
    semitryptic = settings.getint('search', 'semitryptic')
    mc = settings.getint('search', 'number of missed cleavages')
    minlen = settings.getint('search', 'peptide minimum length')
    maxlen = settings.getint('search', 'peptide maximum length')
    snp = settings.getint('search', 'snp')
    clip_M = settings.getboolean('search', 'clip N-terminal methionine')
    for prot in prot_gen(settings):
        for pep, pos in prot_peptides(prot[1], enzyme, mc, minlen, maxlen,
                is_decoy=isdecoy(prot[0]), snp=snp, desc=prot[0], semitryptic=semitryptic, position=True, clip_M=clip_M):
            # Mark peptides that sit at a protein terminus; this controls
            # which protein-terminal variable mods apply downstream.
            term = ''
            if pos == 0:
                term += 'n'
            if pos + len(pep) == len(prot[1]):
                term += 'c'
            yield pep, term


def peptide_isoforms(settings, clear_seen_peptides=False):
393 | snp = settings.getint('search', 'snp') 394 | maxmods = settings.getint('modifications', 'maximum variable mods') 395 | leg = settings.get('misc', 'legend') 396 | pleg = settings.get('misc', 'plegend') 397 | logger.debug('leg: %s, pleg: %s', leg, pleg) 398 | punct = set(string.punctuation) 399 | nmods = [(p, mod[1], mod[2]) for p, mod in leg.items() if p in punct] 400 | pmods_n, pmods_c = [], [] 401 | for p, mod in pleg.items(): 402 | if p in punct: 403 | if mod[2] == '[': 404 | pmods_n.append((p, mod[1], mod[2])) 405 | if mod[2] == ']': 406 | pmods_c.append((p, mod[1], mod[2])) 407 | logger.debug('nmods: %s', nmods) 408 | logger.debug('pmods_n: %s', pmods_n) 409 | logger.debug('pmods_c: %s', pmods_c) 410 | aa_mass = get_aa_mass(settings) 411 | nterm_mass = settings.getfloat('modifications', 'protein nterm cleavage') 412 | cterm_mass = settings.getfloat('modifications', 'protein cterm cleavage') 413 | for peptide, term in peptide_gen(settings, clear_seen_peptides): 414 | mods = nmods[:] 415 | if 'n' in term: 416 | mods += pmods_n 417 | if 'c' in term: 418 | mods += pmods_c 419 | for form in (custom_isoforms(peptide, variable_mods=mods, maxmods=maxmods, snp=snp) if (nmods and maxmods) else [peptide, ]): 420 | if snp: 421 | if 'snp' not in form: 422 | seqm = form 423 | aachange_pos = False 424 | snp_label = 'wild' 425 | else: 426 | tmp = form.split('snp') 427 | seqm = tmp[0] + tmp[1].split('at')[0].split('to')[-1] + tmp[2] 428 | aachange_pos = len(tmp[0]) + 1 429 | snp_label = tmp[1] 430 | aachange_pos = False 431 | else: 432 | seqm = form 433 | aachange_pos = False 434 | snp_label = False 435 | 436 | m = custom_mass(seqm, aa_mass=aa_mass, nterm_mass=nterm_mass, cterm_mass=cterm_mass) 437 | yield (seqm, aachange_pos, snp_label, m) 438 | 439 | 440 | def prot_gen(settings): 441 | db = settings.get('input', 'database') 442 | # add_decoy = settings.getboolean('input', 'add decoy') 443 | # prefix = settings.get('input', 'decoy prefix') 444 | 445 | with fasta.read(db) 
as f: 446 | for p in f: 447 | yield p 448 | 449 | 450 | def get_peptides(prot_seq, enzyme, mc, minlen, maxlen, semitryptic=False): 451 | peptides = cparser._cleave(prot_seq, enzyme, mc) 452 | for pep, startposition in peptides: 453 | plen = len(pep) 454 | if minlen <= plen <= maxlen: 455 | if not semitryptic: 456 | yield pep, startposition, plen 457 | else: 458 | for i in range(plen-minlen+1): 459 | yield pep[i:], startposition + i, plen - i 460 | for i in range(1, plen-minlen+1, 1): 461 | yield pep[:-i], startposition, plen - i 462 | 463 | 464 | seen_target = set() 465 | seen_decoy = set() 466 | def prot_peptides(prot_seq, enzyme, mc, minlen, maxlen, is_decoy, 467 | dont_use_seen_peptides=False, snp=False, desc=False, position=False, semitryptic=False, clip_M=True): 468 | 469 | dont_use_fast_valid = parser.fast_valid(prot_seq) 470 | methionine_check = (clip_M and prot_seq[0] == 'M') 471 | if snp == 2: 472 | if desc: 473 | try: 474 | tmp = desc.split(' ')[0].split('|') 475 | pos = int(tmp[1]) - 1 476 | aach = tmp[2] 477 | except: 478 | desc = False 479 | # peptides = cparser._cleave(prot_seq, enzyme, mc) 480 | # for pep, startposition in peptides: 481 | # plen = len(pep) 482 | for pep, startposition, plen in get_peptides(prot_seq, enzyme, mc, minlen, maxlen, semitryptic): 483 | loopcnt = 0 484 | if pep not in seen_target and pep not in seen_decoy and (dont_use_fast_valid or parser.fast_valid(pep)): 485 | loopcnt = 1 486 | if methionine_check and startposition == 0: 487 | if minlen <= plen - 2: 488 | loopcnt = 3 489 | elif minlen <= plen - 1: 490 | loopcnt = 2 491 | while loopcnt: 492 | f = pep[loopcnt-1:] 493 | if dont_use_seen_peptides: 494 | if snp == 1: 495 | for ff, seq_new in custom_snp(f, startposition): 496 | if not seq_new: 497 | yield ff if not position else (ff, startposition) 498 | else: 499 | yield ff if not position else (ff, startposition) 500 | else: 501 | yield f if not position else (f, startposition) 502 | else: 503 | if f not in seen_target and f 
not in seen_decoy: 504 | if is_decoy: 505 | seen_decoy.add(f) 506 | else: 507 | seen_target.add(f) 508 | if snp == 1: 509 | for ff, seq_new in custom_snp(f, startposition): 510 | if not seq_new: 511 | yield ff if not position else (ff, startposition) 512 | if seq_new not in seen_decoy and seq_new not in seen_target: 513 | yield ff if not position else (ff, startposition) 514 | elif snp == 2: 515 | if desc and startposition <= pos <= startposition + plen: 516 | if len(aach) == 3 and aach[0] in parser.std_amino_acids and aach[2] in parser.std_amino_acids: 517 | pos_diff = pos - startposition 518 | f = f[:pos_diff] + 'snp%sto%sat%ssnp' % (aach.split('>')[0], aach.split('>')[-1], pos) + f[pos_diff+1:] 519 | yield f if not position else (f, startposition) 520 | else: 521 | yield f if not position else (f, startposition) 522 | else: 523 | yield f if not position else (f, startposition) 524 | loopcnt -= 1 525 | 526 | 527 | def custom_snp(peptide, startposition): 528 | yield peptide, None 529 | j = len(peptide) - 1 530 | while j >= 0: 531 | for aa in parser.std_amino_acids: 532 | if aa != 'L' and aa != peptide[j] and not (aa == 'I' and peptide[j] == 'L'): 533 | aa_label = 'snp%sto%sat%ssnp' % (peptide[j], aa, str(j + startposition)) 534 | out = peptide[:j] + aa_label + peptide[j+1:], peptide[:j] + aa + peptide[j+1:] 535 | yield out 536 | j -= 1 537 | 538 | 539 | def normalize_mods(sequence, settings): 540 | leg = settings.get('misc', 'legend') 541 | if leg: 542 | for char in string.punctuation: 543 | if char in leg: 544 | if leg[char][2] == ']' and leg[char][1] == '-': 545 | sequence = sequence.replace(char, '-' + leg[char][0]) 546 | else: 547 | sequence = sequence.replace(char, ''.join(leg[char][:2])) 548 | return sequence 549 | 550 | 551 | def custom_isoforms(peptide, variable_mods, maxmods=2, nterm=False, cterm=False, snp=False): 552 | if not variable_mods: 553 | yield peptide 554 | else: 555 | to_char = variable_mods[-1][0] 556 | from_char = variable_mods[-1][1] 557 | 
term = variable_mods[-1][2] 558 | sites = [s[0] for s in enumerate(peptide) if (not snp or (s[0] - 4 < 0 or peptide[s[0]-4:s[0]-1] != 'snp')) and (from_char == '-' or s[1] == from_char) and (not term or (term == '[' and s[0] == 0) or (term == ']' and s[0] == len(peptide)-1))] 559 | for m in range(maxmods+1): 560 | for comb in combinations(sites, m): 561 | flag = 0 562 | flag2 = 0 563 | tmpnterm = True if nterm else False 564 | tmpcterm = True if cterm else False 565 | v = '' 566 | cc_prev = 0 567 | for cc in comb: 568 | if from_char == '-': 569 | if term == '[' and not nterm: 570 | flag2 = 1 571 | v += to_char 572 | tmpnterm = True 573 | elif term == ']' and not cterm: 574 | v = v + peptide[cc_prev:cc+1] + to_char 575 | tmpcterm = True 576 | else: 577 | flag = 1 578 | else: 579 | v = v + peptide[cc_prev:cc] + to_char 580 | if not flag2: 581 | cc_prev = cc + 1 582 | if not flag: 583 | v = v + peptide[cc_prev:] 584 | for z in custom_isoforms(v, variable_mods[:-1], maxmods=maxmods - m, nterm=tmpnterm, cterm=tmpcterm, snp=snp): 585 | yield z 586 | 587 | 588 | def remove_precursor(mz_prec, spectrum, acc): 589 | mz = spectrum['m/z array'] 590 | intens = spectrum['intensity array'] 591 | idx = np.full(mz.size, True) 592 | i_l = mz.searchsorted(mz_prec - acc)#mz.size-2 593 | i_r = mz.searchsorted(mz_prec + acc, side='right') 594 | for i in range(i_l, i_r, 1): 595 | idx[i] = False 596 | spectrum['m/z array'] = mz[idx] 597 | spectrum['intensity array'] = intens[idx] 598 | 599 | 600 | def deisotope(spectrum, acc, charge): 601 | # acc = 0.3 602 | mz = spectrum['m/z array'] 603 | intens = spectrum['intensity array'] 604 | 605 | h = 1.0057 606 | i = mz.size-2 607 | skip = set() 608 | add = [] 609 | while i >= 0: 610 | j = min(mz.size-1, mz.searchsorted(mz[i] + 1.5, side='right')) 611 | while j > i: 612 | if intens[i] > intens[j]: 613 | d = mz[j] - mz[i] 614 | if d > 1.5*h: 615 | j -= 1 616 | continue 617 | for z in range(1, charge+1): 618 | if abs(d - 1./z) < acc: 619 | 
skip.add(j) 620 | if z > 1: 621 | # skip.add(i) 622 | add.append((i, z)) 623 | j -= 1 624 | i -= 1 625 | ix = np.delete(np.arange(mz.size, dtype=int), list(skip)) 626 | newmz, newint = [], [] 627 | for i, z in add: 628 | newmz.append(mz[i]*z - (z-1)*h) 629 | newint.append(intens[i]) 630 | # print len(skip), len(add) 631 | mz = np.hstack((mz[ix], newmz)) 632 | intens = np.hstack((intens[ix], newint)) 633 | spectrum['m/z array'] = mz 634 | spectrum['intensity array'] = intens 635 | 636 | 637 | def preprocess_spectrum(spectrum, kwargs): 638 | spectrum = copy(spectrum) 639 | maxpeaks = kwargs['maxpeaks'] 640 | minpeaks = kwargs['minpeaks'] 641 | dynrange = kwargs['dynrange'] 642 | acc = kwargs['acc'] 643 | tags = kwargs['tags'] 644 | 645 | if 'm/z array' not in spectrum: 646 | return None 647 | 648 | _, states = get_expmass(spectrum, kwargs) 649 | if not states: 650 | return None 651 | 652 | if tags: 653 | # TODO optimize performance 654 | max_mass_label_val = max(tags.values()) + 1.0 655 | tmp_idx = np.nonzero(spectrum['m/z array'] <= max_mass_label_val) 656 | tags_res = defaultdict(float) 657 | for tmt_label, tmt_mass in tags.items(): 658 | for t_m, t_i in zip(spectrum['m/z array'][tmp_idx], spectrum['intensity array'][tmp_idx]): 659 | if abs(t_m - tmt_mass) / tmt_mass <= 1e-5: 660 | tags_res[tmt_label] += t_i 661 | for tmt_label, tmt_intensity in tags_res.items(): 662 | spectrum[tmt_label] = tmt_intensity 663 | 664 | if kwargs['deisotope']: 665 | dacc = kwargs['dacc'] 666 | deisotope(spectrum, dacc, states[-1]) 667 | 668 | mz_prec, _ = get_expmass(spectrum, kwargs) 669 | remove_precursor(mz_prec, spectrum, acc) 670 | 671 | mz = spectrum['m/z array'] 672 | 673 | idx = np.nonzero(mz >= kwargs['min_mz']) 674 | spectrum['intensity array'] = spectrum['intensity array'][idx] 675 | mz = mz[idx] 676 | spectrum['intensity array'] = spectrum['intensity array'].astype(np.float32) 677 | 678 | if minpeaks and spectrum['intensity array'].size < minpeaks: 679 | return None 680 | 
681 | spectrum['intensity array'] = spectrum['intensity array'].astype(np.float32) 682 | 683 | if dynrange: 684 | i = spectrum['intensity array'] > spectrum['intensity array'].max( 685 | ) / dynrange 686 | spectrum['intensity array'] = spectrum['intensity array'][i] 687 | mz = mz[i] 688 | 689 | if maxpeaks and minpeaks > maxpeaks: 690 | raise ValueError('minpeaks > maxpeaks: {} and {}'.format( 691 | minpeaks, maxpeaks)) 692 | if maxpeaks and spectrum['intensity array'].size > maxpeaks: 693 | i = np.argsort(spectrum['intensity array'])[-maxpeaks:] 694 | j = np.argsort(mz[i]) 695 | spectrum['intensity array'] = spectrum['intensity array'][i][j] 696 | mz = mz[i][j] 697 | 698 | spectrum['m/z array'] = mz 699 | 700 | if minpeaks and spectrum['intensity array'].size < minpeaks: 701 | return None 702 | 703 | spectrum['Isum'] = spectrum['intensity array'].sum() 704 | 705 | tmp2 = dict() 706 | tmp = spectrum['m/z array'] / acc 707 | tmp = tmp.astype(int) 708 | for idx, mt in enumerate(tmp): 709 | i_val = spectrum['intensity array'][idx] / spectrum['Isum'] 710 | for mz_val_int in (mt-1, mt, mt+1): 711 | if mz_val_int not in tmp2: 712 | tmp2[mz_val_int] = i_val 713 | else: 714 | tmp2[mz_val_int] = max(i_val, tmp2[mz_val_int]) 715 | tmp = np.concatenate((tmp, tmp-1, tmp+1)) 716 | spectrum['fastset'] = set(tmp.tolist()) 717 | spectrum['RT'] = get_RT(spectrum) 718 | spectrum['comp_voltage'] = get_comp_voltage(spectrum) 719 | spectrum['idict'] = tmp2 720 | 721 | spectrum['__KDTree'] = cKDTree(spectrum['m/z array'].reshape((spectrum['m/z array'].size, 1))) 722 | 723 | return spectrum 724 | 725 | 726 | def relative(unit): 727 | if unit == 'ppm': 728 | return True 729 | elif unit in {'Th', 'Da', 'amu'}: 730 | return False 731 | else: 732 | raise ValueError('Unrecognized precursor accuracy unit: ' + unit) 733 | 734 | 735 | def set_mod_dict(settings): 736 | mods = settings.get('modifications', 'variable') 737 | pmods = settings.get('modifications', 'protein variable') 738 | 739 | 
settings.set('modifications', 'variable_original', mods) 740 | settings.set('modifications', 'protein_original', pmods) 741 | i = None 742 | if isinstance(mods, basestring): 743 | mods = mods.strip(' ,') 744 | mod_dict = {} 745 | legend = {} 746 | 747 | if mods: 748 | mods = [custom_split_label(l) for l in re.split(r',\s*', mods)] 749 | mods.sort(key=lambda x: len(x[0]), reverse=True) 750 | for i, (mod, char) in enumerate(zip(mods, string.punctuation), 1): 751 | legend[''.join(mod)] = char 752 | legend[char] = mod 753 | assert all(len(m) == 3 for m in mods), 'unmodified residue given' 754 | for mod, aa, term in mods: 755 | mod_dict.setdefault(mod, []).append(aa) 756 | settings.set('modifications', 'variable', mod_dict) 757 | logger.info('Setting legend: %s', legend) 758 | settings.set('misc', 'legend', legend) 759 | 760 | if isinstance(pmods, basestring): 761 | plegend = {} 762 | pmod_dict = {} 763 | if pmods: 764 | pmods = [custom_split_label(l) for l in re.split(r',\s*', pmods)] 765 | pmods.sort(key=lambda x: len(x[0]), reverse=True) 766 | for mod, char in zip(pmods, string.punctuation[i:]): 767 | plegend[''.join(mod)] = char 768 | plegend[char] = mod 769 | assert all(len(m) == 3 for m in pmods), 'unmodified residue given' 770 | for mod, aa, term in pmods: 771 | pmod_dict.setdefault(mod, []).append(aa) 772 | settings.set('modifications', 'protein variable', pmod_dict) 773 | mod_dict.update(pmod_dict) 774 | settings.set('modifications', 'variable', mod_dict) 775 | settings.set('misc', 'plegend', plegend) 776 | logger.info('Setting plegend: %s', plegend) 777 | 778 | 779 | def get_enzyme(enzyme): 780 | if enzyme in parser.expasy_rules: 781 | return parser.expasy_rules[enzyme] 782 | else: 783 | try: 784 | enzyme = convert_tandem_cleave_rule_to_regexp(enzyme) 785 | return enzyme 786 | except Exception as e: 787 | logger.debug('Exception parsing cleavage rule %s: %s', enzyme, e.args[0]) 788 | return enzyme 789 | 790 | 791 | def 
convert_tandem_cleave_rule_to_regexp(cleavage_rule): 792 | 793 | def get_sense(c_term_rule, n_term_rule): 794 | if '{' in c_term_rule: 795 | return 'N' 796 | elif '{' in n_term_rule: 797 | return 'C' 798 | else: 799 | if len(c_term_rule) <= len(n_term_rule): 800 | return 'C' 801 | else: 802 | return 'N' 803 | 804 | def get_cut(cut, no_cut): 805 | aminoacids = set(parser.std_amino_acids) 806 | cut = ''.join(aminoacids & set(cut)) 807 | if '{' in no_cut: 808 | no_cut = ''.join(aminoacids & set(no_cut)) 809 | return cut, no_cut 810 | else: 811 | no_cut = ''.join(set(parser.std_amino_acids) - set(no_cut)) 812 | return cut, no_cut 813 | 814 | out_rules = [] 815 | for protease in cleavage_rule.split(','): 816 | protease = protease.replace('X', ''.join(parser.std_amino_acids)) 817 | c_term_rule, n_term_rule = protease.split('|') 818 | sense = get_sense(c_term_rule, n_term_rule) 819 | if sense == 'C': 820 | cut, no_cut = get_cut(c_term_rule, n_term_rule) 821 | else: 822 | cut, no_cut = get_cut(n_term_rule, c_term_rule) 823 | 824 | if no_cut: 825 | if sense == 'C': 826 | out_rules.append('([%s](?=[^%s]))' % (cut, no_cut)) 827 | else: 828 | out_rules.append('([^%s](?=[%s]))' % (no_cut, cut)) 829 | else: 830 | if sense == 'C': 831 | out_rules.append('([%s])' % (cut, )) 832 | else: 833 | out_rules.append('(?=[%s])' % (cut, )) 834 | return '|'.join(out_rules) 835 | 836 | 837 | class CustomRawConfigParser(RawConfigParser, object): 838 | def get(self, section, option, **kwargs): 839 | val = super(CustomRawConfigParser, self).get(section, option) 840 | if isinstance(val, basestring): 841 | if section == 'search' and option == 'enzyme': 842 | return val.split('|class')[0] 843 | return val[::-1].split('|', 1)[-1][::-1] 844 | return val 845 | 846 | def get_choices(self, section, option): 847 | val = super(CustomRawConfigParser, self).get(section, option) 848 | if isinstance(val, basestring) and len(val.split('|')) > 1: 849 | return val[::-1].split('|', 1)[0][::-1] 850 | else: 851 | 
return '' 852 | 853 | def copy(self): 854 | new_config = CustomRawConfigParser() 855 | for section in self.sections(): 856 | new_config.add_section(section) 857 | for name, value in self.items(section): 858 | new_config.set(section, name, value) 859 | return new_config 860 | 861 | 862 | def find_nearest(array, value): 863 | return (np.abs(np.array(array) - value)).argmin() 864 | 865 | 866 | def _charge_params(settings): 867 | params = {} 868 | params['maxcharge'] = settings.getint('search', 'maximum charge') or None 869 | params['mincharge'] = settings.getint('search', 'minimum charge') or None 870 | if settings.has_option('search', 'minimum unknown charge') and settings.getint('search', 'minimum unknown charge'): 871 | params['min_ucharge'] = max(settings.getint('search', 'minimum unknown charge'), params['mincharge']) 872 | else: 873 | params['min_ucharge'] = params['mincharge'] 874 | if settings.has_option('search', 'maximum unknown charge') and settings.getint('search', 'maximum unknown charge'): 875 | params['max_ucharge'] = min(settings.getint('search', 'maximum unknown charge'), params['maxcharge']) 876 | else: 877 | params['max_ucharge'] = params['maxcharge'] 878 | return params 879 | 880 | 881 | def get_info(spectrum, result, settings, aa_mass=None): 882 | 'Returns neutral mass, charge state and retention time of the top candidate' 883 | if not aa_mass: 884 | aa_mass = get_aa_mass(settings) 885 | RT = spectrum['RT']#get_RT(spectrum) 886 | comp_voltage = spectrum['comp_voltage'] 887 | 888 | params = _charge_params(settings) 889 | 890 | masses, states = zip(*neutral_masses(spectrum, params)) 891 | # idx = find_nearest(masses, cmass.fast_mass(str(result['candidates'][0][1]), aa_mass=aa_mass)) 892 | 893 | 894 | nterm_mass = settings.getfloat('modifications', 'protein nterm cleavage') 895 | cterm_mass = settings.getfloat('modifications', 'protein cterm cleavage') 896 | 897 | idx = find_nearest(masses, custom_mass(str(result['candidates'][0][1]), 
aa_mass=aa_mass, nterm_mass=nterm_mass, cterm_mass=cterm_mass))
    return (masses[idx], states[idx], RT, comp_voltage)


def reshape_theor_spectrum(peaks):
    # Reshape each theoretical ion m/z list to a column vector (n, 1) so
    # it can be broadcast against rows of experimental m/z values.
    for k in peaks.keys():
        marr = np.array(peaks[k])
        n = marr.size
        peaks[k] = marr.reshape((n, 1))
    return peaks


# Per-series neutral-mass shift subtracted when deriving the largest
# fragment of each ion series from the peptide neutral mass (see
# calc_ions_from_neutral_mass below).
# NOTE(review): values presumably encode the standard a/b/c and x/y/z
# series offsets -- confirm against the fragment-mass definitions in use.
ion_shift_dict = {
    'a': 46.00547930326002,
    'b': 18.010564683699954,
    'c': 0.984015582689949,
    'x': -25.979264555419945,
    'y': 0.0,
    'z': 17.026549101010005,
}


def calc_ions_from_neutral_mass(peptide, nm, ion_type, charge, aa_mass, cterm_mass, nterm_mass):
    """Return the m/z of the largest fragment of `ion_type` at `charge`.

    Starting from the peptide neutral mass `nm`, subtract the residue that
    the largest fragment lacks (the last residue for N-terminal a/b/c
    series, the first for C-terminal series), the series shift, and the
    excess of the opposite terminal group; then protonate.
    """
    if ion_type in 'abc':
        nmi = nm - aa_mass[peptide[-1]] - ion_shift_dict[ion_type] - (cterm_mass - 17.002735)
    else:
        nmi = nm - aa_mass[peptide[0]] - ion_shift_dict[ion_type] - (nterm_mass - 1.007825)
    # 1.0072764667700085 is the proton mass.
    return (nmi + 1.0072764667700085 * charge) / charge


def check_n_term(ion_type):
    # True for N-terminal ion series; only the first character is checked,
    # so variants like 'b2' would also qualify.
    return (ion_type[0] == 'b' or ion_type[0] == 'a' or ion_type[0] == 'c')


def get_n_ions(peptide, maxmass, pl, charge, k_aa_mass):
    # Build the m/z ladder of an N-terminal series: start from the largest
    # fragment and successively drop residues from the C-terminal end.
    tmp = [maxmass, ]
    for i in range(1, pl):
        tmp.append(tmp[-1] - k_aa_mass[peptide[-i-1]]/charge)
    return tmp


def get_c_ions(peptide, maxmass, pl, charge, k_aa_mass):
    # Build the m/z ladder of a C-terminal series: start from the largest
    # fragment and successively drop residues from the N-terminal end.
    tmp = [maxmass, ]
    for i in range(pl-2, -1, -1):
        tmp.append(tmp[-1] - k_aa_mass[peptide[-(i+2)]]/charge)
    return tmp


def theor_spectrum(peptide, acc_frag, nterm_mass, cterm_mass, types=('b', 'y'), maxcharge=None, reshape=False, **kwargs):
    """Build the theoretical fragment spectrum of `peptide`.

    Returns (peaks, theoretical_set): `peaks` maps (ion_type, charge) to a
    list (or (n, 1) array if `reshape`) of fragment m/z values;
    `theoretical_set` maps ion_type to the m/z values integer-binned by
    `acc_frag` for fast set-based matching. The peptide neutral mass may
    be supplied precomputed via kwargs['nm']; `maxcharge` defaults to one
    above the predicted charge at pH 2.
    """
    peaks = {}
    theoretical_set = dict()
    if 'nm' in kwargs:
        nm = kwargs['nm']
    else:
        nm = custom_mass(peptide, aa_mass=kwargs['aa_mass'], nterm_mass = nterm_mass, cterm_mass = cterm_mass)
    pl = len(peptide) - 1
    if not maxcharge:
        maxcharge = 1 + int(ec.charge(peptide, pH=2))
    for charge in range(1, maxcharge + 1):
        for ion_type in
types: 957 | nterminal = check_n_term(ion_type) 958 | if nterminal: 959 | maxmass = calc_ions_from_neutral_mass(peptide, nm, ion_type=ion_type, charge=charge, 960 | aa_mass=kwargs['aa_mass'], cterm_mass=cterm_mass, nterm_mass=nterm_mass) 961 | marr = get_n_ions(peptide, maxmass, pl, charge, kwargs['aa_mass']) 962 | else: 963 | maxmass = calc_ions_from_neutral_mass(peptide, nm, ion_type=ion_type, charge=charge, 964 | aa_mass=kwargs['aa_mass'], cterm_mass=cterm_mass, nterm_mass=nterm_mass) 965 | marr = get_c_ions(peptide, maxmass, pl, charge, kwargs['aa_mass']) 966 | 967 | tmp = [int(x / acc_frag) for x in marr] 968 | if ion_type in theoretical_set: 969 | theoretical_set[ion_type].extend(tmp) 970 | else: 971 | theoretical_set[ion_type] = tmp 972 | 973 | if reshape: 974 | marr = np.array(marr) 975 | n = marr.size 976 | marr = marr.reshape((n, 1)) 977 | peaks[ion_type, charge] = marr 978 | return peaks, theoretical_set 979 | 980 | 981 | def get_expmass(spectrum, kwargs): 982 | maxcharge = kwargs['maxcharge'] or None 983 | mincharge = kwargs['mincharge'] or None 984 | min_ucharge = kwargs['min_ucharge'] 985 | max_ucharge = kwargs['max_ucharge'] 986 | 987 | if 'params' in spectrum: 988 | exp_mass = spectrum['params']['pepmass'][0] 989 | charge = spectrum['params'].get('charge') 990 | else: 991 | ion = spectrum['precursorList']['precursor'][ 992 | 0]['selectedIonList']['selectedIon'][0] 993 | charge = ion.get('charge state') 994 | if charge is not None: charge = [int(charge)] 995 | exp_mass = ion['selected ion m/z'] 996 | 997 | if isinstance(charge, str): 998 | states = [s for s in aux._parse_charge(charge, True) 999 | if (mincharge is None or s >= mincharge) and (maxcharge is None or s <= maxcharge)] 1000 | elif charge is None: 1001 | states = list(range(min_ucharge, 1 + max_ucharge)) 1002 | else: 1003 | states = [c for c in charge if 1004 | (mincharge is None or c >= mincharge) and (maxcharge is None or c <= maxcharge)] 1005 | states.sort() 1006 | return exp_mass, 
states 1007 | 1008 | 1009 | def neutral_masses(spectrum, params): 1010 | exp_mass, states = get_expmass(spectrum, params) 1011 | return zip((c * (exp_mass - mass.nist_mass['H+'][0][0]) 1012 | for c in states), states) 1013 | 1014 | 1015 | @aux.memoize(10) 1016 | def import_(name): 1017 | """Import a function by name: module.function or 1018 | module.submodule.function, etc. By default trying to find 1019 | function name in identipy.scoring module. 1020 | Return the function object.""" 1021 | 1022 | try: 1023 | mod, f = name.rsplit('.', 1) 1024 | return getattr(__import__(mod, fromlist=[f]), f) 1025 | except Exception as e: 1026 | logger.error('%s', e) 1027 | return getattr(__import__('identipy.scoring', fromlist=[name]), name) 1028 | 1029 | 1030 | def get_aa_mass(settings): 1031 | if settings.has_option('misc', 'aa_mass'): 1032 | return settings.get('misc', 'aa_mass') 1033 | aa_mass = mass.std_aa_mass.copy() 1034 | aa_mass['-'] = 0.0 1035 | for k, v in settings.items('modifications'): 1036 | if k not in {'fixed', 'variable', 'variable_original', 'protein variable', 'protein_original'}: 1037 | aa_mass[k] = float(v) 1038 | fmods = settings.get('modifications', 'fixed') 1039 | if fmods: 1040 | for mod in re.split(r'[,;]\s*', fmods): 1041 | if '-' not in mod: 1042 | m, aa = parser._split_label(mod) 1043 | aa_mass[aa] += settings.getfloat('modifications', m) 1044 | vmods = settings.get('modifications', 'variable') 1045 | if vmods: 1046 | leg = settings.get('misc', 'legend') 1047 | for p in string.punctuation: 1048 | if p in leg: 1049 | mod, aa, term = leg[p] 1050 | if term == ']' and aa == '-': 1051 | aa_mass[p] = aa_mass[mod] + aa_mass[aa] 1052 | aa_mass[aa+mod] = aa_mass[mod] + aa_mass[aa] 1053 | else: 1054 | aa_mass[p] = aa_mass[mod] + aa_mass[aa] 1055 | aa_mass[mod+aa] = aa_mass[mod] + aa_mass[aa] 1056 | pmods = settings.get('modifications', 'protein variable') 1057 | if pmods: 1058 | leg = settings.get('misc', 'plegend') 1059 | for p in string.punctuation: 1060 | 
def multimap(n, func, it, global_data, best_res_in=False, best_res_raw_in=False, best_peptides=False, **kw):
    """Run *func* over every candidate in *it*, keeping the best hit per spectrum.

    With n == 1 everything runs in the current process. Otherwise candidates
    are sorted by precursor mass, partitioned by the mass range covered by each
    entry of *global_data*, and dispatched to n worker processes that report
    their improvements back through a Queue.

    Scores are stored negated so that "better" always compares with <=.
    Returns (best_res_raw, best_res): per-spectrum best-hit details and the
    negated best score per spectrum.
    """
    global best_res

    rel = kw['rel']
    nterm_mass = kw.get('nterm_mass')
    cterm_mass = kw.get('cterm_mass')
    acc_l = kw['acc_l']
    acc_r = kw['acc_r']

    shifts_and_pime = kw['sapime']

    if best_res_in:
        best_res = deepcopy(best_res_in)
        best_res_raw = deepcopy(best_res_raw_in)
    else:
        best_res = {}
        best_res_raw = {}
    # BUGFIX: best_res_pep was previously initialized only in the else-branch
    # above, so calling multimap with best_res_in and n > 1 raised NameError
    # at the logger.debug(len(best_res_pep)) calls below. Initialize it always.
    best_res_pep = {}

    if n == 1:
        cnt1 = 0
        for s in it:
            cnt1 += 1
            if cnt1 % 10000 == 0:
                logger.debug(cnt1)
            result = func(s, best_res, global_data[0], **kw)
            if result:
                for x in result:
                    peptide, m, snp_label, res = x

                    for score, spec_t, c, info in res:
                        if -score <= best_res.get(spec_t, 0):
                            best_res_raw[spec_t] = [peptide, m, snp_label, score, spec_t, c, info]
                            best_res[spec_t] = -score
        return best_res_raw, best_res

    else:

        def worker(qout, start, end, global_data_local):
            # Scan the [start, end) slice of the shared, mass-sorted candidate
            # list and send per-spectrum improvements back, terminated by None.
            new_best_res = {}
            new_best_res_raw = {}

            while start < end:
                item = qin[start]
                result = func(item, best_res, global_data_local, **kw)

                if result:
                    for x in result:
                        peptide, m, snp_label, res = x

                        for score, spec_t, c, info in res:
                            if -score <= new_best_res.get(spec_t, best_res.get(spec_t, 0)):
                                new_best_res[spec_t] = -score
                                best_res[spec_t] = -score
                                new_best_res_raw[spec_t] = [peptide, m, snp_label, score, spec_t, c, info]
                start += 1
            qout.put(new_best_res_raw)
            qout.put(None)

        qsize = kw.pop('qsize')
        qout = Queue(qsize)
        count = 0

        global qin

        while True:
            # Process the candidate stream in batches to bound memory.
            qint = list(islice(it, 5000000))
            if not len(qint):
                break

            qin = []
            for seqm, aachange_pos, snp_label, m in qint:
                qin.append((seqm, aachange_pos, snp_label, m))
            qin = sorted(qin, key=lambda x: x[3])
            qin_masses = np.array([z[3] for z in qin])

            procs = []
            for proc_num in range(n):
                # Each worker only sees candidates whose mass can match one of
                # its spectra, given the accuracy window and allowed shifts.
                min_mass = min(global_data[proc_num]['nmasses'])
                max_mass = max(global_data[proc_num]['nmasses'])
                if rel:
                    dm_l = acc_l * max_mass / 1.0e6
                    dm_r = acc_r * max_mass / 1.0e6
                else:
                    dm_l = acc_l
                    dm_r = acc_r
                dm_l -= min(shifts_and_pime)
                dm_r += max(shifts_and_pime)
                start = qin_masses.searchsorted(min_mass + dm_l)
                end = qin_masses.searchsorted(max_mass + dm_r, side='right')

                p = Process(target=worker, args=(qout, start, end, global_data[proc_num]))
                p.start()
                procs.append(p)

            count = len(qin)

            # Drain one None-terminated stream per worker, merging improvements.
            for _ in range(n):
                logger.debug('%s %s', _, len(best_res_pep))
                for item in iter(qout.get, None):
                    for k, v in item.items():
                        if -v[3] <= best_res.get(k, 0):
                            best_res_raw[k] = v
                            best_res[k] = -v[3]
                            best_res_pep[k] = v[0]
                logger.debug('%s %s', _, len(best_res_pep))

            logger.debug('HERE1')

            for p in procs:
                p.join()

            logger.debug('HERE2')

        logger.info(len(best_res_pep))
        return best_res_raw, best_res


def allow_all(*args):
    """Predicate that accepts any spectrum (default filter)."""
    return True


def get_RT(spectrum):
    """Return scan retention time in seconds"""
    # MGF: try RTINSECONDS, then two known TITLE conventions.
    # NOTE: the previous bare 'except:' clauses also swallowed
    # SystemExit/KeyboardInterrupt; narrowed to Exception.
    if 'params' in spectrum:
        try:
            return float(spectrum['params']['rtinseconds'])
        except Exception:
            try:
                return float(spectrum['params']['title'].split(',')[-1].strip().split()[0])
            except Exception:
                try:
                    # '[E]lution from: X to Y period: ...' -> mean of X, Y in minutes.
                    return 60 * np.average([float(x) for x in spectrum['params']['title'].split('lution from: ')[-1].split(' period:')[0].split(' to ')])
                except Exception:
                    return 0
    # mzML: 'scan start time' may carry a unit attribute (minutes vs seconds).
    try:
        rt = spectrum['scanList']['scan'][0]['scan start time']
        try:
            if rt.unit_info == 'second':
                return float(rt)
            else:
                return float(rt * 60)
        except AttributeError:
            return float(rt)
    except KeyError:
        return 0
def get_comp_voltage(spectrum):
    """Return the FAIMS compensation voltage of a scan (0 when unavailable)."""
    if 'params' in spectrum:  # MGF
        try:
            return float(spectrum['params']['FAIMS compensation voltage'])
        except:
            return 0
    try:  # mzML
        return spectrum['FAIMS compensation voltage']
    except:
        return 0


def get_title(spectrum):
    """Return the spectrum identifier: MGF TITLE or mzML id."""
    return spectrum['params']['title'] if 'params' in spectrum else spectrum['id']


def get_precursor_mz(spectrum):
    """Return the precursor m/z from an MGF ('pepmass') or mzML spectrum."""
    try:
        return spectrum['params']['pepmass'][0]
    except:
        ion = spectrum['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]
        return ion['selected ion m/z']


def is_db_target_only(settings):
    """Return True when the FASTA database is not balanced between target
    and decoy entries (i.e. decoys must be generated on the fly)."""
    db = settings.get('input', 'database')
    isdecoy = is_decoy_function(settings)
    targets = 0
    decoys = 0
    for description, _ in fasta.read(db):
        if isdecoy(description):
            decoys += 1
        else:
            targets += 1
    return bool(targets - decoys)


def get_shifts_and_pime(settings):
    """Return all allowed precursor mass shifts, expanded with every
    precursor-isotope mass-error offset (multiples of the C13-C12 delta)."""
    pime = settings.getint('search', 'precursor isotope mass error')
    shifts = [float(x) for x in settings.get('search', 'shifts').split(',')]
    dM = mass.nist_mass['C'][13][0] - mass.nist_mass['C'][12][0]
    expanded = list(shifts)
    for k in range(1, pime + 1):
        expanded.extend(s + k * dM for s in shifts)
    return expanded
def build_pept_prot(settings, results):
    """Map identified peptide sequences back to database proteins.

    Re-digests the whole FASTA database and, for every peptide that occurs in
    *results*, records which proteins contain it, its flanking residues, and
    its number of tryptic termini.

    Returns (pept_prot, prots, pept_neighbors, pept_ntts):
      pept_prot      -- {sequence: [protein dbinfo, ...]}
      prots          -- {dbinfo: full FASTA description}
      pept_neighbors -- {sequence: {dbinfo: (prev_aa, next_aa)}}
      pept_ntts      -- {sequence: {dbinfo: number of tryptic termini}}
    """
    mc = settings.getint('search', 'number of missed cleavages')
    minlen = settings.getint('search', 'peptide minimum length')
    maxlen = settings.getint('search', 'peptide maximum length')
    isdecoy = is_decoy_function(settings)
    clip_M = settings.getboolean('search', 'clip N-terminal methionine')

    snp = settings.getint('search', 'snp')
    pept_prot = {}
    prots = {}
    peptides = set()
    pept_neighbors = {}
    pept_ntts = {}
    enzyme = settings.get('search', 'enzyme')
    semitryptic = settings.getint('search', 'semitryptic')
    # Collect the bare (modification-stripped) sequences seen in the results.
    # NOTE(review): 'range(1 or len(...))' always evaluates to range(1), so only
    # the top candidate of each spectrum is collected — looks intentional
    # (top-hit reporting) but worth confirming.
    for x in results:
        peptides.update(re.sub(r'[^A-Z]', '', normalize_mods(x['candidates'][i][1], settings)) for i in range(
            1 or len(x['candidates'])))
    seen_target.clear()
    seen_decoy.clear()
    enzyme_rule = get_enzyme(enzyme)
    for desc, prot in prot_gen(settings):
        dbinfo = desc.split(' ')[0]
        prots[dbinfo] = desc
        if semitryptic:
            # All enzymatic cleavage positions, plus both protein termini
            # (and position 1 to allow for clipped N-terminal methionine).
            cl_positions = set(z for z in it.chain([x.end() for x in re.finditer(enzyme_rule, prot)],
                [0, 1, len(prot)]))
        for pep, startposition in prot_peptides(prot, enzyme_rule, mc, minlen, maxlen, isdecoy(desc),
                dont_use_seen_peptides=True, snp=snp, desc=desc, position=True, semitryptic=semitryptic, clip_M=clip_M):
            if snp:
                # SNP-encoded peptides look like '<prefix>snp<X>to<Y>at<pos>snp<suffix>';
                # reconstruct the variant sequence before lookup.
                if 'snp' not in pep:
                    seqm = pep
                else:
                    tmp = pep.split('snp')
                    seqm = tmp[0] + tmp[1].split('at')[0].split('to')[-1] + tmp[2]
            else:
                seqm = pep

            if seqm in peptides:
                pept_ntts.setdefault(seqm, {})
                pept_neighbors.setdefault(seqm, {})
                # Flanking residues, '-' at protein termini.
                pept_neighbors[seqm][dbinfo] = (prot[startposition - 1] if startposition != 0 else '-',
                    prot[startposition + len(seqm)] if startposition + len(seqm) < len(prot) else '-')

                if not semitryptic:
                    pept_prot.setdefault(seqm, []).append(dbinfo)
                    pept_ntts[seqm][dbinfo] = 2
                else:
                    # Count how many of the two peptide termini fall on cleavage sites.
                    ntt = (startposition in cl_positions) + ((startposition + len(seqm)) in cl_positions)
                    pept_ntts[seqm][dbinfo] = ntt
                    pept_prot.setdefault(seqm, []).append(dbinfo)

    return pept_prot, prots, pept_neighbors, pept_ntts
def get_outpath(inputfile, settings, suffix):
    """Return the output file path: configured output dir + input basename + suffix."""
    outpath = settings.get('output', 'path')
    filename = os.path.join(outpath, os.path.splitext(os.path.basename(inputfile))[0] + os.path.extsep + suffix)
    return filename


def write_pepxml(inputfile, settings, results):
    """Write accumulated search results to a pepXML file.

    Builds the full pepXML tree (run summary, enzyme, search summary with
    modifications, then one spectrum_query/search_hit per identified spectrum)
    and writes it next to the other outputs as <basename>.pep.xml.
    """
    outpath = settings.get('output', 'path')
    logger.debug('Output path: %s', outpath)

    set_mod_dict(settings)

    enzyme = settings.get('search', 'enzyme')
    search_engine = 'IdentiPy'
    database = settings.get('input', 'database')
    missed_cleavages = settings.getint('search', 'number of missed cleavages')
    fmods = settings.get('modifications', 'fixed')
    snp = settings.getint('search', 'snp')
    nterm_mass = settings.getfloat('modifications', 'protein nterm cleavage')
    cterm_mass = settings.getfloat('modifications', 'protein cterm cleavage')
    tags = get_tags(settings.get('output', 'tags'))

    # Fixed terminal modifications get reported as mod_nterm/cterm_mass.
    nterm_fixed = 0
    cterm_fixed = 0

    for mod in re.split(r'[,;]\s*', fmods):
        if mod.startswith('-'):
            cterm_fixed = settings.getfloat('modifications', 'protein cterm cleavage')
        elif mod.endswith('-'):
            nterm_fixed = settings.getfloat('modifications', 'protein nterm cleavage')

    filename = get_outpath(inputfile, settings, 'pep.xml')
    with open(filename, 'wb') as output:
        logger.info('Writing %s ...', filename)
        # XML declaration / stylesheet header written before the serialized tree.
        line1 = b'\n\
\n'
        output.write(line1)

        base_name, ftype = path.splitext(inputfile)
        ftype = ftype.lower()

        root = etree.Element('msms_pipeline_analysis')
        root.set("date", strftime("%Y:%m:%d:%H:%M:%S"))
        root.set("summary_xml", '')
        root.set("xmlns", 'http://regis-web.systemsbiology.net/pepXML')
        # TODO
        #root.set("xmlns:xsi", 'http://www.w3.org/2001/XMLSchema-instance')
        #root.set("xsi:schemaLocation", 'http://sashimi.sourceforge.net/schema_revision/pepXML/pepXML_v117.xsd')

        child1 = etree.Element('msms_run_summary')
        child1.set("base_name", base_name)
        child1.set("search_engine", search_engine)
        child1.set("raw_data_type", "raw")  # ?

        if ftype == '.mgf':
            child1.set("raw_data", ".mgf")
        elif ftype == '.mzml':
            child1.set("raw_data", ".mzML")
        else:
            child1.set("raw_data", ".?")
        root.append(child1)

        child2 = etree.Element('sample_enzyme')
        child2.set('name', enzyme)
        child1.append(child2)

        child3 = etree.Element('specificity')
        child3.set("cut", "KR")
        child3.set("no_cut", "P")
        child3.set("sense", "C")

        child2.append(child3)

        child4 = etree.Element('search_summary')
        child4.set('base_name', base_name)
        child4.set('search_engine', search_engine)
        child4.set("search_engine_version", get_version())
        child4.set('precursor_mass_type', 'monoisotopic')
        child4.set('fragment_mass_type', 'monoisotopic')
        child4.set('search_id', '1')

        # Declare fixed, variable and protein-terminal modifications.
        for child_mod in get_child_for_mods(settings.get('modifications', 'fixed'), settings, fixed=True):
            child4.append(child_mod)
        for child_mod in get_child_for_mods(settings.get('modifications', 'variable_original'), settings, fixed=False):
            child4.append(child_mod)
        for child_mod in get_child_for_mods(settings.get('modifications', 'protein_original'), settings, fixed=False, protein=True):
            child4.append(child_mod)

        child1.append(child4)

        child5 = etree.Element('search_database')
        child5.set('local_path', database)
        child5.set('type', 'AA')

        child4.append(copy(child5))

        child5 = etree.Element('enzymatic_search_constraint')
        child5.set('enzyme', enzyme)
        child5.set('max_num_internal_cleavages', str(missed_cleavages))
        child5.set('min_number_termini', '2')

        child4.append(copy(child5))

        results = [x for x in results if x['candidates'].size]
        # results = list(get_output(results, settings))
        logger.info('Accumulated results: %s', len(results))
        pept_prot, prots, pept_neighbors, pept_ntts = build_pept_prot(settings, results)
        if settings.has_option('misc', 'aa_mass'):
            aa_mass = settings.get('misc', 'aa_mass')
        else:
            aa_mass = get_aa_mass(settings)
        # All 'mod+aa' / 'aa+mod' spellings of variable modifications.
        vmods = set()
        variablemods = settings.get('modifications', 'variable')
        if variablemods:
            for k, v in variablemods.items():
                for aa in v:
                    vmods.add(k + aa)
                    vmods.add(aa + k)

        leg = {}
        if settings.has_option('misc', 'legend'):
            leg = settings.get('misc', 'legend')
        if settings.has_option('misc', 'plegend'):
            leg.update(settings.get('misc', 'plegend'))

        ntermcleavage = settings.getfloat('modifications', 'protein nterm cleavage')
        ctermcleavage = settings.getfloat('modifications', 'protein cterm cleavage')

        for idx, result in enumerate(results):
            if result['candidates'].size:
                tmp = etree.Element('spectrum_query')
                spectrum = result['spectrum']
                tmp.set('spectrum', get_title(spectrum))
                tmp.set('spectrumNativeID', get_title(spectrum))
                tmp.set('start_scan', str(idx))  # ???
                tmp.set('end_scan', str(idx))  # ???
                tmp.set('index', str(idx))  # ???

                neutral_mass, charge_state, RT, comp_voltage = get_info(spectrum, result, settings, aa_mass)
                tmp.set('precursor_neutral_mass', str(neutral_mass))
                tmp.set('assumed_charge', str(int(charge_state)))
                if RT:
                    tmp.set('retention_time_sec', str(RT))
                if comp_voltage:
                    tmp.set('compensation_voltage', str(comp_voltage))

                tmp2 = etree.Element('search_result')
                result['candidates'] = result['candidates'][:len(result['e-values'])]

                flag = 1
                for i, candidate in enumerate(result['candidates']):
                    match = candidate[4]['match']
                    if match is None:
                        break
                    tmp3 = etree.Element('search_hit')
                    tmp3.set('hit_rank', str(i + 1))
                    mod_sequence = normalize_mods(str(candidate[1]), settings)
                    sequence = re.sub(r'[^A-Z]', '', mod_sequence)
                    if sequence not in pept_prot:
                        # Should not happen: every reported peptide must have been
                        # mapped to a protein by build_pept_prot.
                        flag = 0
                        logger.error('Unaccounted sequence! %s (%s)', sequence, mod_sequence)
                        break
                    else:
                        tmp3.set('peptide', sequence)

                        proteins = pept_prot[re.sub(r'[^A-Z]', '', sequence)]

                        tmp3.set('protein', prots[proteins[0]].split(' ', 1)[0] + (('_' + candidate[7]) if snp else ''))
                        try:
                            protein_descr = prots[proteins[0]].split(' ', 1)[1]
                        except:
                            protein_descr = ''

                        neighbors = pept_neighbors.get(sequence, {}).get(proteins[0], ('-', '-'))

                        tmp3.set('peptide_prev_aa', neighbors[0])
                        tmp3.set('peptide_next_aa', neighbors[1])
                        tmp3.set('protein_descr', protein_descr)

                        num_tot_proteins = len(proteins)
                        tmp3.set('num_tot_proteins', str(num_tot_proteins))
                        tmp3.set('num_matched_ions', str(sum(v.sum() for v in match.values())))
                        tmp3.set('tot_num_ions', str((len(sequence) - 1) * 2))
                        neutral_mass_theor = custom_mass(str(candidate[1]), aa_mass=aa_mass, nterm_mass=nterm_mass, cterm_mass=cterm_mass)
                        # neutral_mass_theor = cmass.fast_mass(sequence, aa_mass=aa_mass)
                        tmp3.set('calc_neutral_pep_mass', str(neutral_mass_theor))
                        tmp3.set('massdiff', str(candidate[4]['mzdiff']['Da']))
                        tmp3.set('num_tol_term', str(pept_ntts.get(sequence, {}).get(proteins[0], '?')))
                        tmp3.set('num_missed_cleavages', str(parser.num_sites(sequence, get_enzyme(enzyme))))
                        tmp3.set('is_rejected', '0')  # ???

                        if num_tot_proteins > 1 and (not snp or 'wild' not in prots[proteins[0]].split(' ', 1)[0]):
                            for prot in proteins[1:]:
                                tmp4 = etree.Element('alternative_protein')
                                tmp4.set('protein', prots[prot].split(' ', 1)[0] + (('_' + candidate[7]) if snp else ''))
                                try:
                                    protein_descr = prots[prot].split(' ', 1)[1]
                                except:
                                    protein_descr = ''
                                tmp4.set('protein_descr', protein_descr)
                                neighbors = pept_neighbors.get(sequence, {}).get(prot, ('-', '-'))
                                tmp4.set('peptide_prev_aa', neighbors[0])
                                tmp4.set('peptide_next_aa', neighbors[1])
                                tmp4.set('num_tol_term', str(pept_ntts.get(sequence, {}).get(prot, '?')))
                                tmp3.append(copy(tmp4))

                        labels = parser.std_labels + [la[:-1] if la[-1] == '[' else '-' + la[:-2] if la[-1] == ']' else la for la in leg if len(la) > 1]
                        # logger.debug('Known labels: %s', labels)
                        try:
                            aalist = parser.parse(mod_sequence, labels=labels)
                        except Exception as e:
                            # Fallback: parse the reversed sequence, then restore order.
                            logger.debug('Problematic sequence: %s\n%s', mod_sequence, e)
                            aalist = [a[::-1] for a in parser.parse(mod_sequence[::-1], labels=labels)][::-1]
                        tmp4 = etree.Element('modification_info')
                        ntermmod = 0

                        if nterm_fixed:
                            tmp4.set('mod_nterm_mass', str(nterm_fixed))
                        if cterm_fixed:
                            tmp4.set('mod_cterm_mass', str(cterm_fixed))

                        # NOTE(review): this inner loop shadows the outer 'idx'
                        # (results index); harmless here since the outer idx is
                        # not used afterwards, but fragile.
                        for idx, aminoacid in enumerate(aalist):
                            if aminoacid in fmods or aminoacid in vmods:
                                if aminoacid.endswith('-') and idx == 0:
                                    ntermmod = 1
                                    tmp4.set('mod_nterm_mass', str(str(aa_mass.get(aminoacid) + ntermcleavage)))
                                elif aminoacid.startswith('-') and idx == len(aalist) - 1:
                                    tmp4.set('mod_cterm_mass', str(aa_mass.get(aminoacid) + ctermcleavage))
                                else:
                                    tmp5 = etree.Element('mod_aminoacid_mass')
                                    tmp5.set('position', str(idx + 1 - ntermmod))
                                    tmp5.set('mass', str(aa_mass.get(aminoacid)))
                                    tmp4.append(copy(tmp5))
                        tmp3.append(copy(tmp4))

                        if 'RNHS' in candidate[4]:
                            tmp4 = etree.Element('search_score')
                            tmp4.set('name', 'hyperscore')
                            tmp4.set('value', str(candidate[4]['RNHS']))
                            tmp3.append(copy(tmp4))

                            tmp4 = etree.Element('search_score')
                            tmp4.set('name', 'expect')
                            tmp4.set('value', str(1./candidate[4]['RNHS']))
                            tmp3.append(copy(tmp4))

                        else:
                            tmp4 = etree.Element('search_score')
                            tmp4.set('name', 'hyperscore')
                            tmp4.set('value', str(candidate[0]))
                            tmp3.append(copy(tmp4))

                            tmp4 = etree.Element('search_score')
                            tmp4.set('name', 'expect')
                            tmp4.set('value', str(result['e-values'][i]))
                            tmp3.append(copy(tmp4))

                        tmp4 = etree.Element('search_score')
                        tmp4.set('name', 'sumI')
                        tmp4.set('value', str(candidate[5]))
                        tmp3.append(copy(tmp4))

                        tmp4 = etree.Element('search_score')
                        tmp4.set('name', 'fragmentMT')
                        tmp4.set('value', str(candidate[6]))
                        tmp3.append(copy(tmp4))

                        tmp4 = etree.Element('search_score')
                        tmp4.set('name', 'nextscore_std')
                        tmp4.set('value', str(candidate[8]))
                        tmp3.append(copy(tmp4))

                        # Optional per-spectrum metrics propagated from the MGF header.
                        if 'params' in spectrum:
                            if 'isowidthdiff' in spectrum['params']:
                                tmp4 = etree.Element('search_score')
                                tmp4.set('name', 'ISOWIDTHDIFF')
                                tmp4.set('value', str(spectrum['params'].get('isowidthdiff', 0)))
                                tmp3.append(copy(tmp4))

                            if 'rtwidth' in spectrum['params']:
                                tmp4 = etree.Element('search_score')
                                tmp4.set('name', 'RTwidth')
                                tmp4.set('value', str(spectrum['params'].get('rtwidth', 0)))
                                tmp3.append(copy(tmp4))

                            if 'ms1intensity' in spectrum['params']:
                                tmp4 = etree.Element('search_score')
                                tmp4.set('name', 'MS1Intensity')
                                tmp4.set('value', str(spectrum['params'].get('ms1intensity', 0)))
                                tmp3.append(copy(tmp4))

                            if 'pif' in spectrum['params']:
                                tmp4 = etree.Element('search_score')
                                tmp4.set('name', 'PIF')
                                tmp4.set('value', str(spectrum['params'].get('pif', -3)))
                                tmp3.append(copy(tmp4))

                            if 'sulfur' in spectrum['params']:
                                tmp4 = etree.Element('search_score')
                                tmp4.set('name', 'sulfur')
                                tmp4.set('value', str(spectrum['params'].get('sulfur', -1)))
                                tmp3.append(copy(tmp4))

                            if 'ionmobility' in spectrum['params']:
                                tmp4 = etree.Element('search_score')
                                tmp4.set('name', 'ionmobility')
                                tmp4.set('value', str(spectrum['params'].get('ionmobility', 0)))
                                tmp3.append(copy(tmp4))

                        if tags:
                            for tag_label in tags.keys():
                                tmp4 = etree.Element('search_score')
                                tmp4.set('name', 'tag_' + tag_label)
                                tmp4.set('value', str(spectrum.get(tag_label, 0)))
                                tmp3.append(copy(tmp4))

                        # Per-ion-series matched-ion counts.
                        for k, v in match.items():
                            tmp4 = etree.Element('search_score')
                            tmp4.set('name', 'matched_{}{}_ions'.format(*k))
                            tmp4.set('value', str(v.sum()))
                            tmp3.append(copy(tmp4))

                    tmp2.append(copy(tmp3))
                if flag:
                    tmp.append(copy(tmp2))
                    child1.append(copy(tmp))

        s = etree.tostring(root, pretty_print=True)
        output.write(s)


def write_csv(inputfile, settings, results):
    """Write results as CSV/TSV via dataframe(); skips writing when empty."""
    df = dataframe(inputfile, settings, results)
    if df is None:
        logger.info('No results to write. File not created.')
        return

    sep = settings.get('output', 'separator')
    of = settings.get('output', 'format').lower()
    if not sep:
        # Default separator follows the output format.
        sep = ',' if of == 'csv' else '\t'
    fname = get_outpath(inputfile, settings, of)
    logger.info('Writing %s ...', fname)
    df.to_csv(fname, index=False, sep=sep)
def dataframe(inputfile, settings, results):
    """Convert accumulated results into a pandas DataFrame (one row per
    identified spectrum), or None if there are no results.

    NOTE(review): 'row' keeps being appended to inside the candidate loop but
    is only saved once per spectrum, and the column list matches exactly one
    candidate's worth of fields — presumably only the top candidate ever
    reaches this point; confirm against the caller.
    """
    # results = list(get_output(results, settings))
    results = list(results)
    if not results:
        return None

    logger.info('Accumulated results: %s', len(results))
    # ensure_decoy(settings)
    set_mod_dict(settings)
    fmods = settings.get('modifications', 'fixed')
    pept_prot, prots, pept_neighbors, pept_ntts = build_pept_prot(settings, results)
    if settings.has_option('misc', 'aa_mass'):
        aa_mass = settings.get('misc', 'aa_mass')
    else:
        aa_mass = get_aa_mass(settings)

    nterm_mass = settings.getfloat('modifications', 'protein nterm cleavage')
    cterm_mass = settings.getfloat('modifications', 'protein cterm cleavage')

    # All 'mod+aa' / 'aa+mod' spellings of variable modifications.
    vmods = set()
    variablemods = settings.get('modifications', 'variable')
    if variablemods:
        for k, v in variablemods.items():
            for aa in v:
                vmods.add(k + aa)
                vmods.add(aa + k)

    leg = {}
    if settings.has_option('misc', 'legend'):
        leg = settings.get('misc', 'legend')

    enzyme = settings.get('search', 'enzyme')
    snp = settings.getint('search', 'snp')
    columns = ['Title', 'Assumed charge', 'RT', 'compensation_voltage', 'Rank', 'Matched ions', 'Total ions', 'Calculated mass',
               'Mass difference', 'Missed cleavages', 'Proteins', '# proteins', 'Sequence', 'Modified sequence',
               'Hyperscore', 'Expect', 'sumI', 'fragmentMT']
    rows = []
    for result in results:
        if result['candidates'].size:
            row = []
            spectrum = result['spectrum']
            row.append(get_title(spectrum))
            neutral_mass, charge_state, RT, comp_voltage = get_info(spectrum, result, settings, aa_mass)
            row.append(charge_state)
            row.append(RT)
            row.append(comp_voltage)
            result['candidates'] = result['candidates'][:len(result['e-values'])]

            flag = 1
            for i, candidate in enumerate(result['candidates'], 1):
                match = candidate[4]['match']
                if match is None: break
                row.append(i)
                mod_sequence = normalize_mods(candidate[1], settings)

                sequence = re.sub(r'[^A-Z]', '', mod_sequence)
                if sequence not in pept_prot:
                    flag = 0
                    logger.error('Unaccounted sequence! %s (%s)', sequence, mod_sequence)
                    break
                else:
                    allproteins = pept_prot[sequence]

                row.append(sum(v.sum() for v in match.values()))
                row.append((len(sequence) - 1) * 2)
                neutral_mass_theor = custom_mass(candidate[1], aa_mass=aa_mass, nterm_mass = nterm_mass, cterm_mass = cterm_mass)
                row.append(neutral_mass_theor)
                row.append(candidate[4]['mzdiff']['Da'])
                row.append(parser.num_sites(sequence, get_enzyme(enzyme)))

                # For SNP searches, keep only wild-type alternatives when any exist.
                proteins = [allproteins[0]]
                if len(allproteins) > 1:
                    if snp:
                        wilds = any('wild' in prots[p].split(' ', 1)[0] for p in allproteins)
                    for prot in allproteins[1:]:
                        d = prots[prot].split(' ', 1)[0]
                        if (not snp or not wilds or 'wild' in d):
                            proteins.append(prot)

                row.append(';'.join(proteins))
                row.append(len(proteins))

                row.append(sequence)
                # Re-insert fixed-modification labels into the displayed sequence.
                if fmods:
                    for mod in re.split(r'[,;]\s*', fmods):
                        if '-' not in mod:
                            m, aa = parser._split_label(mod)
                            mod_sequence = mod_sequence.replace(aa, m+aa)
                        elif mod[0] == '-':
                            mod_sequence = mod_sequence + mod
                        elif mod[-1] == '-':
                            mod_sequence = mod + mod_sequence
                row.append(mod_sequence)

                row.append(candidate[0])
                row.append(result['e-values'][i-1])
                row.append(candidate[5])
                row.append(candidate[6])

            rows.append(row)
    df = pd.DataFrame(rows)
    df.columns = columns
    return df
def write_pickle(inputfile, settings, results):
    """Serialize (inputfile, settings, results) to a .pickle output file."""
    collected = list(results)
    logger.info('Accumulated results: %s', len(collected))
    try:
        import cPickle as pickle  # Python 2 compatibility
    except ImportError:
        import pickle
    out_name = get_outpath(inputfile, settings, 'pickle')
    with open(out_name, 'wb') as fh:
        pickle.dump((inputfile, settings, collected), fh, -1)


def write_output(inputfile, settings, results):
    """Dispatch results to the writer selected by the output format setting,
    creating the output directory first when necessary."""
    writers = {'pepxml': write_pepxml, 'csv': write_csv, 'tsv': write_csv, 'pickle': write_pickle}
    fmt_key = re.sub(r'[^a-z]', '', settings.get('output', 'format').lower())
    writer = writers[fmt_key]

    if settings.has_option('output', 'path'):
        outd = settings.get('output', 'path')
        if not os.path.isdir(outd):
            logger.info('Creating %s ...', outd)
            os.makedirs(outd)
    else:
        # Default to the directory of the input file.
        settings.set('output', 'path', os.path.dirname(inputfile))

    return writer(inputfile, settings, results)
1845 | if a['ms level'] == 1: 1846 | cur_ms1 = a 1847 | elif a['ms level'] == 2: 1848 | logger.debug('PROCESSING RT: ' + str(a['scanList']['scan'][0]['scan start time']) ) 1849 | if a['ms level'] == 2 and 'm/z array' in a.keys() and len(a['m/z array']) > 0: 1850 | # if : 1851 | if not isolation_window_left: 1852 | isolation_window_left = float(a['precursorList']['precursor'][0]['isolationWindow']['isolation window lower offset']) 1853 | isolation_window_right = float(a['precursorList']['precursor'][0]['isolationWindow']['isolation window upper offset']) 1854 | pepmass = float(a['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['selected ion m/z']) 1855 | RT = float(a['scanList']['scan'][0]['scan start time']) 1856 | # logger.info(a['mean inverse reduced ion mobility array']) 1857 | try: 1858 | ion_mob = float(a['mean inverse reduced ion mobility array'][0]) 1859 | except: 1860 | # logger.info('missing ion mob') 1861 | ion_mob = 0.0 1862 | try: 1863 | ch = int(a['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['charge state']) 1864 | except: 1865 | ch = 0 1866 | 1867 | if calc_PIF: 1868 | if not cur_ms1: 1869 | calc_PIF = False 1870 | # logger.info('Missing MS1 spectra in mzML, turning off PIF calculation') 1871 | elif not isolation_window_left: 1872 | calc_PIF = False 1873 | # logger.info('Missing isolation window info in mzML, turning off PIF calculation') 1874 | else: 1875 | intensity_full_ms2 = 0 1876 | intensity_precursor = 0 1877 | 1878 | idx_l = cur_ms1['m/z array'].searchsorted(pepmass - isolation_window_left) 1879 | idx_r = cur_ms1['m/z array'].searchsorted(pepmass + isolation_window_right, side='right') 1880 | 1881 | if not ch: 1882 | tch = 2 1883 | else: 1884 | tch = ch 1885 | 1886 | abs_error = pepmass * mass_acc * 1e-6 1887 | for mz, intensity in zip(cur_ms1['m/z array'][idx_l:idx_r], cur_ms1['intensity array'][idx_l:idx_r]): 1888 | if any(abs(mz - (pepmass + (k * 1.007825) / tch)) <= abs_error for k in [-2, -1, 
0, 1, 2, 3, 4]): 1889 | intensity_precursor += intensity 1890 | intensity_full_ms2 += intensity 1891 | if intensity_full_ms2: 1892 | PIF = intensity_precursor / intensity_full_ms2 * 100 1893 | else: 1894 | PIF = -1 1895 | a['PIF'] = PIF 1896 | 1897 | 1898 | title = a['id'] 1899 | mzs.append(pepmass) 1900 | RTs.append(RT) 1901 | ionmobs.append(ion_mob) 1902 | chs.append(ch) 1903 | titles.append(title) 1904 | ms2_map[title] = a 1905 | 1906 | 1907 | mzs = np.array(mzs) 1908 | RTs = np.array(RTs) 1909 | ionmobs = np.array(ionmobs) 1910 | chs = np.array(chs) 1911 | titles = np.array(titles) 1912 | idx = np.argsort(mzs) 1913 | mzs = mzs[idx] 1914 | RTs = RTs[idx] 1915 | ionmobs = ionmobs[idx] 1916 | chs = chs[idx] 1917 | titles = titles[idx] 1918 | 1919 | if not df1 is None: 1920 | if 'ion_mobility' not in df1.columns: 1921 | df1['ion_mobility'] = 0 1922 | if 'sulfur' not in df1.columns: 1923 | df1['sulfur'] = 0 1924 | df1['MSMS'] = df1.apply(findMSMS, axis=1, args = (isolation_window_left, isolation_window_right, mzs, RTs, titles, ionmobs)) 1925 | df1['MSMS_accurate'] = df1.apply(findMSMS_accurate, axis=1, args = (mzs, RTs, titles, ionmobs, chs)) 1926 | # print(df1['MSMS_accurate']) 1927 | 1928 | outmgf_name = os.path.splitext(path_to_mzml)[0] + '_identipy' + os.extsep + 'mgf' 1929 | outmgf = open(outmgf_name, 'w') 1930 | 1931 | t_i = 1 1932 | f_i = 0 1933 | 1934 | added_MSMS = set() 1935 | 1936 | if demixing: 1937 | 1938 | for z in df1[['mz', 'rtApex', 'charge', 'intensityApex', 'MSMS', 'MSMS_accurate', 'rtStart', 'rtEnd', 'ion_mobility', 'sulfur']].values: 1939 | mz, RT, ch, Intensity, ttls, ttl_ac, rt_ll, rt_rr, ion_mob, sulfur = z[0], z[1], z[2], z[3], z[4], z[5], z[6], z[7], z[8], z[9] 1940 | if ttls: 1941 | f_i += 1 1942 | for ttl in ttls: 1943 | if ttl in ttl_ac: 1944 | added_MSMS.add(ttl) 1945 | mz_arr, I_arr = ms2_map[ttl]['m/z array'], ms2_map[ttl]['intensity array'] 1946 | PIF = ms2_map[ttl].get('PIF', -2) 1947 | pepmass = 
float(ms2_map[ttl]['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['selected ion m/z']) 1948 | t_i_orig = ms2_map[ttl]['index'] 1949 | outmgf.write('BEGIN IONS\n') 1950 | outmgf.write('TITLE=%s.%d.%d.%d.%d\n' % (basename_mzml, t_i_orig, t_i, t_i, ch)) 1951 | outmgf.write('RTINSECONDS=%f\n' % (RT * 60, )) 1952 | outmgf.write('PEPMASS=%f %f\n' % (mz, Intensity)) 1953 | outmgf.write('CHARGE=%d+\n' % (ch, )) 1954 | outmgf.write('ISOWIDTHDIFF=%f\n' % (mz - pepmass, )) 1955 | outmgf.write('RTwidth=%f\n' % (rt_rr - rt_ll, )) 1956 | outmgf.write('MS1Intensity=%f\n' % (Intensity, )) 1957 | outmgf.write('PIF=%f\n' % (PIF, )) 1958 | outmgf.write('IonMobility=%f\n' % (ion_mob, )) 1959 | outmgf.write('Sulfur=%f\n' % (sulfur, )) 1960 | for mz_val, I_val in zip(mz_arr, I_arr): 1961 | outmgf.write('%f %f\n' % (mz_val, I_val)) 1962 | outmgf.write('END IONS\n\n') 1963 | t_i += 1 1964 | 1965 | for k in ms2_map: 1966 | if k not in added_MSMS: 1967 | f_i += 1 1968 | a = ms2_map[k] 1969 | mz_arr, I_arr = a['m/z array'], a['intensity array'] 1970 | PIF = a.get('PIF', -2) 1971 | mz = float(a['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['selected ion m/z']) 1972 | RT = float(a['scanList']['scan'][0]['scan start time']) 1973 | t_i_orig = a['index'] 1974 | try: 1975 | ch = int(a['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['charge state']) 1976 | except: 1977 | ch = '' 1978 | outmgf.write('BEGIN IONS\n') 1979 | outmgf.write('TITLE=%s.%d.%d.%d.%s\n' % (basename_mzml, t_i_orig, t_i, t_i, str(ch))) 1980 | outmgf.write('RTINSECONDS=%f\n' % (RT * 60, )) 1981 | outmgf.write('PEPMASS=%f %f\n' % (mz, 0)) 1982 | if ch: 1983 | outmgf.write('CHARGE=%d+\n' % (ch, )) 1984 | outmgf.write('ISOWIDTHDIFF=%f\n' % (0.0, )) 1985 | outmgf.write('RTwidth=%f\n' % (0.0, )) 1986 | outmgf.write('MS1Intensity=%f\n' % (0.0, )) 1987 | outmgf.write('PIF=%f\n' % (PIF, )) 1988 | outmgf.write('IonMobility=%f\n' % (0.0, )) 1989 | 
outmgf.write('Sulfur=%f\n' % (-1.0, )) 1990 | for mz_val, I_val in zip(mz_arr, I_arr): 1991 | outmgf.write('%f %f\n' % (mz_val, I_val)) 1992 | outmgf.write('END IONS\n\n') 1993 | t_i += 1 1994 | 1995 | else: 1996 | 1997 | MS2_acc_map = {} 1998 | 1999 | if not df1 is None: 2000 | for z in df1[['mz', 'rtApex', 'charge', 'intensityApex', 'MSMS', 'MSMS_accurate', 'rtStart', 'rtEnd', 'ion_mobility', 'sulfur']].values: 2001 | ttl_ac = z[5] 2002 | for ttl in ttl_ac: 2003 | if ttl not in MS2_acc_map: 2004 | MS2_acc_map[ttl] = z 2005 | else: 2006 | if MS2_acc_map[ttl][3] < z[3]: 2007 | MS2_acc_map[ttl] = z 2008 | 2009 | # print(MS2_acc_map) 2010 | 2011 | for k in ms2_map: 2012 | a = ms2_map[k] 2013 | 2014 | if k in MS2_acc_map: 2015 | # print('HERE, ok') 2016 | z = MS2_acc_map[k] 2017 | mz, RT, ch, Intensity, ttls, ttl_ac, rt_ll, rt_rr, ion_mob, sulfur = z[0], z[1], z[2], z[3], z[4], z[5], z[6], z[7], z[8], z[9] 2018 | else: 2019 | mz = float(a['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['selected ion m/z']) 2020 | RT = float(a['scanList']['scan'][0]['scan start time']) 2021 | try: 2022 | ch = int(a['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['charge state']) 2023 | except: 2024 | ch = 0 2025 | Intensity, ttls, ttl_ac, rt_ll, rt_rr, ion_mob, sulfur = 0, 0, 0, 0, 0, 0, 0 2026 | 2027 | 2028 | mz_arr, I_arr = a['m/z array'], a['intensity array'] 2029 | PIF = a.get('PIF', -2) 2030 | # mz = float(a['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['selected ion m/z']) 2031 | # RT = float(a['scanList']['scan'][0]['scan start time']) 2032 | t_i_orig = a['index'] 2033 | outmgf.write('BEGIN IONS\n') 2034 | outmgf.write('TITLE=%s.%d.%d.%d.%s\n' % (basename_mzml, t_i_orig, t_i, t_i, str(ch))) 2035 | outmgf.write('RTINSECONDS=%f\n' % (RT * 60, )) 2036 | outmgf.write('PEPMASS=%f %f\n' % (mz, Intensity)) 2037 | if ch: 2038 | outmgf.write('CHARGE=%d+\n' % (ch, )) 2039 | outmgf.write('ISOWIDTHDIFF=%f\n' % 0) 2040 | 
outmgf.write('RTwidth=%f\n' % (rt_rr - rt_ll, )) 2041 | outmgf.write('MS1Intensity=%f\n' % (Intensity, )) 2042 | outmgf.write('PIF=%f\n' % (PIF, )) 2043 | outmgf.write('IonMobility=%f\n' % (ion_mob, )) 2044 | outmgf.write('Sulfur=%f\n' % (sulfur, )) 2045 | for mz_val, I_val in zip(mz_arr, I_arr): 2046 | outmgf.write('%f %f\n' % (mz_val, I_val)) 2047 | outmgf.write('END IONS\n\n') 2048 | t_i += 1 2049 | outmgf.close() 2050 | 2051 | return outmgf_name 2052 | 2053 | 2054 | def findMSMS(raw, isolation_window_left, isolation_window_right, mzs, RTs, titles, ionmobs): 2055 | out = [] 2056 | isotope_fix = raw['nIsotopes'] / raw['charge'] 2057 | mz = raw['mz'] 2058 | RT_l = raw['rtStart'] 2059 | RT_r = raw['rtEnd'] 2060 | ion_mob_p = raw['ion_mobility'] 2061 | # There is no error below: -right and +left! 2062 | id_l = mzs.searchsorted(mz - isolation_window_right) 2063 | id_r = mzs.searchsorted(mz + isolation_window_left + isotope_fix, side='right') 2064 | for idx, RT in enumerate(RTs[id_l:id_r]): 2065 | if RT_l <= RT <= RT_r: 2066 | if abs(ionmobs[id_l+idx] - ion_mob_p) <= 0.1: 2067 | out.append(titles[id_l+idx]) 2068 | if len(out): 2069 | return out 2070 | else: 2071 | return None 2072 | 2073 | 2074 | def findMSMS_accurate(raw, mzs, RTs, titles, ionmobs, chs): 2075 | out = set() 2076 | acc = 10 2077 | mz = raw['mz'] 2078 | ch = raw['charge'] 2079 | RT_l = raw['rtStart'] 2080 | RT_r = raw['rtEnd'] 2081 | ion_mob_p = raw['ion_mobility'] 2082 | acc_rel = mz * acc * 1e-6 2083 | id_l = mzs.searchsorted(mz - acc_rel) 2084 | id_r = mzs.searchsorted(mz + acc_rel, side='right') 2085 | for idx, RT in enumerate(RTs[id_l:id_r]): 2086 | if RT_l <= RT <= RT_r: 2087 | if abs(ionmobs[id_l+idx] - ion_mob_p) <= 0.1: 2088 | ch_msms = chs[id_l+idx] 2089 | if not ch_msms or ch_msms == ch: 2090 | out.add(titles[id_l+idx]) 2091 | # return True 2092 | # return False 2093 | return out 2094 | 2095 | 2096 | def generate_database(settings, outname=None): 2097 | add_decoy = 
settings.getboolean('input', 'add decoy') 2098 | prefix = settings.get('input', 'decoy prefix') 2099 | infix = settings.get('input', 'decoy infix') 2100 | if infix and add_decoy: 2101 | if not prefix: 2102 | prefix = infix 2103 | logger.warning('infix is specified with "add decoy" = True. Generated decoys will have PREFIX %s', prefix) 2104 | mode = settings.get('input', 'decoy method') 2105 | db = settings.get('input', 'database') 2106 | target_only = is_db_target_only(settings) 2107 | if add_decoy and target_only: 2108 | gdbname = outname or settings.get('output', 'generated database') 2109 | if gdbname: 2110 | ft = open(gdbname, 'w') 2111 | else: 2112 | ft = tempfile.NamedTemporaryFile(mode='w', delete=False) 2113 | fasta.write_decoy_db(db, ft, mode=mode, prefix=prefix) 2114 | ft.flush() 2115 | settings.set('input', 'database', ft.name) 2116 | settings.set('input', 'add decoy', 'no') 2117 | logger.debug('Generated database: %s (isfile = %s)', ft.name, os.path.isfile(ft.name)) 2118 | return ft.name 2119 | else: 2120 | logger.debug('Skipping database generation. add_decoy = %s, target_only = %s', add_decoy, target_only) 2121 | 2122 | 2123 | def get_version(): 2124 | return pkg_resources.get_distribution('identipy').version 2125 | --------------------------------------------------------------------------------