├── INSTALL ├── VERSION ├── identipy ├── __init__.py ├── adv.txt ├── customparser.py ├── cparser.pyx ├── main.py ├── default.cfg ├── identipy2pin.py ├── cutils.pyx ├── extras.py ├── cli.py ├── peptide_centric.py ├── scoring.py └── utils.py ├── act_payload.json ├── .gitignore ├── requirements.txt ├── pyproject.toml ├── test.sh ├── NOTICE ├── .github └── workflows │ └── publish.yml ├── setup.py ├── README.md └── LICENSE /INSTALL: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.3.9 2 | -------------------------------------------------------------------------------- /identipy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /act_payload.json: -------------------------------------------------------------------------------- 1 | { 2 | "push": { 3 | "ref": "v0.4.2" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **.pyc 2 | **.so 3 | identipy/*.c 4 | **/__pycache__ 5 | build/ 6 | *.egg-info/ 7 | test_data 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | pandas 4 | cython>=3.0a7 5 | lxml 6 | pyteomics 7 | pyteomics.cythonize 8 | -------------------------------------------------------------------------------- /identipy/adv.txt: -------------------------------------------------------------------------------- 1 | hillPeakFactorMinLength=800 2 | hillPeakFactor=4 3 | hillBatchSize=800 4 | hillValleyFactor=1.6 5 | hillMinLength=3 6 | 
hillNBoots=600 7 | maxBootSize=1200 8 | noHillSplit=True 9 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", 3 | "wheel", 4 | "Cython>=3.0a7", 5 | "numpy", 6 | "pyteomics.cythonize", 7 | ] 8 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | cd test_data 4 | identipy -cfg identipy.cfg -o . *.mgf 5 | pyteomics pepxml info *.pep.xml 6 | 7 | echo "Reference values (1% FDR):" 8 | echo 26718 9 | echo 17670 10 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | When using or redistributing IdentiPy, or parts of it, please cite the following paper: 2 | 3 | Levitsky, L. I., Ivanov, M. V, Lobas, A. A., Bubis, J. A., Tarasova, I. A., Solovyeva, E. M., 4 | Pridatchenko, M. L., Gorshkov, M. V. (2018). IdentiPy: An Extensible Search Engine 5 | for Protein Identification in Shotgun Proteomics. Journal of Proteome Research, 17(7), 2249–2255. 
6 | https://doi.org/10.1021/acs.jproteome.7b00640 7 | 8 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | jobs: 9 | publish: 10 | name: Publish for ${{ matrix.os }} 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [ubuntu-20.04, windows-2019, macos-11] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | - name: Build wheels 19 | uses: pypa/cibuildwheel@v2.16.2 20 | env: 21 | CIBW_SKIP: "cp36-* cp37-* *-win32 *_i686 pp*" 22 | - name: Upload binaries to release 23 | uses: svenstaro/upload-release-action@v2 24 | with: 25 | repo_token: ${{ secrets.GITHUB_TOKEN }} 26 | file: wheelhouse/*.whl 27 | tag: ${{ github.ref }} 28 | overwrite: false 29 | file_glob: true 30 | -------------------------------------------------------------------------------- /identipy/customparser.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import deque 3 | import itertools as it 4 | 5 | def cleave(sequence, rule, missed_cleavages=0, min_length=None): 6 | """Cleaves a polypeptide sequence using a given rule. 7 | 8 | Parameters 9 | ---------- 10 | sequence : str 11 | The sequence of a polypeptide. 12 | 13 | .. note:: 14 | The sequence is expected to be in one-letter uppercase notation. 15 | Otherwise, some of the cleavage rules in :py:data:`expasy_rules` 16 | will not work as expected. 17 | 18 | rule : str or compiled regex 19 | A regular expression describing the site of cleavage. It is recommended 20 | to design the regex so that it matches only the residue whose C-terminal 21 | bond is to be cleaved. All additional requirements should be specified 22 | using `lookaround assertions 23 | `_. 24 | :py:data:`expasy_rules` contains cleavage rules for popular cleavage agents. 
25 | missed_cleavages : int, optional 26 | Maximum number of allowed missed cleavages. Defaults to 0. 27 | min_length : int or None, optional 28 | Minimum peptide length. Defaults to :py:const:`None`. 29 | 30 | ..note :: 31 | This checks for string length, which is only correct for one-letter 32 | notation and not for full *modX*. Use :py:func:`length` manually if 33 | you know what you are doing and apply :py:func:`cleave` to *modX* 34 | sequences. 35 | 36 | Returns 37 | ------- 38 | out : set 39 | A set of unique (!) peptides. 40 | 41 | Examples 42 | -------- 43 | >>> cleave('AKAKBK', expasy_rules['trypsin'], 0) == {'AK', 'BK'} 44 | True 45 | >>> cleave('GKGKYKCK', expasy_rules['trypsin'], 2) == \ 46 | {'CK', 'GKYK', 'YKCK', 'GKGK', 'GKYKCK', 'GK', 'GKGKYK', 'YK'} 47 | True 48 | 49 | """ 50 | return set(_cleave(sequence, rule, missed_cleavages, min_length)) 51 | 52 | def _cleave(sequence, rule, missed_cleavages=0, min_length=None): 53 | """Like :py:func:`cleave`, but the result is a list. Refer to 54 | :py:func:`cleave` for explanation of parameters. 55 | """ 56 | # cdef list cleavage_sites 57 | peptides = [] 58 | ml = missed_cleavages+2 59 | trange = range(ml) 60 | cleavage_sites = deque([0], maxlen=ml) 61 | cl = 1 62 | for i in it.chain([x.end() for x in re.finditer(rule, sequence)], 63 | [None]): 64 | cleavage_sites.append(i) 65 | if cl < ml: 66 | cl += 1 67 | for j in trange[:cl-1]: 68 | seq = sequence[cleavage_sites[j]:cleavage_sites[-1]] 69 | if seq: 70 | if min_length is None or len(seq) >= min_length: 71 | peptides.append((seq, cleavage_sites[j])) 72 | return peptides -------------------------------------------------------------------------------- /identipy/cparser.pyx: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import deque 3 | import itertools as it 4 | 5 | def cleave(sequence, rule, missed_cleavages=0, min_length=None): 6 | """Cleaves a polypeptide sequence using a given rule. 
def _cleave(sequence, rule, missed_cleavages=0, min_length=None):
    """Like :py:func:`cleave`, but the result is a list. Refer to
    :py:func:`cleave` for explanation of parameters.
    """
    # C-typed locals for speed (this module is compiled with Cython).
    # NOTE: `i` is deliberately left untyped — it takes None for the final
    # sequence chunk, so `cdef int i` would break (see commented line below).
    cdef int ml, cl
    # cdef int i, cl
    cdef str seq
    cdef list peptides
    cdef list trange
    # cdef list cleavage_sites
    peptides = []
    # A peptide may span at most `missed_cleavages` internal cleavage sites,
    # so only the last `missed_cleavages + 2` site positions are kept.
    ml = missed_cleavages+2
    trange = list(range(ml))
    # Rolling window of recent cleavage positions; 0 is the sequence start.
    cleavage_sites = deque([0], maxlen=ml)
    cl = 1  # number of positions currently held in the deque
    # Ends of all rule matches are cleavage positions; the trailing None
    # marks the sequence end (used as an open slice bound below).
    for i in it.chain([x.end() for x in re.finditer(rule, sequence)],
                      [None]):
        cleavage_sites.append(i)
        if cl < ml:
            cl += 1
        # Emit every peptide ending at the newest position and starting at
        # one of the retained earlier positions.
        for j in trange[:cl-1]:
            seq = sequence[cleavage_sites[j]:cleavage_sites[-1]]
            if seq:
                if min_length is None or len(seq) >= min_length:
                    # Record the peptide together with its 0-based start offset.
                    peptides.append((seq, cleavage_sites[j]))
    return peptides
def settings(fname=None, default_name=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'default.cfg')):
    """Read a configuration file and return a :py:class:`RawConfigParser` object.

    Defaults are read from `default_name` first and then overridden by the
    user-supplied `fname`. A missing file is logged as an error but does not
    abort parsing. When the product accuracy unit is 'ppm', the value is
    additionally converted to an absolute tolerance using a reference m/z
    of 2000, and the original ppm value is kept under a separate key.
    """
    parser_kwargs = dict(dict_type=dict, allow_no_value=True)
    if sys.version_info.major == 3:
        # Inline comments are only supported by the Python 3 configparser.
        parser_kwargs['inline_comment_prefixes'] = ('#', ';')

    config = utils.CustomRawConfigParser(**parser_kwargs)

    # Read defaults first, then the user config, so that user values win.
    for message, path in (('Reading defaults from %s', default_name),
                          ('Reading config from %s', fname)):
        if not path:
            continue
        logger.info(message, path)
        if not os.path.isfile(path):
            logger.error('FILE NOT FOUND: %s', path)
        config.read(path)

    if config.get('search', 'product accuracy unit') == 'ppm':
        ppm_value = config.getfloat('search', 'product accuracy')
        config.set('search', 'product accuracy', ppm_value / 1e6 * 2000)
        config.set('search', 'product accuracy ppm', ppm_value)

    return config
def get_version():
    """Return the package version string.

    Prefers the output of ``git describe`` (with the leading 'v' and any
    trailing non-numeric component stripped); falls back to the first line
    of the VERSION file when git fails or is not installed.
    """
    try:
        version = subprocess.check_output(['git', 'describe']).strip().decode('ascii').replace('-', '.')
        if version[0] == 'v':
            version = version[1:]
        if '.' in version:
            # `git describe` on a non-tag commit appends '<n>-g<hash>';
            # after the '-' -> '.' replacement the last dotted component is
            # the non-numeric hash, which we strip.
            head, tail = version.rsplit('.', 1)
            if not tail.isdigit():
                version = head
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: not a git checkout / no tags to describe.
        # OSError (incl. FileNotFoundError): git binary is not installed.
        with open('VERSION') as f:
            version = f.readline().strip()
    return version


def make_extensions():
    """Build the list of C extension modules for setup().

    Cythonizes the .pyx sources when Cython is available; otherwise falls
    back to the pre-generated .c files. numpy and pyteomics.cythonize are
    required in both cases for include directories.

    Raises
    ------
    ImportError
        If numpy or pyteomics.cythonize is missing.
    """
    is_ci = bool(os.getenv("CI", ""))
    include_diagnostics = False  # flip to enable profiling/tracing builds
    try:
        import numpy
    except ImportError:
        print("C Extensions require `numpy`")
        raise
    try:
        from pyteomics import _capi
    except ImportError:
        print("C Extensions require `pyteomics.cythonize`")
        raise
    try:
        from Cython.Build import cythonize
        cython_directives = {
            'embedsignature': True,
            'profile': include_diagnostics,
            'language_level': sys.version_info.major
        }
        # NOTE(review): `macros` is collected but not currently passed to the
        # Extension objects (define_macros); kept for when tracing is wired up.
        macros = []
        if include_diagnostics:
            macros.append(("CYTHON_TRACE_NOGIL", "1"))
        if is_ci and include_diagnostics:
            cython_directives['linetrace'] = True

        extensions = cythonize([
            Extension(name='identipy.cparser', sources=['identipy/cparser.pyx']),
            Extension(name='identipy.cutils', sources=['identipy/cutils.pyx'],
                      include_dirs=[numpy.get_include(), _capi.get_include()])
        ], compiler_directives=cython_directives)
    except ImportError:
        # Cython is not installed: compile the shipped .c files directly.
        extensions = [
            Extension(name='identipy.cparser', sources=['identipy/cparser.c']),
            Extension(name='identipy.cutils', sources=['identipy/cutils.c'],
                      include_dirs=[numpy.get_include(), _capi.get_include()])

        ]
    return extensions
long_description = (''.join(open('README.md').readlines()) + '\n' 71 | + ''.join(open('INSTALL').readlines())), 72 | author = 'Lev Levitsky & Mark Ivanov', 73 | author_email = 'pyteomics@googlegroups.com', 74 | url = 'https://github.com/levitsky/identipy', 75 | packages = ['identipy', ], 76 | package_data = {'identipy': ['default.cfg', ]}, 77 | install_requires = [line.strip() for line in open('requirements.txt')], 78 | ext_modules = make_extensions() if cext else None, 79 | classifiers = ['Intended Audience :: Science/Research', 80 | 'Programming Language :: Python :: 2.7', 81 | 'Programming Language :: Python :: 3', 82 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 83 | 'Topic :: Software Development :: Libraries'], 84 | license = 'License :: OSI Approved :: Apache Software License', 85 | entry_points = {'console_scripts': ['identipy = identipy.cli:run', 86 | 'identipy2pin = identipy.identipy2pin:run']} 87 | ) 88 | 89 | 90 | try: 91 | do_setup(True) 92 | except Exception as err: 93 | print("*" * 60) 94 | print("Could not compile C Extensions due to %r, attempting pure Python installation." % (err,)) 95 | print("*" * 60) 96 | do_setup(False) 97 | print("Could not compile C Extensions due to %r, speedups are not enabled." % (err,)) 98 | -------------------------------------------------------------------------------- /identipy/default.cfg: -------------------------------------------------------------------------------- 1 | # Default configuration file for IdentiPy search enfine. 2 | # It will be used for everything that is missing in your custom config file. 3 | # Interpolation of values and line continuation are not supported. 4 | 5 | # Caution: comments on empty lines must start at the beginning of the line. 
6 | 7 | [search] 8 | precursor accuracy unit: ppm ; can be ppm or Th or Da 9 | # (the latter two mean the same, Th is correct) 10 | precursor accuracy left: 100 11 | precursor accuracy right: 100 12 | # any of the above two can be negative if you want to account for 13 | # systematic error 14 | product accuracy: 0.1 ; mass accuracy of fragments 15 | product accuracy unit: Da ; can be ppm or Da 16 | product minimum m/z: 100 17 | peptide maximum length: 60 18 | peptide minimum length: 6 19 | peptide maximum mass: 6000 20 | peptide minimum mass: 250 21 | enzyme: trypsin ; can be a name of enzyme from pyteomics.parser.expasy_rules or 22 | # can be written in X!Tandem enzyme style. Examples: 23 | # [RK]|{P} means cleave after R and K, but not before P 24 | # [X]|[D] means cleave before D 25 | # [RK]|{P},[M]|[X] means mix of trypsin and cnbr 26 | number of missed cleavages: 1 27 | semitryptic: 0 28 | maximum charge: 9 29 | minimum charge: 1 30 | maximum unknown charge: 0 31 | minimum unknown charge: 0 32 | precursor isotope mass error: 0 ; When the value for this parameter is not 0, 33 | ;the parent ion mass tolerance is expanded by opening up multiple tolerance windows centered on the given number of 13C isotope peaks for a peptide. 
34 | shifts: 0 ; example: 0, 16.000, 23.000, 12 35 | snp: 0 ; 1 means make SNP (point mutations) check for ALL peptides 36 | # use only for small protein databases because search time increases significantly 37 | clip N-terminal methionine: true 38 | rapid_check: 0 ; 1 means leave only 2000 random spectra for processing 39 | 40 | [modifications] 41 | # Examples: camC, oxM, acetyl-, -anyctermmodification 42 | # Must be written in lowercase 43 | fixed: camC 44 | variable: 45 | protein variable: 46 | maximum variable mods: 2 47 | protein nterm cleavage: 1.007825 48 | protein cterm cleavage: 17.002735 49 | p = 79.966331 50 | ox = 15.994915 51 | cam = 57.021464 52 | 53 | [output] 54 | format: pepXML ; can be pepxml or tsv 55 | # separator can be specified for csv/tsv format. Default is comma for csv and tab for tsv 56 | separator: 57 | 58 | # in case of label-based quantitation Identipy can write tags intensities in the output pepXML file. 59 | tags: 60 | # Can be tmt10plex, tmt6plex or custom format label1:mass1,label2:mass2... 61 | # Example for custom tmt6plex - tmt_126:126.12773,tmt_127N:127.12476... 62 | 63 | #path: 64 | candidates: 1 ; 0 means all sequence candidates 65 | score threshold: 0 66 | minimum matched: 4 ; minimum matched ions for reporting identification 67 | # higher value reduces analysis time 68 | show empty: no 69 | precursor accuracy unit: ; can be ppm or Th or Da 70 | # (the latter two mean the same, Th is correct) 71 | generated database: 72 | # with "add decoy", the filename of generated database can be specified for reuse e.g. 
in post-processing 73 | 74 | [input] 75 | database: 76 | add decoy: no 77 | # enable if your DB does not have decoy sequences 78 | # but you want to add them to the search 79 | decoy method: reverse ; one of 'reverse' or 'shuffle' 80 | decoy prefix: DECOY_ ; prefix for decoy protein description 81 | # if the decoy label is somewhere in the middle of the protein header, use this: 82 | decoy infix: 83 | deisotoping mass tolerance: 0.3 84 | deisotope: yes 85 | 86 | [scoring] 87 | score: identipy.scoring.RNHS 88 | # this can be 'identipy.scoring.RNHS2', 'identipy.scoring.RNHS', 89 | # 'identipy.scoring.hyperscore', 'identipy.scoring.morpheusscore' or a dot-delimited 90 | # name of a third-party function. It will be given a spectrum, 91 | # a sequence of a candidate and config. 92 | # score is supposed to be higher for better matches 93 | condition: 94 | # condition can be a path to a function (or a function object added dynamically 95 | # within your Python program) that will be called and given the same arguments 96 | # as the score function. If this function returns a falsy value, the candidate 97 | # is discarded.
98 | minimum peaks: 4 ; minimum number of peaks in fragmentation spectrum 99 | maximum peaks: 200 ; Getting only N peaks with max intensity from the fragmentation spectrum 100 | # set to 0 to disable 101 | dynamic range: 1000 ; affected by 'spectrum processor' 102 | # disregard all peaks that are less than 103 | # 1/x of the highest in the spectrum, where X is this value 104 | # 0 means no filtering 105 | e-values for candidates: 1 106 | maximum fragment charge: 1 107 | 108 | [optimization] 109 | increase precursor mass tolerance: yes 110 | 111 | [performance] 112 | processes: 0 ; 0 means auto 113 | out queue size: 40000 114 | pre-calculation: some 115 | folder: 116 | # where to store/look for precalculated files 117 | 118 | [misc] 119 | first stage: 120 | # use identipy.extras.optimization for auto-tune 121 | # if you want custom refinement, put your function name here 122 | # the function will be given the search results and expected 123 | # to return new settings for a second search. 124 | # The same technique is used for searching with variable mods. 
def calc_RT(seq, RC):
    """Predict retention time for peptide `seq` with achrom coefficients `RC`.

    Returns 0 when prediction fails (e.g. residues missing from `RC`), so
    that conversion of the whole file is never aborted by a single peptide.
    """
    try:
        return achrom.calculate_RT(seq, RC)
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still work.
        return 0

def is_decoy(proteins, decoy_prefix):
    """Return True if every protein name in `proteins` starts with `decoy_prefix`."""
    return all(z.startswith(decoy_prefix) for z in proteins)

def parse_mods(df_raw):
    """Count the modifications of one PSM row.

    `df_raw` must provide 'peptide' (sequence string) and 'modifications'
    (list of 'mass@position' strings; positions are 1-based, with 0 and
    len+1 denoting the N- and C-terminus). Each modification is reported as
    a 'mass shift <delta> at <residue>' label, where <delta> is the modified
    mass minus the unmodified residue (or terminal group) mass.

    Returns a :py:class:`collections.Counter` of labels; empty when
    'modifications' is not a list (e.g. NaN for unmodified peptides).
    """
    mods_counter = Counter()
    sequence, mods = df_raw['peptide'], df_raw['modifications']
    if isinstance(mods, list):
        for mod in mods:
            mod_mass, aa_ind = mod.split('@')
            mod_mass = float(mod_mass)
            aa_ind = int(aa_ind)
            if aa_ind == 0:
                aa = 'N_term'
                mod_mass = round(mod_mass - 1.007825, 3)  # subtract H
            elif aa_ind == len(sequence) + 1:
                aa = 'C_term'
                mod_mass = round(mod_mass - 17.002735, 3)  # subtract OH
            else:
                aa = sequence[aa_ind-1]
                mod_mass = round(mod_mass - mass.std_aa_mass[aa], 3)
            mod_name = 'mass shift %.3f at %s' % (mod_mass, aa)
            mods_counter[mod_name] += 1
    return mods_counter

def add_mod_info(df_raw, mod):
    """Return how often modification label `mod` occurs in this PSM row.

    Returns -1 when the modified residue does not occur in the peptide at
    all (the feature is inapplicable, as opposed to merely absent).
    """
    sequence, mods_counter = df_raw['peptide'], df_raw['mods_counter']
    mod_aa = mod.split(' at ')[1]
    if 'term' not in mod_aa and mod_aa not in sequence:
        return -1
    else:
        return mods_counter.get(mod, 0)

def prepare_mods(df):
    """Add one numeric column per modification label found in df['mods_counter']."""
    all_mods = set()
    for cnt in df['mods_counter'].values:
        for k in cnt.keys():
            all_mods.add(k)
    for mod in all_mods:
        df[mod] = df.apply(add_mod_info, axis=1, mod=mod)
    return df

def getlabel(decoy):
    """Percolator label: -1 for decoy PSMs, 1 for targets."""
    return -1 if decoy else 1
decoy_prefix='DECOY_', use_rt=1): 59 | df1 = pepxml.DataFrame(infile_path, read_schema=False) 60 | df1['length'] = df1['peptide'].apply(len) 61 | try: 62 | df1['y-b_ions'] = df1['matched_y1_ions'] - df1['matched_b1_ions'] 63 | except: 64 | pass 65 | df1 = df1[df1['length'] >= 6] 66 | df1['spectrum'] = df1['spectrum'].apply(lambda x: x.split(' RTINSECONDS')[0]) 67 | df1['massdiff_int'] = df1['massdiff'].apply(lambda x: int(round(x, 0))) 68 | df1['massdiff_ppm'] = 1e6 * df1['massdiff'] / df1['calc_neutral_pep_mass'] 69 | df1['decoy'] = df1['protein'].apply(is_decoy, decoy_prefix=decoy_prefix) 70 | df1['mods_counter'] = df1.apply(parse_mods, axis=1) 71 | df1 = prepare_mods(df1) 72 | 73 | if use_rt: 74 | try: 75 | df1['RT exp'] = df1['retention_time_sec'] / 60 76 | df1 = df1.drop(['retention_time_sec', ], axis=1) 77 | df1_f = aux.filter(df1, fdr=0.01, key='expect', is_decoy='decoy', correction=1) 78 | print('Default target-decoy filtering, 1%% PSM FDR: Number of target PSMs = %d' \ 79 | % (df1_f[~df1_f['decoy']].shape[0])) 80 | print('Calibrating retention model...') 81 | retention_coefficients = achrom.get_RCs_vary_lcp(df1_f['peptide'].values, \ 82 | df1_f['RT exp'].values) 83 | df1_f['RT pred'] = df1_f['peptide'].apply(lambda x: calc_RT(x, retention_coefficients)) 84 | df1['RT pred'] = df1['peptide'].apply(lambda x: calc_RT(x, retention_coefficients)) 85 | _, _, r_value, std_value = aux.linear_regression(df1_f['RT pred'], df1_f['RT exp']) 86 | print('R^2 = %f , std = %f' % (r_value**2, std_value)) 87 | df1['RT diff'] = df1['RT pred'] - df1['RT exp'] 88 | except: 89 | pass 90 | 91 | df1['Label'] = df1['decoy'].apply(getlabel) 92 | df1['SpecId'] = df1['index'] + 1 93 | df1['ScanNr'] = df1['index'] + 1 94 | try: 95 | prev_aa = df1['peptide_prev_aa'][0] 96 | next_aa = df1['peptide_next_aa'][0] 97 | df1['Peptide'] = df1['peptide'].apply(lambda x: prev_aa + '.' + x + '.' + next_aa) 98 | except: 99 | df1['Peptide'] = df1['peptide'].apply(lambda x: 'K.' 
def get_features(dataframe):
    """Select the columns of `dataframe` to be used as Percolator features.

    A fixed whitelist of search-engine scores/metrics is kept, plus any
    'mass shift ...' columns produced by :py:func:`prepare_mods`; all other
    columns are dropped. Returns a pandas Index of the retained names, in
    their original order.
    """
    kept = {
        'expect', 'hyperscore', 'calc_neutral_pep_mass', 'bscore', 'yscore',
        'massdiff', 'massdiff_ppm', 'RT pred', 'RT diff',
        'sumI', 'RT exp', 'precursor_neutral_mass', 'massdiff_int',
        'num_missed_cleavages', 'tot_num_ions', 'num_matched_ions', 'length',
        'SpecId', 'Label', 'ScanNr', 'Peptide', 'Proteins',
        'matched_y1_ions', 'matched_b1_ions', 'y-b_ions', 'fragmentMT',
    }
    to_drop = [name for name in dataframe.columns
               if name not in kept and not name.startswith('mass shift')]
    return dataframe.columns.drop(to_drop)
1 or 0', default=1, type=int) 137 | 138 | 139 | args = vars(parser.parse_args()) 140 | infile = args['file'] 141 | prefix = args['prefix'] 142 | use_rt = args['rt'] 143 | out = args['out'] 144 | if out: 145 | outfile = out 146 | else: 147 | outfile = infile.replace('.pep.xml', '.pin') 148 | df1 = prepare_dataframe(infile, decoy_prefix=prefix, use_rt=use_rt) 149 | df00 = df1[get_features(df1)] 150 | df00_col = list(df00.columns.values) 151 | df00_col.remove('SpecId') 152 | df00_col.remove('Label') 153 | df00_col.remove('ScanNr') 154 | df00_col.remove('Peptide') 155 | df00_col.remove('Proteins') 156 | 157 | df00_col.insert(0, 'ScanNr') 158 | df00_col.insert(0, 'Label') 159 | df00_col.insert(0, 'SpecId') 160 | df00_col.append('Peptide') 161 | df00_col.append('Proteins') 162 | 163 | dft = df00.reindex(columns=df00_col) 164 | dft['Proteins'] = dft['Proteins'].apply(lambda x: 'proteinsplittmp'.join(x)) 165 | dft.to_csv(path_or_buf=outfile, index=False, sep='\t') 166 | with open(outfile, 'r') as f : 167 | lines = list(f.readlines()) 168 | outf = open(outfile, 'w') 169 | for l in lines: 170 | tmp = l.split('\t') 171 | outf.write('\t'.join(tmp[:-1]) + '\t' + '\t'.join(tmp[-1].split('proteinsplittmp'))) 172 | outf.close() 173 | 174 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **IdentiPy** is a search engine for bottom-up proteomics written in Python. 2 | 3 | # Citation # 4 | 5 | IdentiPy is described in this JPR paper: http://dx.doi.org/10.1021/acs.jproteome.7b00640 6 | 7 | Please cite it when using IdentiPy or its parts. 8 | 9 | # License # 10 | 11 | IdentiPy is published under the Apache 2.0 license. 12 | 13 | # How to install # 14 | 15 | ``` 16 | pip install git+https://github.com/levitsky/identipy.git 17 | ``` 18 | or: 19 | 20 | ``` 21 | $ git clone https://github.com/levitsky/identipy 22 | $ cd identipy 23 | $ pip install . 
24 | 25 | ``` 26 | 27 | # Requirements # 28 | 29 | See [requirements.txt](requirements.txt). Key dependencies are: 30 | 31 | - Python 32 | - scipy 33 | - pyteomics 34 | - lxml 35 | - cython 36 | - pyteomics.cythonize 37 | 38 | # How to use # 39 | 40 | ## GUI way ## 41 | 42 | You can separately install a web-based GUI for IdentiPy, [IdentiPy Server](https://github.com/levitsky/identipy_server). 43 | Please refer to the linked page for system requirements and installation instructions. 44 | 45 | ## CLI way ## 46 | 47 | A typical command to process a file would look like this: 48 | 49 | ``` 50 | $ identipy -cfg my.cfg spectra.mgf 51 | ``` 52 | 53 | Here, `my.cfg` is a settings file specifying the search parameters. Allowed parameters and their default values are listed in the 54 | [default configuration file](identipy/default.cfg). 55 | Settings not specified in `my.cfg` will be taken from the default file. 56 | 57 | Search settings can also be overriden using command-line options. 58 | 59 | For help on command-line usage, run: 60 | 61 | ``` 62 | $ identipy --help 63 | ``` 64 | 65 | You will see a message like this: 66 | 67 | ``` 68 | $ identipy --help 69 | usage: identipy [-h] [-db FASTA] [-cfg CONFIG_FILE] [-out PATH] [-of FORMAT] 70 | [-sep SEP] [-at] [-nopwide] [-punit UNIT] [-ptol VALUE] 71 | [-lptol VALUE] [-rptol VALUE] [-funit UNIT] [-ftol VALUE] 72 | [-fminmz VALUE] [-lmin N] [-lmax N] [-massmin VALUE] 73 | [-massmax VALUE] [-e RULE] [-mc N] [-semi] [-noclip] [-cmin N] 74 | [-cmax N] [-cumin N] [-cumax N] [-ime N] [-shifts SHIFTS] 75 | [-snp SNP] [-rapid] [-mm N] [-ad] [-prefix PREFIX] 76 | [-infix INFIX] [-method {reverse,shuffle}] [-deis] 77 | [-deistol DEISTOL] 78 | [-score {RNHS2,RNHS,hyperscore,morpheusscore}] [-minp N] 79 | [-maxp N] [-dyn DYN] [-mfc N] [-nproc N] [-maxmods N] 80 | [-ncleave NCLEAVE] [-ccleave CCLEAVE] [-fmods FMODS] 81 | [-vmods VMODS] [-pmods PMODS] [-tags TAGS] [-debug] 82 | [-dino DINO] [-dinoargs [DINOARGS ...]] [-demixing] [-pif] 
83 | file 84 | 85 | Search proteins using LC-MS/MS spectra 86 | 87 | positional arguments: 88 | file input .mzML or .mgf file with MS/MS spectra 89 | 90 | options: 91 | -h, --help show this help message and exit 92 | -db FASTA path to protein FASTA file 93 | -cfg CONFIG_FILE path to file with parameters 94 | -out PATH, -o PATH output path 95 | -of FORMAT output format 96 | -sep SEP output column separator (for table format) 97 | -at Use auto-tuning of search parameters 98 | -nopwide Do not increase initial precursor mass accuracy for 99 | auto-tuning 100 | -punit UNIT precursor mass tolerance unit 101 | -ptol VALUE precursor mass tolerance 102 | -lptol VALUE *left precursor mass tolerance 103 | -rptol VALUE *right precursor mass tolerance 104 | -funit UNIT fragment mass tolerance unit 105 | -ftol VALUE fragment mass tolerance 106 | -fminmz VALUE fragment min m/z 107 | -lmin N min length of peptides 108 | -lmax N max length of peptides 109 | -massmin VALUE min mass of peptides 110 | -massmax VALUE max mass of peptides 111 | -e RULE cleavage rule in quotes!. X!Tandem style for cleavage 112 | rules 113 | -mc N number of missed cleavages 114 | -semi include semitryptic peptides 115 | -noclip Disable clipping of N-terminal methionine 116 | -cmin N min precursor charge 117 | -cmax N max precursor charge 118 | -cumin N min unknown precursor charge 119 | -cumax N max unknown precursor charge 120 | -ime N precursor isotope mass error. The parent ion mass 121 | tolerance is expanded by opening up multiple tolerance 122 | windows centered on the given number of 13C isotope 123 | peaks for a peptide. 124 | -shifts SHIFTS shifts. 
example: 0,16.000,23.000,12 125 | -snp SNP 1 means make SNP changes for ALL peptides 126 | -rapid leave only 2000 random spectra for processing 127 | -mm N number of minimum matched ions 128 | -ad add decoy 129 | -prefix PREFIX decoy prefix 130 | -infix INFIX decoy infix 131 | -method {reverse,shuffle} 132 | decoy method; reverse or shuffle 133 | -deis use MS/MS deisotoping 134 | -deistol DEISTOL deisotope mass accuracy 135 | -score {RNHS2,RNHS,hyperscore,morpheusscore} 136 | used scoring function 137 | -minp N minumum peaks in MS/MS spectra 138 | -maxp N maximum peaks in MS/MS spectra 139 | -dyn DYN dynamic range 140 | -mfc N maximum fragment charge 141 | -nproc N number of processes. 0 means auto 142 | -maxmods N maximum variable mods per sequence 143 | -ncleave NCLEAVE protein nterm cleavage 144 | -ccleave CCLEAVE protein cterm cleavage 145 | -fmods FMODS fixed modifications. Format: 146 | mass1@aminoacid1,mass2@aminoacid2 147 | -vmods VMODS variable modifications. Format: 148 | mass1@aminoacid1,mass2@aminoacid2 149 | -pmods PMODS variable protein terminal modifications 150 | -tags TAGS Add quantitation tags to the pepXML output. Can be 151 | tmt10plex, tmt6plex, tmt11plex or custom format 152 | label1:mass1,label2:mass2... 153 | -debug Print debugging messages 154 | -dino DINO path to Dinosaur JAR file or Biosaur executable. Used 155 | for chimeric spectrum processing and MS1 Intensity 156 | calculation 157 | -dinoargs [DINOARGS ...] 158 | extra arguments to Dinosaur or Biosaur. 
159 | -demixing Use demixing 160 | -pif Calculate PIF 161 | 162 | Example usage 163 | ------------- 164 | $ identipy input.mgf -db human.fasta 165 | ------------- 166 | 167 | ``` 168 | 169 | 170 | # Related projects # 171 | 172 | - Pyteomics: https://github.com/levitsky/pyteomics 173 | 174 | - pyteomics.cythonize: https://github.com/mobiusklein/pyteomics.cythonize 175 | 176 | - Scavager: https://github.com/markmipt/scavager 177 | 178 | - IdentiPy Server: https://github.com/levitsky/identipy_server 179 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | IdentiPy is distributed under the conditions of the 2 | Apache License, Version 2.0: 3 | http://www.opensource.org/licenses/Apache-2.0 4 | 5 | Copyright (c) 2018, Lev Levitsky & Mark Ivanov 6 | 7 | Apache License, Version 2.0 8 | Apache License 9 | Version 2.0, January 2004 10 | http://www.apache.org/licenses/ 11 | 12 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 13 | 14 | 1. Definitions. 15 | 16 | "License" shall mean the terms and conditions for use, reproduction, and 17 | distribution as defined by Sections 1 through 9 of this document. 18 | "Licensor" shall mean the copyright owner or entity authorized by the copyright 19 | owner that is granting the License. 20 | 21 | "Legal Entity" shall mean the union of the acting entity and all other entities 22 | that control, are controlled by, or are under common control with that entity. 23 | For the purposes of this definition, "control" means (i) the power, direct or 24 | indirect, to cause the direction or management of such entity, whether by 25 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 26 | outstanding shares, or (iii) beneficial ownership of such entity. 27 | "You" (or "Your") shall mean an individual or Legal Entity exercising 28 | permissions granted by this License. 
29 | 30 | "Source" form shall mean the preferred form for making modifications, including 31 | but not limited to software source code, documentation source, and configuration 32 | files. 33 | 34 | "Object" form shall mean any form resulting from mechanical transformation or 35 | translation of a Source form, including but not limited to compiled object code, 36 | generated documentation, and conversions to other media types. 37 | 38 | "Work" shall mean the work of authorship, whether in Source or Object form, made 39 | available under the License, as indicated by a copyright notice that is included 40 | in or attached to the work (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object form, that 43 | is based on (or derived from) the Work and for which the editorial revisions, 44 | annotations, elaborations, or other modifications represent, as a whole, an 45 | original work of authorship. For the purposes of this License, Derivative Works 46 | shall not include works that remain separable from, or merely link (or bind by 47 | name) to the interfaces of, the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including the original version 50 | of the Work and any modifications or additions to that Work or Derivative Works 51 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 52 | by the copyright owner or by an individual or Legal Entity authorized to submit 53 | on behalf of the copyright owner. 
For the purposes of this definition, 54 | "submitted" means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, and 57 | issue tracking systems that are managed by, or on behalf of, the Licensor for 58 | the purpose of discussing and improving the Work, but excluding communication 59 | that is conspicuously marked or otherwise designated in writing by the copyright 60 | owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf 63 | of whom a Contribution has been received by Licensor and subsequently 64 | incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 67 | 68 | Subject to the terms and conditions of this License, each Contributor hereby 69 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 70 | irrevocable copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the Work and such 72 | Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 75 | 76 | Subject to the terms and conditions of this License, each Contributor hereby 77 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 78 | irrevocable (except as stated in this section) patent license to make, have 79 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where 80 | such license applies only to those patent claims licensable by such Contributor 81 | that are necessarily infringed by their Contribution(s) alone or by combination 82 | of their Contribution(s) with the Work to which such Contribution(s) was 83 | submitted. 
If You institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 85 | Contribution incorporated within the Work constitutes direct or contributory 86 | patent infringement, then any patent licenses granted to You under this License 87 | for that Work shall terminate as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 90 | 91 | You may reproduce and distribute copies of the Work or Derivative Works thereof 92 | in any medium, with or without modifications, and in Source or Object form, 93 | provided that You meet the following conditions: 94 | 95 | You must give any other recipients of the Work or Derivative Works a copy of 96 | this License; and 97 | You must cause any modified files to carry prominent notices stating that You 98 | changed the files; and 99 | You must retain, in the Source form of any Derivative Works that You distribute, 100 | all copyright, patent, trademark, and attribution notices from the Source form 101 | of the Work, excluding those notices that do not pertain to any part of the 102 | Derivative Works; and 103 | If the Work includes a "NOTICE" text file as part of its distribution, then any 104 | Derivative Works that You distribute must include a readable copy of the 105 | attribution notices contained within such NOTICE file, excluding those notices 106 | that do not pertain to any part of the Derivative Works, in at least one of the 107 | following places: within a NOTICE text file distributed as part of the 108 | Derivative Works; within the Source form or documentation, if provided along 109 | with the Derivative Works; or, within a display generated by the Derivative 110 | Works, if and wherever such third-party notices normally appear. The contents of 111 | the NOTICE file are for informational purposes only and do not modify the 112 | License. 
You may add Your own attribution notices within Derivative Works that 113 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 114 | provided that such additional attribution notices cannot be construed as 115 | modifying the License. 116 | 117 | You may add Your own copyright statement to Your modifications and may provide 118 | additional or different license terms and conditions for use, reproduction, or 119 | distribution of Your modifications, or for any such Derivative Works as a whole, 120 | provided Your use, reproduction, and distribution of the Work otherwise complies 121 | with the conditions stated in this License. 122 | 123 | 5. Submission of Contributions. 124 | 125 | Unless You explicitly state otherwise, any Contribution intentionally submitted 126 | for inclusion in the Work by You to the Licensor shall be under the terms and 127 | conditions of this License, without any additional terms or conditions. 128 | Notwithstanding the above, nothing herein shall supersede or modify the terms of 129 | any separate license agreement you may have executed with Licensor regarding 130 | such Contributions. 131 | 132 | 6. Trademarks. 133 | 134 | This License does not grant permission to use the trade names, trademarks, 135 | service marks, or product names of the Licensor, except as required for 136 | reasonable and customary use in describing the origin of the Work and 137 | reproducing the content of the NOTICE file. 138 | 139 | 7. Disclaimer of Warranty. 140 | 141 | Unless required by applicable law or agreed to in writing, Licensor provides the 142 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, 143 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 144 | including, without limitation, any warranties or conditions of TITLE, 145 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. 
You are 146 | solely responsible for determining the appropriateness of using or 147 | redistributing the Work and assume any risks associated with Your exercise of 148 | permissions under this License. 149 | 150 | 8. Limitation of Liability. 151 | 152 | In no event and under no legal theory, whether in tort (including negligence), 153 | contract, or otherwise, unless required by applicable law (such as deliberate 154 | and grossly negligent acts) or agreed to in writing, shall any Contributor be 155 | liable to You for damages, including any direct, indirect, special, incidental, 156 | or consequential damages of any character arising as a result of this License or 157 | out of the use or inability to use the Work (including but not limited to 158 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or 159 | any and all other commercial damages or losses), even if such Contributor has 160 | been advised of the possibility of such damages. 161 | 162 | 9. Accepting Warranty or Additional Liability. 163 | 164 | While redistributing the Work or Derivative Works thereof, You may choose to 165 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or 166 | other liability obligations and/or rights consistent with this License. However, 167 | in accepting such obligations, You may act only on Your own behalf and on Your 168 | sole responsibility, not on behalf of any other Contributor, and only if You 169 | agree to indemnify, defend, and hold each Contributor harmless for any liability 170 | incurred by, or claims asserted against, such Contributor by reason of your 171 | accepting any such warranty or additional liability. 
END OF TERMS AND CONDITIONS
-------------------------------------------------------------------------------- /identipy/cutils.pyx: --------------------------------------------------------------------------------
# Cython helpers: fast fragment-ion matching (RNHS scores) and theoretical
# spectrum generation for IdentiPy.
cimport cython
from cpython.sequence cimport PySequence_GetSlice
from cpython.dict cimport PyDict_GetItem, PyDict_SetItem
from cpython.float cimport PyFloat_AsDouble
from cpython.tuple cimport PyTuple_GetItem

from pyteomics import cmass
from math import factorial

cimport pyteomics.cmass as cmass

from pyteomics import electrochem as ec
import numpy as np
cimport numpy as np

np.import_array()


cdef:
    dict std_aa_mass = cmass.std_aa_mass
    dict std_ion_comp = cmass.std_ion_comp
    dict nist_mass = cmass._nist_mass

cdef dict ion_shift_dict

# Neutral-mass offset (Da) of each fragment ion series relative to the peptide
# neutral mass; consumed by calc_ions_from_neutral_mass below.
ion_shift_dict = {
    'a': 46.00547930326002,
    'b': 18.010564683699954,
    'c': 0.984015582689949,
    'x': -25.979264555419945,
    'y': 0.0,
    'z': 17.026549101010005,
}


@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(True)
def RNHS_ultrafast(dict cur_idict, dict theoretical_set, int min_matched, dict best_res, set allowed_idx, int max_v):
    """Pre-screen candidate peptides: return the subset of allowed_idx whose
    b/y ion match counts could beat the stored best result, or None if the
    spectrum cannot reach min_matched matches at all.

    cur_idict maps binned fragment m/z -> iterable of candidate indices.
    max_v is kept for interface compatibility (unused here).
    """
    cdef int total_matched, num_b_ions, num_y_ions
    cdef dict cnt_b, cnt_y
    cdef set out
    cdef float best_res_val

    if not cur_idict:
        return None

    total_matched = 0

    cnt_b = dict()
    cnt_y = dict()

    for ion in theoretical_set['b']:
        if ion in cur_idict:
            for xx in cur_idict[ion]:
                if xx not in cnt_b:
                    cnt_b[xx] = 1
                else:
                    cnt_b[xx] += 1
            # one matched theoretical b-ion bin for this spectrum
            total_matched += 1

    for ion in theoretical_set['y']:
        if ion in cur_idict:
            for xx in cur_idict[ion]:
                if xx not in cnt_y:
                    cnt_y[xx] = 1
                else:
                    cnt_y[xx] += 1
            total_matched += 1

    if total_matched < min_matched:
        return None

    out = set()
    for k in allowed_idx:
        num_b_ions = 0
        num_y_ions = 0
        if k in cnt_b:
            num_b_ions = cnt_b[k]
        if k in cnt_y:
            num_y_ions = cnt_y[k]
        if num_b_ions + num_y_ions >= min_matched:
            best_res_val = best_res.get(k, 0)
            # scores are stored negated: smaller (more negative) is better
            if not best_res_val or -factorial(num_b_ions) * factorial(num_y_ions) <= best_res_val:
                out.add(k)
    return out


@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(True)
def RNHS_fast_old(set spectrum_fastset, dict spectrum_idict , dict theoretical_set, int min_matched):
    """Classic RNHS: (matched count, b!*y!*sum of matched intensities).
    Returns (0, 0) when fewer than min_matched fragments match."""
    cdef int matched_approx_b, matched_approx_y, matched_approx
    cdef set matched_b, matched_y
    cdef float isum
    isum = 0
    matched_b = spectrum_fastset.intersection(theoretical_set['b'])
    matched_y = spectrum_fastset.intersection(theoretical_set['y'])
    matched_approx_b = len(matched_b)
    matched_approx_y = len(matched_y)
    matched_approx = matched_approx_b + matched_approx_y
    if matched_approx >= min_matched:
        for fr in matched_b:
            isum += spectrum_idict[fr]
        for fr in matched_y:
            isum += spectrum_idict[fr]
        return matched_approx, factorial(matched_approx_b) * factorial(matched_approx_y) * isum
    else:
        return 0, 0


@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(True)
def RNHS_fast(set spectrum_fastset, dict spectrum_idict , dict theoretical_set, int min_matched, dict rank_map):
    """Rank-weighted RNHS over all ion series in theoretical_set.

    spectrum_idict maps a matched fragment bin to its intensity rank;
    rank_map maps rank -> {ion series: weight}, with rank_map['m'] as the
    fallback weight for ranks absent from the map.
    """
    cdef int matched_approx
    cdef float isum
    cdef list all_matched
    isum = 0

    all_matched = []
    for ion in theoretical_set:
        matched_tmp = spectrum_fastset.intersection(theoretical_set[ion])
        all_matched.append((ion, matched_tmp))
    # BUGFIX: all_matched holds (ion, set) pairs, so the original
    # sum(len(z) for z in all_matched) counted 2 per series (tuple length)
    # instead of the number of matched fragments.
    matched_approx = sum(len(m) for _, m in all_matched)
    if matched_approx >= min_matched:
        for ion, matched_tmp in all_matched:
            for fr in matched_tmp:
                i_rank = spectrum_idict[fr]
                if i_rank in rank_map:
                    tmp_d = rank_map[i_rank]
                    if ion in tmp_d:
                        isum += tmp_d[ion]
                else:
                    # NOTE(review): indentation lost in source dump; this
                    # fallback is paired with the rank lookup ("rank not
                    # tabulated -> use mean weight 'm'") -- confirm intent.
                    isum += rank_map['m']
        return matched_approx, isum
    else:
        return 0, 0


@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(True)
def RNHS_fast_basic(set spectrum_fastset, dict spectrum_idict , dict theoretical_set, int min_matched):
    """RNHS over arbitrary ion series: sum of matched intensities multiplied
    by the factorial of the match count of every series."""
    cdef int matched_approx
    cdef float isum
    cdef list all_matched
    isum = 0

    all_matched = []
    for ion in theoretical_set:
        matched_tmp = spectrum_fastset.intersection(theoretical_set[ion])
        all_matched.append(matched_tmp)
    matched_approx = sum(len(z) for z in all_matched)
    if matched_approx >= min_matched:
        for matched_tmp in all_matched:
            for fr in matched_tmp:
                isum += spectrum_idict[fr]
        for matched_tmp in all_matched:
            isum *= factorial(len(matched_tmp))
        return matched_approx, isum
    else:
        return 0, 0


@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(True)
cdef float calc_ions_from_neutral_mass(str peptide, float nm, str ion_type, int charge, dict aa_mass, float cterm_mass, float nterm_mass):
    """m/z of the longest fragment of the given series/charge, derived from
    the peptide neutral mass nm via the series offset in ion_shift_dict."""
    cdef float nmi
    if ion_type in 'abc':
        nmi = nm - aa_mass[peptide[-1]] - ion_shift_dict[ion_type] - (cterm_mass - 17.002735)
    else:
        nmi = nm - aa_mass[peptide[0]] - ion_shift_dict[ion_type] - (nterm_mass - 1.007825)
    return (nmi + 1.0072764667700085 * charge) / charge


@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(True)
cdef list get_n_ions(str peptide, float maxmass, int pl, int charge, dict k_aa_mass):
    """N-terminal series: successively strip C-terminal residues from maxmass."""
    cdef int i
    cdef list tmp
    tmp = [maxmass, ]
    # BUGFIX: was xrange, which is undefined under Cython 3 default
    # language_level=3 (requirements pin cython>=3.0a7)
    for i in range(1, pl):
        tmp.append(tmp[-1] - k_aa_mass[peptide[-i-1]]/charge)
    return tmp


@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(True)
cdef list get_c_ions(str peptide, float maxmass, int pl, int charge, dict k_aa_mass):
    """C-terminal series: successively strip N-terminal residues from maxmass."""
    cdef int i
    cdef list tmp
    tmp = [maxmass, ]
    # BUGFIX: was xrange (see get_n_ions)
    for i in range(pl-2, -1, -1):
        tmp.append(tmp[-1] - k_aa_mass[peptide[-(i+2)]]/charge)
    return tmp


@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(True)
cdef tuple ctheor_spectrum(str peptide, double acc_frag, double nterm_mass, double cterm_mass, tuple types,
                           int maxcharge, bint reshape, dict kwargs):
    """Build (peaks, theoretical_set): per-(series, charge) fragment m/z arrays
    and the same values binned by acc_frag for fast set matching."""
    cdef int pl, charge, i, n, i_type, n_types
    cdef bint nterminal
    cdef str ion_type
    cdef float maxmass, nm
    cdef dict peaks, theoretical_set
    cdef dict aa_mass, ion_comp, mass_data
    cdef list theoretical_set_item
    cdef list ions_scaled, marr
    cdef object marr_storage, nm_obj

    aa_mass = kwargs.get("aa_mass")
    if aa_mass is None:
        aa_mass = std_aa_mass
    ion_comp = kwargs.get("ion_comp")
    if ion_comp is None:
        ion_comp = std_ion_comp
    mass_data = kwargs.get("mass_data")
    if mass_data is None:
        mass_data = nist_mass
    # BUGFIX: original assigned kwargs.get("nm") (possibly None) directly into
    # a cdef float, which raises TypeError when 'nm' is absent.
    nm_obj = kwargs.get("nm")
    if nm_obj is None:
        nm = cmass.fast_mass(peptide, **kwargs) + (nterm_mass - 1.007825) + (cterm_mass - 17.002735)
    else:
        nm = PyFloat_AsDouble(nm_obj)
    peaks = {}
    theoretical_set = dict()

    pl = len(peptide) - 1
    n_types = len(types)
    for charge in range(1, maxcharge + 1):
        for i_type in range(n_types):
            ion_type = PyTuple_GetItem(types, i_type)
            nterminal = ion_type[0] in 'abc'
            # BUGFIX: use the defaulted local aa_mass instead of
            # kwargs['aa_mass'] (KeyError when the caller relied on defaults)
            maxmass = calc_ions_from_neutral_mass(peptide, nm, ion_type=ion_type, charge=charge,
                                                  aa_mass=aa_mass, cterm_mass=cterm_mass, nterm_mass=nterm_mass)
            if nterminal:
                marr = get_n_ions(peptide, maxmass, pl, charge, aa_mass)
            else:
                marr = get_c_ions(peptide, maxmass, pl, charge, aa_mass)

            iname = (ion_type, charge)
            ions_scaled = [(x / acc_frag) for x in marr]
            if iname in theoretical_set:
                theoretical_set_item = PyDict_GetItem(theoretical_set, iname)
                theoretical_set_item.extend(ions_scaled)
            else:
                theoretical_set[iname] = ions_scaled

            if reshape:
                # column vector form for vectorized tolerance matching
                marr_storage = np.array(marr)
                marr_storage.sort()
                n = marr_storage.size
                marr_storage = marr_storage.reshape((n, 1))
                peaks[iname] = marr_storage
            else:
                peaks[iname] = sorted(marr)
    return peaks, theoretical_set


def theor_spectrum(peptide, acc_frag, nterm_mass, cterm_mass, types=('b', 'y'), maxcharge=None, reshape=False, **kwargs):
    """Public wrapper: derive maxcharge from predicted peptide charge at pH 2
    when not given, then delegate to the C implementation."""
    if not maxcharge:
        maxcharge = 1 + int(ec.charge(peptide, pH=2))
    return ctheor_spectrum(peptide, acc_frag, nterm_mass, cterm_mass, tuple(types), maxcharge, reshape, kwargs)

-------------------------------------------------------------------------------- /identipy/extras.py: --------------------------------------------------------------------------------
from scipy.stats import percentileofscore, scoreatpercentile
from scipy.optimize import curve_fit
from pyteomics import achrom, auxiliary as aux, parser, mass
from collections import Counter, defaultdict
from .main import *
from .scoring import get_fragment_mass_tol, get_fragment_mass_tol_ppm
import logging
logger = logging.getLogger(__name__)
import numpy as np
from .utils import get_info, get_aa_mass, get_enzyme, calculate_RT, get_title
try:
    from
pyteomics import cmass 13 | except ImportError: 14 | cmass = mass 15 | from scipy.stats import rankdata 16 | from copy import deepcopy 17 | from scipy.optimize import curve_fit 18 | 19 | def FDbinSize(X): 20 | """Calculates the Freedman-Diaconis bin size for 21 | a data set for use in making a histogram 22 | Arguments: 23 | X: 1D Data set 24 | Returns: 25 | h: F-D bin size 26 | """ 27 | X = np.sort(X) 28 | upperQuartile = scoreatpercentile(X, 75) 29 | lowerQuartile = scoreatpercentile(X, 25) 30 | IQR = upperQuartile - lowerQuartile 31 | h = 2. * IQR / len(X) ** (1. / 3.) 32 | return h 33 | 34 | def get_peptides_subset(results): 35 | tmp_dict = dict() 36 | 37 | massdif = np.array([res['candidates'][0][4]['mzdiff']['ppm'] for res in results]) 38 | 39 | for result in results: 40 | r_spectrum = get_title(result['spectrum']) 41 | r_sequence = str(result['candidates'][0][1]) 42 | r_mass_diff_abs = abs(result['candidates'][0][4]['mzdiff']['ppm']) 43 | if r_sequence not in tmp_dict or r_mass_diff_abs < tmp_dict[r_sequence][1]: 44 | tmp_dict[r_sequence] = (r_spectrum, r_mass_diff_abs) 45 | # print(r_spectrum) 46 | 47 | new_results = [] 48 | for result in results: 49 | r_spectrum = get_title(result['spectrum']) 50 | r_sequence = str(result['candidates'][0][1]) 51 | if r_spectrum == tmp_dict[r_sequence][0]: 52 | new_results.append(result) 53 | return new_results 54 | 55 | def get_subset(results, settings, fdr=0.01): 56 | """Filter results to given FDR using top 1 candidates""" 57 | subset = aux.filter(results, key=lambda x: x['e-values'][0], 58 | is_decoy = lambda x: x['candidates'][0][2] == 'd', 59 | fdr=fdr) 60 | return subset 61 | 62 | def optimization(fname, settings): 63 | settings = settings.copy() 64 | settings.set('misc', 'first stage', '') 65 | efc = settings.get('scoring', 'e-values for candidates') 66 | settings.set('scoring', 'e-values for candidates', 1) 67 | left = settings.getfloat('search', 'precursor accuracy left') 68 | right = settings.getfloat('search', 
'precursor accuracy right') 69 | wide = settings.getboolean('optimization', 'increase precursor mass tolerance') 70 | if settings.get('search', 'precursor accuracy unit') != 'ppm': 71 | left *= 1000 72 | right *= 1000 73 | if left < 100 and wide: 74 | settings.set('search', 'precursor accuracy left', 100) 75 | if right < 100 and wide: 76 | settings.set('search', 'precursor accuracy right', 100) 77 | # settings.set('search', 'precursor accuracy unit', 'ppm') 78 | results = list(process_file(fname, settings, initial_run=False)) 79 | filtered = get_subset(results, settings, fdr=0.01) 80 | filtered = get_peptides_subset(filtered) 81 | logger.info('%s PSMs with 1%% FDR.', len(filtered)) 82 | if len(filtered) < 250: 83 | if len(filtered) < 250: 84 | logger.warning('OPTIMIZATION ABORTED') 85 | return settings 86 | else: 87 | functions = [precursor_mass_optimization, fragment_mass_optimization, 88 | missed_cleavages_optimization] 89 | else: 90 | functions = [ 91 | rt_filtering, 92 | # precursor_mass_optimization, 93 | fragment_mass_optimization, 94 | # missed_cleavages_optimization 95 | ] 96 | for func in functions: 97 | # settings = func(filtered, settings, get_subset(results, settings, fdr=100.0)) 98 | settings = func(filtered, settings, [x for x in results if x['candidates'][0][2] == 'd']) 99 | settings.set('scoring', 'e-values for candidates', efc) 100 | settings.set('scoring', 'best peptides', [str(res['candidates'][0][1]) for res in results]) 101 | return settings 102 | 103 | 104 | def charge_optimization(results, settings): 105 | settings = settings.copy() 106 | chargestates = np.array([get_info(res['spectrum'], res, settings)[1] for res in results]) 107 | mincharge = chargestates.min() 108 | maxcharge = chargestates.max() 109 | 110 | for ch in range(mincharge, maxcharge+1): 111 | if float(chargestates[chargestates < ch].size) / chargestates.size < 0.01: 112 | mincharge = ch 113 | for ch in range(maxcharge, mincharge-1, -1): 114 | if float(chargestates[chargestates 
> ch].size) / chargestates.size < 0.01: 115 | maxcharge = ch 116 | logger.info('NEW charges = %s:%s', mincharge, maxcharge) 117 | settings.set('search', 'maximum charge', maxcharge) 118 | settings.set('search', 'minimum charge', mincharge) 119 | return settings 120 | 121 | def calibrate_mass(bwidth, mass_left, mass_right, true_md): 122 | bbins = np.arange(-mass_left, mass_right, bwidth) 123 | H1, b1 = np.histogram(true_md, bins=bbins) 124 | b1 = b1 + bwidth 125 | b1 = b1[:-1] 126 | 127 | popt, pcov = curve_fit(noisygaus, b1, H1, p0=[1, np.median(true_md), 1, 1]) 128 | mass_shift, mass_sigma = popt[1], np.abs(popt[2]) 129 | return mass_shift, mass_sigma, pcov[0][0] 130 | 131 | def noisygaus(x, a, x0, sigma, b): 132 | return a * np.exp(-(x - x0) ** 2 / (2 * sigma ** 2)) + b 133 | 134 | def precursor_mass_optimization(results, settings, unf): 135 | settings_nopime = settings.copy() 136 | settings_nopime.set('search', 'precursor isotope mass error', '0') 137 | settings_nopime.set('search', 'shifts', '0') 138 | # results = get_output(results, settings_nopime) 139 | 140 | settings = settings.copy() 141 | mass_left = settings.getfloat('search', 'precursor accuracy left') 142 | mass_right = settings.getfloat('search', 'precursor accuracy right') 143 | massdif = np.array([res['candidates'][0][4]['mzdiff']['ppm'] for res in results]) 144 | massdif = massdif[(massdif > -mass_left) & (massdif < mass_right)] 145 | if settings.get('search', 'precursor accuracy unit') != 'ppm': 146 | mass_left = mass_left * 1e6 / 400 147 | mass_right = mass_right * 1e6 / 400 148 | logger.info('mass_left, mass_right: %s, %s', mass_left, mass_right) 149 | try: 150 | mass_shift, mass_sigma, covvalue = calibrate_mass(0.1, mass_left, mass_right, massdif) 151 | if np.isinf(covvalue): 152 | mass_shift, mass_sigma, covvalue = calibrate_mass(0.01, mass_left, mass_right, massdif) 153 | logger.info('%s, %s -> %s +- 8 * %s; %s', mass_left, mass_right, mass_shift, mass_sigma, covvalue) 154 | best_par_mt_l = 
mass_shift - 8 * mass_sigma 155 | best_par_mt_r = mass_shift + 8 * mass_sigma 156 | logger.info('SMART MASS TOLERANCE = %s:%s', best_par_mt_l, best_par_mt_r) 157 | except RuntimeError: 158 | error = True 159 | else: 160 | error = False 161 | if not error and np.isinf(covvalue): 162 | error = True 163 | logger.warning('Double error when fitting precursor errors: %s', massdif) 164 | print(percentileofscore(massdif, best_par_mt_r) - percentileofscore(massdif, best_par_mt_l), '!!!') 165 | if error or (percentileofscore(massdif, best_par_mt_r) - percentileofscore(massdif, best_par_mt_l) < 95): 166 | best_par_mt_l = scoreatpercentile(massdif, 0.1) 167 | best_par_mt_r = scoreatpercentile(massdif, 99.9) 168 | logger.warning('Percentage sanity check FAILED. Falling back on percentage boundaries') 169 | else: 170 | best_par_mt_l = max(best_par_mt_l, scoreatpercentile(massdif, 0.1)) 171 | best_par_mt_r = min(best_par_mt_r, scoreatpercentile(massdif, 99.9)) 172 | 173 | best_par_mt_l = -10 174 | best_par_mt_r = 10 175 | logger.info('NEW PARENT MASS TOLERANCE = %s:%s', best_par_mt_l, best_par_mt_r) 176 | settings.set('search', 'precursor accuracy left', -best_par_mt_l) 177 | settings.set('search', 'precursor accuracy right', best_par_mt_r) 178 | settings.set('search', 'precursor accuracy unit', 'ppm') 179 | return settings 180 | 181 | def missed_cleavages_optimization(results, settings, unf): 182 | settings = settings.copy() 183 | missedcleavages = np.array([parser.num_sites(str(res['candidates'][0][1]), get_enzyme(str(settings.get('search', 'enzyme')))) 184 | for res in results]) 185 | best_missedcleavages = missedcleavages.max() 186 | for mc in range(best_missedcleavages, -1, -1): 187 | if float(missedcleavages[missedcleavages > mc].size) / missedcleavages.size < 0.002: 188 | best_missedcleavages = mc 189 | logger.info('NEW miscleavages = %s', best_missedcleavages) 190 | settings.set('search', 'number of missed cleavages', best_missedcleavages) 191 | return settings 192 | 193 
def fragment_mass_optimization(results, settings, results_unf):
    """Estimate an optimal fragment mass tolerance (ppm) from current PSMs.

    Collects fragment mass errors for the top candidate of every PSM in
    *results* and takes 4x the 68th percentile (~one sigma) of the error
    distribution as the new tolerance.

    Parameters
    ----------
    results : iterable of dict
        PSMs with 'spectrum' and 'candidates' entries.
    settings : ConfigParser-like
        Search settings; a modified copy is returned, the original is untouched.
    results_unf : unused
        Kept for interface compatibility with the other optimization hooks.

    Returns
    -------
    A copy of *settings* with 'product accuracy ppm' and
    'product accuracy unit' updated.
    """
    settings = settings.copy()

    # aa_mass may already be cached in settings by an earlier stage
    if settings.has_option('misc', 'aa_mass'):
        aa_mass = settings.get('misc', 'aa_mass')
    else:
        aa_mass = get_aa_mass(settings)

    fragmassdif = []
    for res in results:
        neutral_mass, charge_state, RT, comp_voltage = get_info(res['spectrum'], res, settings, aa_mass)
        tres = get_fragment_mass_tol(res['spectrum'], str(res['candidates'][0][1]), settings, charge_state)
        fragmassdif.extend(tres['fmt'])

    fragmassdif = np.array(fragmassdif)

    # 68th percentile approximates one sigma; x4 gives a comfortably wide window
    best_frag_mt = scoreatpercentile(fragmassdif, 68) * 4

    logger.info('NEW FRAGMENT MASS TOLERANCE ppm = %s', best_frag_mt)
    settings.set('search', 'product accuracy ppm', best_frag_mt)
    settings.set('search', 'product accuracy unit', 'ppm')

    return settings


def rt_filtering(results, settings, unf):
    """Train an additive retention-time model on current PSMs and install a
    scoring condition rejecting candidates with implausible RT deviation.

    Returns a modified copy of *settings*; if all experimental RTs are zero
    (RT information missing), the settings are returned unchanged.
    """
    settings = settings.copy()
    if settings.has_option('misc', 'legend'):
        legend = settings.get('misc', 'legend')
    else:
        legend = None
    RTexp, seqs = zip(*[(utils.get_RT(res['spectrum']), res['candidates'][0][1]) for res in results])
    if legend is not None:
        stdl = set(parser.std_labels)
        newseqs = []
        for s in seqs:
            if parser.fast_valid(s):
                newseqs.append(list(s))
            else:
                seq = []
                # BUGFIX: the c-term flag was previously named 'c' and was
                # clobbered by the loop variable 'for c in s', so std_cterm
                # was never appended; use distinct flag names.
                nterm_found, cterm_found = False, False
                for ch in s:
                    if ch in stdl:
                        seq.append(ch)
                    else:
                        mod, aa, term = legend[ch]
                        if aa == '-':
                            if term == '[':
                                seq.append(mod + '-')
                                nterm_found = True
                            else:
                                seq.append('-' + mod)
                                cterm_found = True
                        else:
                            seq.append(mod + aa)
                if not nterm_found:
                    seq.append(parser.std_nterm)
                if not cterm_found:
                    seq.append(parser.std_cterm)
                newseqs.append(seq)
        seqs = newseqs
    RTexp = [float(x) for x in RTexp]
    if np.allclose(RTexp, 0):
        logger.warning('RT is missing. Skipping RT optimization.')
        return settings
    RC_def = achrom.RCs_gilar_rp
    xdict = {}
    for key, val in RC_def['aa'].items():
        xdict[key] = [val, None]
    RC_dict = utils.get_RCs_vary_lcp(seqs, RTexp)
    RC_dict_new = dict()
    for key, val in RC_dict['aa'].items():
        xdict.setdefault(key, [val, None])[1] = val
    # linear map from default RCs to trained RCs, used to extrapolate
    # coefficients for residues absent from the training data
    a, b, _, _ = aux.linear_regression(
        [x[0] for x in xdict.values() if x[1] is not None],
        [x[1] for x in xdict.values() if x[1] is not None])
    for key, x in xdict.items():
        if x[1] is None:
            x[1] = x[0] * a + b
        RC_dict_new[key] = x[1]
    if legend is not None:
        # map modified-residue labels to the RC of the underlying residue
        for k, v in legend.items():
            if len(k) == 1:
                continue
            if k[-1] in '[]':
                if k[-2] == '-':
                    kk = ('-' + k[1:-1]) if k[-1] == ']' else (k[:-1])
                else:
                    kk = k[:-1]
            else:
                kk = k
            logger.debug('%s -> %s', k, kk)
            if kk in RC_dict_new:
                RC_dict_new[v] = RC_dict_new[kk]
            else:
                # guard: kkk could previously be referenced unbound if no
                # branch below matched
                kkk = None
                if kk[-1].isupper():
                    kkk = kk[-1]
                elif kk[-1] == '-':
                    kkk = parser.std_nterm
                elif kk[0] == '-':
                    kkk = parser.std_cterm
                RC_dict_new[v] = RC_dict_new.get(kkk, 0)
                logger.info('No RC for %s, using %s or 0: %s', kk, kkk, RC_dict_new[v])

    RC_dict['aa'] = RC_dict_new

    logger.debug('RC dict: %s', RC_dict)
    rtexp = np.array([np.mean(x) for x in RTexp])
    rttheor = np.array([calculate_RT(pep, RC_dict, raise_no_mod=False)
                        for pep in seqs])
    deltaRT = rtexp - rttheor
    logger.debug('Linear regression: %s', aux.linear_regression(rtexp, rttheor))
    # keep the central 99.9% of the deltaRT distribution
    best_RT_l = scoreatpercentile(deltaRT, 0.05)
    best_RT_r = scoreatpercentile(deltaRT, 99.95)

    def condition(spectrum, cand, _, stored_value=False):
        # memoize the predicted RT for this candidate via stored_value
        if not stored_value:
            stored_value = calculate_RT(cand, RC_dict)
        rtd = spectrum['RT'] - stored_value
        return best_RT_l <= rtd <= best_RT_r, stored_value
    settings.set('scoring', 'condition', condition)
    return settings


def calibrate_mass(bwidth, mass_left, mass_right, true_md):
    """Fit a Gaussian + constant background to the mass-error histogram.

    Parameters
    ----------
    bwidth : float
        Histogram bin width.
    mass_left, mass_right : float
        Window boundaries; bins span [-mass_left, mass_right).
    true_md : array-like
        Observed mass differences.

    Returns
    -------
    (mass_shift, mass_sigma, covariance[0][0])
    """
    bbins = np.arange(-mass_left, mass_right, bwidth)
    H1, b1 = np.histogram(true_md, bins=bbins)
    b1 = b1 + bwidth
    b1 = b1[:-1]

    popt, pcov = curve_fit(noisygaus, b1, H1, p0=[1, np.median(true_md), 1, 1])
    mass_shift, mass_sigma = popt[1], np.abs(popt[2])
    return mass_shift, mass_sigma, pcov[0][0]

def noisygaus(x, a, x0, sigma, b):
    """Gaussian of amplitude *a*, center *x0*, width *sigma* on a constant
    background *b*; model function for calibrate_mass."""
    return a * np.exp(-(x - x0) ** 2 / (2 * sigma ** 2)) + b
def get_label(modmass, labels):
    """Return a short letter label ('aaa', 'aab', ...) for a modification mass.

    *labels* maps already-seen masses to labels and carries the counters
    'i', 'j', 'k' of a three-digit base-26 counter.

    Returns (label, labels, flag) where flag is 1 when a new label was
    generated and 0 when *modmass* was already known.
    """
    abt = string.ascii_lowercase
    abt_l = len(abt) - 1
    if modmass in labels:
        return labels[modmass], labels, 0
    labels[modmass] = abt[labels['i']] + abt[labels['j']] + abt[labels['k']]
    # advance the counter with carry: k -> j -> i
    labels['k'] += 1
    if labels['k'] > abt_l:
        labels['k'] = 0
        labels['j'] += 1
        if labels['j'] > abt_l:
            labels['j'] = 0
            labels['i'] += 1
    return labels[modmass], labels, 1


def process_mods(settings, spec, name, labels):
    """Parse a 'mass@aa,mass@aa' modification spec from the command line and
    store the equivalent labelled entries in the *settings* section
    'modifications' under key *name*.

    '[' / ']' target residues denote protein N-/C-terminal modifications.
    """
    mods_array = []
    if spec:
        for mod in spec.split(','):
            modmass, modaa = mod.split('@')
            lbl, labels, flag = get_label(modmass, labels)
            if modaa == '[':
                ntermlabel, modaa, ctermlabel = '-', '', ''
            elif modaa == ']':
                ntermlabel, modaa, ctermlabel = '', '', '-'
            else:
                ntermlabel, ctermlabel = '', ''
            mods_array.append(ctermlabel + lbl + modaa + ntermlabel)
            if flag:
                settings.set('modifications', lbl, modmass)
    # an explicitly given empty spec ('') still overrides the config value
    if mods_array or spec is not None:
        settings.set('modifications', name, ','.join(mods_array))


def _update(settings, section, name, value):
    """Set settings[section][name] = value, but only when value is not None
    (i.e. the corresponding command-line option was given)."""
    if value is not None:
        settings.set(section, name, value)


def run():
    """Command-line entry point: parse arguments, build settings and run the
    search (optionally preceded by feature detection / demultiplexing) on
    every input file."""
    parser = argparse.ArgumentParser(
        description='Search proteins using LC-MS/MS spectra',
        epilog='''

    Example usage
    -------------
    $ identipy input.mgf -db human.fasta
    -------------
    ''',
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('file', help='input mzML or MGF file with MS/MS spectra', nargs='+')
    parser.add_argument('-db', help='path to protein FASTA file', metavar='FASTA')
    parser.add_argument('-cfg', help='path to file with parameters', metavar='CONFIG_FILE')
    parser.add_argument('-out', '-o', help='output path', metavar='PATH')
    parser.add_argument('-of', help='output format', metavar='FORMAT')
    parser.add_argument('-sep', help='output column separator (for table format)')
    parser.add_argument('-at', help='Use auto-tuning of search parameters', action='store_true')
    parser.add_argument('-nopwide', help='Do not increase initial precursor mass accuracy for auto-tuning', action='store_true')
    parser.add_argument('-punit', help='precursor mass tolerance unit', metavar='UNIT', choices=['ppm', 'Da'])
    parser.add_argument('-ptol', help='precursor mass tolerance', type=float, metavar='VALUE')
    parser.add_argument('-lptol', help='*left precursor mass tolerance', type=float, metavar='VALUE')
    parser.add_argument('-rptol', help='*right precursor mass tolerance', type=float, metavar='VALUE')
    parser.add_argument('-funit', help='fragment mass tolerance unit', metavar='UNIT', choices=['ppm', 'Da'])
    parser.add_argument('-ftol', help='fragment mass tolerance', type=float, metavar='VALUE')
    parser.add_argument('-fminmz', help='fragment min m/z', type=float, metavar='VALUE')
    parser.add_argument('-lmin', help='min length of peptides', type=int, metavar='N')
    parser.add_argument('-lmax', help='max length of peptides', type=int, metavar='N')
    parser.add_argument('-massmin', help='min mass of peptides', type=float, metavar='VALUE')
    parser.add_argument('-massmax', help='max mass of peptides', type=float, metavar='VALUE')
    parser.add_argument('-e', help='cleavage rule in quotes!. X!Tandem style for cleavage rules', metavar='RULE')
    parser.add_argument('-mc', help='number of missed cleavages', type=int, metavar='N')
    parser.add_argument('-semi', help='include semitryptic peptides', action='store_true')
    parser.add_argument('-noclip', help='Disable clipping of N-terminal methionine', action='store_false', dest='clip_M')
    parser.add_argument('-cmin', help='min precursor charge', type=int, metavar='N')
    parser.add_argument('-cmax', help='max precursor charge', type=int, metavar='N')
    parser.add_argument('-cumin', help='min unknown precursor charge', type=int, metavar='N')
    parser.add_argument('-cumax', help='max unknown precursor charge', type=int, metavar='N')
    parser.add_argument('-ime', help='precursor isotope mass error. The parent ion\
    mass tolerance is expanded by opening up multiple tolerance windows centered\
    on the given number of 13C isotope peaks for a peptide.', type=int, metavar='N')
    parser.add_argument('-shifts', help='shifts. example: 0,16.000,23.000,12')
    parser.add_argument('-snp', help='1 means make SNP changes for ALL peptides', type=int)
    parser.add_argument('-rapid', help='leave only 2000 random spectra for processing', action='store_true')
    parser.add_argument('-mm', help='number of minimum matched ions', type=int, metavar='N')
    parser.add_argument('-ad', help='add decoy', action='store_true')
    parser.add_argument('-prefix', help='decoy prefix')
    parser.add_argument('-infix', help='decoy infix')
    parser.add_argument('-method', help='decoy method; reverse or shuffle', choices=['reverse', 'shuffle'])
    parser.add_argument('-nodeis', help='do not use MS/MS deisotoping', action='store_true')
    parser.add_argument('-deistol', help='deisotope mass accuracy', type=float)
    parser.add_argument('-score', help='used scoring function', choices=['RNHS2', 'RNHS', 'hyperscore', 'morpheusscore'])
    parser.add_argument('-minp', help='minumum peaks in MS/MS spectra', type=int, metavar='N')
    parser.add_argument('-maxp', help='maximum peaks in MS/MS spectra', type=int, metavar='N')
    parser.add_argument('-dyn', help='dynamic range', type=float)
    parser.add_argument('-mfc', help='maximum fragment charge', type=int, metavar='N')
    parser.add_argument('-nproc', help='number of processes. 0 means auto', type=int, metavar='N')
    parser.add_argument('-maxmods', help='maximum variable mods per sequence', type=int, metavar='N')
    parser.add_argument('-ncleave', help='protein nterm cleavage', type=float)
    parser.add_argument('-ccleave', help='protein cterm cleavage', type=float)
    parser.add_argument('-fmods', help='fixed modifications. Format: mass1@aminoacid1,mass2@aminoacid2')
    parser.add_argument('-vmods', help='variable modifications. Format: mass1@aminoacid1,mass2@aminoacid2')
    parser.add_argument('-pmods', help='variable protein terminal modifications')
    parser.add_argument('-tags', help='Add quantitation tags to the pepXML output. Can be tmt10plex, tmt6plex, tmt11plex or custom format label1:mass1,label2:mass2...')
    parser.add_argument('-debug', help='Print debugging messages', action='store_true')
    parser.add_argument('-dino', help='path to Dinosaur JAR file or Biosaur executable. Used for chimeric spectrum processing and MS1 Intensity calculation', default=False)
    parser.add_argument('-dinoargs', help='extra arguments to Dinosaur or Biosaur.', default='')
    parser.add_argument('-sd', '-skipdino', action='store_true', help='Skip feature detection if a feature file is found.')
    parser.add_argument('-demixing', help='Use demixing', action='store_true')
    parser.add_argument('-pif', help='Calculate PIF', action='store_true')

    args = vars(parser.parse_args())
    if args['debug']:
        logging.getLogger('identipy').setLevel(logging.DEBUG)

    if args['cfg']:
        settings = main.settings(args['cfg'])
    else:
        settings = main.settings()

    labels = {'i': 0, 'j': 0, 'k': 0}
    process_mods(settings, args['fmods'], 'fixed', labels)
    process_mods(settings, args['vmods'], 'variable', labels)
    process_mods(settings, args['pmods'], 'protein variable', labels)

    _update(settings, 'input', 'database', args['db'])
    _update(settings, 'search', 'precursor accuracy unit', args['punit'])
    _update(settings, 'search', 'precursor accuracy left', (args['ptol'] if not args['lptol'] else args['lptol']))
    _update(settings, 'search', 'precursor accuracy right', (args['ptol'] if not args['rptol'] else args['rptol']))
    _update(settings, 'search', 'product accuracy unit', args['funit'])
    _update(settings, 'search', 'product accuracy', args['ftol'])
    _update(settings, 'search', 'product minimum m/z', args['fminmz'])
    _update(settings, 'search', 'peptide maximum length', args['lmax'])
    _update(settings, 'search', 'peptide minimum length', args['lmin'])
    _update(settings, 'search', 'peptide maximum mass', args['massmax'])
    _update(settings, 'search', 'peptide minimum mass', args['massmin'])
    _update(settings, 'search', 'enzyme', args['e'])
    _update(settings, 'search', 'number of missed cleavages', args['mc'])
    _update(settings, 'search', 'semitryptic', args['semi'])
    _update(settings, 'search', 'clip N-terminal methionine', str(args['clip_M']))
    _update(settings, 'search', 'maximum charge', args['cmax'])
    _update(settings, 'search', 'minimum charge', args['cmin'])
    _update(settings, 'search', 'maximum unknown charge', args['cumax'])
    _update(settings, 'search', 'minimum unknown charge', args['cumin'])
    _update(settings, 'search', 'precursor isotope mass error', args['ime'])
    _update(settings, 'search', 'shifts', args['shifts'])
    _update(settings, 'search', 'snp', args['snp'])
    _update(settings, 'output', 'minimum matched', args['mm'])
    if args['ad']:
        _update(settings, 'input', 'add decoy', 'yes')
    if args['rapid']:
        _update(settings, 'search', 'rapid_check', 1)
    _update(settings, 'input', 'decoy prefix', args['prefix'])
    _update(settings, 'input', 'decoy infix', args['infix'])
    _update(settings, 'input', 'decoy method', args['method'])
    if args['nodeis']:
        _update(settings, 'input', 'deisotope', 'no')
    _update(settings, 'input', 'deisotoping mass tolerance', args['deistol'])
    if args['score']:
        _update(settings, 'scoring', 'score', 'identipy.scoring.' + args['score'])
    _update(settings, 'scoring', 'minimum peaks', args['minp'])
    _update(settings, 'scoring', 'maximum peaks', args['maxp'])
    _update(settings, 'scoring', 'dynamic range', args['dyn'])
    _update(settings, 'scoring', 'maximum fragment charge', args['mfc'])
    _update(settings, 'performance', 'processes', args['nproc'])
    _update(settings, 'modifications', 'maximum variable mods', args['maxmods'])
    _update(settings, 'modifications', 'protein nterm cleavage', args['ncleave'])
    _update(settings, 'modifications', 'protein cterm cleavage', args['ccleave'])
    _update(settings, 'output', 'path', args['out'])
    _update(settings, 'output', 'format', args['of'])
    _update(settings, 'output', 'separator', args['sep'])
    _update(settings, 'output', 'tags', args['tags'])
    if args['at']:
        ao_setting = 'identipy.extras.optimization'
        if args['nopwide']:
            _update(settings, 'optimization', 'increase precursor mass tolerance', 'no')
    else:
        ao_setting = None
    _update(settings, 'misc', 'first stage', ao_setting)

    dino_path = args['dino']
    demixing = args['demixing']
    calc_PIF = args['pif']
    logger.debug('Args: %s', args)

    for inputfile in args['file']:
        # each file gets a fresh copy so per-file optimization cannot leak
        csettings = copy.deepcopy(settings)

        if dino_path or calc_PIF:
            logger.info('Starting mzML analysis...')
            if os.path.splitext(inputfile)[1].lower() != '.mzml':
                if dino_path:
                    logger.error('Only mzML supported for Dinosaur!')
                elif calc_PIF:
                    logger.error('mzML required for PIF calculation!')
            else:
                try:
                    if dino_path:
                        path_to_features = os.path.splitext(inputfile)[0] + os.extsep + 'features' + os.extsep + 'tsv'
                        if not args['sd'] or not os.path.exists(path_to_features):
                            if dino_path.endswith('.jar'):
                                advpath = '--advParams=' + os.path.join(os.path.dirname(os.path.realpath(__file__)), 'adv.txt')
                                logger.info('Starting Dinosaur...')
                                # BUGFIX: args['dinoargs'] is a string; concatenating
                                # it to a list raised TypeError. Split it like the
                                # other branches do.
                                subprocess.run(['java', '-Djava.awt.headless=true', '-jar', os.path.realpath(dino_path), advpath, '--concurrency=12', inputfile] + shlex.split(args['dinoargs']))
                            elif 'dinosaur' in dino_path:
                                advpath = '--advParams=' + os.path.join(os.path.dirname(os.path.realpath(__file__)), 'adv.txt')
                                logger.info('Starting Dinosaur...')
                                subprocess.run([os.path.realpath(dino_path), advpath, '--concurrency=12', inputfile] + shlex.split(args['dinoargs']))
                            elif 'biosaur2' in dino_path:
                                logger.info('Starting biosaur2...')
                                cmd = [os.path.realpath(dino_path), inputfile, '-o', path_to_features] + shlex.split(args['dinoargs'])
                                logger.debug('Running command: %s', cmd)
                                subprocess.run(cmd)
                            else:
                                logger.info('Starting Biosaur...')
                                subprocess.run([os.path.realpath(dino_path), inputfile, '-out', path_to_features] + shlex.split(args['dinoargs']))
                        if demixing:
                            logger.info('Starting demultiplexing...')
                    else:
                        path_to_features = False
                    path_to_mgf = utils.demix_chimeric(path_to_features, inputfile, demixing, calc_PIF)
                    logger.info('MGF was created.')
                    if demixing:
                        logger.info('Demultiplexing has finished.')
                    utils.write_output(path_to_mgf, csettings, main.process_file(path_to_mgf, csettings))
                except Exception as e:
                    logger.error(e)
                    # NOTE(review): break aborts all remaining input files on the
                    # first failure — confirm this is intended (vs. continue)
                    break

        else:
            utils.write_output(inputfile, csettings, main.process_file(inputfile, csettings))


if __name__ == '__main__':
    run()
def prepare_peptide_processor(fname, settings):
    """Read spectra from *fname*, preprocess them and distribute them over
    per-process buckets sorted by neutral mass.

    Returns
    -------
    (kwargs, global_data) where *kwargs* holds scoring callables and
    tolerances shared by all workers, and *global_data* is a list with one
    dict of parallel arrays (spectra, titles, nmasses, charges, effcharges,
    title->spectrum map) per worker process.
    """
    n_proc = utils.get_nprocesses(settings)
    global_data = [
        {
            'spectra': [],
            'titles': [],
            'nmasses': [],
            'nmasses_set': set(),
            't2s': {},
            'charges': [],
            'effcharges': [],
            'fulls_global': {},
        }
        for _ in range(n_proc)
    ]
    logger.debug('global data: %s', len(global_data))

    try:
        fast_first_stage = settings.getint('misc', 'fast first stage')
    except Exception:  # option missing or malformed: disable fast first stage
        fast_first_stage = 0

    # maximum effective fragment charge for each precursor charge
    maxcharges = {}
    fcharge = settings.getint('scoring', 'maximum fragment charge')
    ch_range = range(settings.getint('search', 'minimum charge'),
                     1 + settings.getint('search', 'maximum charge'))
    for c in ch_range:
        maxcharges[c] = max(1, min(fcharge, c - 1) if fcharge else c - 1)

    params = {}
    params['maxpeaks'] = settings.getint('scoring', 'maximum peaks')
    params['minpeaks'] = settings.getint('scoring', 'minimum peaks')
    params['dynrange'] = settings.getfloat('scoring', 'dynamic range')
    params['acc'] = settings.getfloat('search', 'product accuracy')
    params['min_mz'] = settings.getfloat('search', 'product minimum m/z')
    params.update(utils._charge_params(settings))
    params['dacc'] = settings.getfloat('input', 'deisotoping mass tolerance')
    params['deisotope'] = settings.getboolean('input', 'deisotope')
    params['tags'] = utils.get_tags(settings.get('output', 'tags'))
    rapid_check = settings.getint('search', 'rapid_check')

    ptol_unit = settings.get('search', 'precursor accuracy unit')
    lptol = settings.getfloat('search', 'precursor accuracy left')
    rptol = settings.getfloat('search', 'precursor accuracy right')
    # the Da-binned spectral prefilter is currently disabled
    prec_acc_Da = False

    logger.info('Reading spectra ...')
    if not rapid_check:
        tmp_spec = utils.iterate_spectra(fname)
    else:
        # rapid mode: sample at most 2000 random spectra
        tmp_spec = list(utils.iterate_spectra(fname))
        if len(tmp_spec) >= 2000:
            tmp_spec = random.sample(tmp_spec, 2000)

    num_spectra = 0
    tmp_spec2 = []
    nmasses_tmp = []
    global_data_index_map = {}

    for spec in tmp_spec:
        ps = utils.preprocess_spectrum(spec, params)
        if ps is not None:
            tmp_spec2.append(ps)
            for m, c in utils.neutral_masses(ps, params):
                nmasses_tmp.append(m)

    nmasses_tmp = np.array(nmasses_tmp)
    idx_t = np.argsort(nmasses_tmp)
    max_nmass = nmasses_tmp[idx_t[-1]]
    # assign contiguous mass-sorted slices of (nearly) equal size to workers
    max_l = int(len(nmasses_tmp) / n_proc) + 1
    for idx, k in enumerate(idx_t):
        global_data_index_map[k] = idx // max_l
    logger.debug('nproc: %d, nmasses: %d, max_l: %d, maximum index: %d',
                 n_proc, nmasses_tmp.size, max_l, max(global_data_index_map.values()))

    for ps in tmp_spec2:
        ttl = utils.get_title(ps)
        for m, c in utils.neutral_masses(ps, params):
            global_data_index = global_data_index_map[num_spectra]
            effc = maxcharges[c]
            ps.setdefault('nm', {})[c] = m

            gd = global_data[global_data_index]
            gd['t2s'][ttl] = ps
            gd['nmasses'].append(m)
            gd['spectra'].append(ps)
            gd['titles'].append(ttl)
            gd['charges'].append(c)
            gd['effcharges'].append(effc)

            num_spectra += 1
    logger.info('%s spectra pass quality criteria.', num_spectra)

    # widest possible precursor window in Da, used for integer binning
    if ptol_unit != 'Da':
        max_prec_acc_Da = max_nmass * 1e-6 * max(abs(lptol), abs(rptol))
    else:
        max_prec_acc_Da = max(abs(lptol), abs(rptol))

    for global_data_index in range(n_proc):
        gd = global_data[global_data_index]
        i = np.argsort(gd['nmasses'])
        gd['nmasses'] = np.array(gd['nmasses'])[i]
        gd['spectra'] = np.array(gd['spectra'])[i]
        gd['titles'] = np.array(gd['titles'])[i]
        gd['charges'] = np.array(gd['charges'])[i]
        gd['effcharges'] = np.array(gd['effcharges'])[i]

        # integer mass bins (plus/minus one bin) for a fast membership prefilter
        tmp = (gd['nmasses'] / max_prec_acc_Da).astype(int)
        gd['nmasses_set'].update(tmp)
        gd['nmasses_set'].update(tmp + 1)
        gd['nmasses_set'].update(tmp - 1)

        if prec_acc_Da:
            # optional Da-binned index: bin -> fragment m/z key -> spectrum idx
            nmasses_conv = (gd['nmasses'] / prec_acc_Da).astype(int)
            tmp_dict = {}
            for idx, nm in enumerate(nmasses_conv):
                for key in (nm - 1, nm, nm + 1):
                    tmp_dict.setdefault(key, {})
                for spval in gd['spectra'][idx]['idict']:
                    for key in (nm - 1, nm, nm + 1):
                        tmp_dict[key].setdefault(spval, []).append(idx)
            del nmasses_conv
            gd['nmasses_set'] = tmp_dict

    utils.set_mod_dict(settings)

    aa_mass = utils.get_aa_mass(settings)
    score = utils.import_(settings.get('scoring', 'score'))
    # BUGFIX: score_fast_basic could be left undefined when the import below
    # failed, yet it is unconditionally returned; initialize both to False.
    score_fast = False
    score_fast_basic = False
    try:
        score_fast_name = settings.get('scoring', 'score') + '_fast'
        logger.debug('Fast score name: %s', score_fast_name)
        if score_fast_name in {'identipy.scoring.RNHS_fast', 'RNHS_fast'}:
            try:
                from .cutils import RNHS_fast as score_fast
                from .cutils import RNHS_fast_basic as score_fast_basic
            except ImportError as e:
                logger.warning('Could not import from cutils: %s', e.args)
                score_fast = utils.import_(settings.get('scoring', 'score') + '_fast')
                score_fast_basic = utils.import_(settings.get('scoring', 'score') + '_fast_basic')
        else:
            score_fast = utils.import_(settings.get('scoring', 'score') + '_fast')
            score_fast_basic = utils.import_(settings.get('scoring', 'score') + '_fast_basic')
    except Exception as e:
        score_fast = False
        logger.debug('No fast score imported: %s', e)

    acc_l = settings.getfloat('search', 'precursor accuracy left')
    acc_r = settings.getfloat('search', 'precursor accuracy right')
    acc_frag = settings.getfloat('search', 'product accuracy')
    frag_unit = settings.get('search', 'product accuracy unit')
    if frag_unit == 'ppm':
        acc_frag_ppm = settings.getfloat('search', 'product accuracy ppm')
    else:
        acc_frag_ppm = False
    unit = settings.get('search', 'precursor accuracy unit')
    rel = utils.relative(unit)

    if settings.has_option('scoring', 'condition'):
        cond = settings.get('scoring', 'condition')
    else:
        cond = None
    if isinstance(cond, str) and cond.strip():
        cond = utils.import_(cond)

    return {'rel': rel, 'aa_mass': aa_mass,
            'acc_l': acc_l, 'acc_r': acc_r, 'acc_frag': acc_frag, 'acc_frag_ppm': acc_frag_ppm,
            'unit': unit,
            'fast first stage': fast_first_stage,
            'sapime': utils.get_shifts_and_pime(settings),
            'cond': cond, 'score': score, 'score_fast': score_fast, 'score_fast_basic': score_fast_basic,
            'settings': settings, 'max_v': num_spectra, 'prec_acc_Da': prec_acc_Da, 'max_prec_acc_Da': max_prec_acc_Da}, global_data


def peptide_processor_iter_isoforms(peptide, best_res, global_data_local, **kwargs):
    """Isoform-iteration API wrapper: run peptide_processor and wrap a hit in
    a one-element list (None is propagated for misses)."""
    res = peptide_processor(peptide, best_res, global_data_local, **kwargs)
    if res:
        return [res, ]
def peptide_processor(peptide, best_res, global_data_local, **kwargs):
    """Match one candidate peptide against all spectra in this worker's bucket.

    Parameters
    ----------
    peptide : tuple (seqm, aachange_pos, snp_label, m)
        Modified sequence, SNP position, SNP label and neutral mass.
    best_res : dict
        Best negated score seen so far per spectrum title; candidates that
        cannot beat it are skipped early.
    global_data_local : dict
        This worker's parallel arrays built by prepare_peptide_processor.

    Returns
    -------
    (seqm, m, snp_label, results) when at least one spectrum matched, where
    each result is (score, title, precursor charge, score_info); otherwise
    None (implicitly).
    """
    spectra = global_data_local['spectra']
    titles = global_data_local['titles']
    nmasses = global_data_local['nmasses']
    nmasses_set = global_data_local['nmasses_set']
    charges = global_data_local['charges']
    effcharges = global_data_local['effcharges']
    seqm, aachange_pos, snp_label, m = peptide

    max_prec_acc_Da = kwargs.get('max_prec_acc_Da')
    nterm_mass = kwargs.get('nterm_mass')
    cterm_mass = kwargs.get('cterm_mass')
    settings = kwargs['settings']
    min_matched = kwargs['min_matched']

    # precursor tolerance in Da (ppm tolerances scale with the peptide mass)
    if kwargs['rel']:
        dm_l = kwargs['acc_l'] * m / 1.0e6
        dm_r = kwargs['acc_r'] * m / 1.0e6
    else:
        dm_l = kwargs['acc_l']
        dm_r = kwargs['acc_r']

    # indices of spectra whose neutral mass matches m for any shift/PIME
    # window; the integer-bin set check avoids useless binary searches
    idx = set()
    for shift in kwargs['sapime']:
        if int((m + shift) / max_prec_acc_Da) in nmasses_set:
            start = nmasses.searchsorted(m + shift - dm_l)
            end = nmasses.searchsorted(m + shift + dm_r, side='right')
            if end - start:
                idx.update(range(start, end))

    if kwargs['cond']:
        # user-supplied condition (e.g. RT filter); stored_value memoizes
        # per-peptide work across spectra
        stored_value = False
        idx2 = set()
        for i in idx:
            cond_val, stored_value = kwargs['cond'](spectra[i], seqm, settings, stored_value)
            if cond_val:
                idx2.add(i)
        idx = idx2

    if not idx:
        return

    # theoretical spectra per effective fragment charge; reshaping for the
    # full scorer is deferred until actually needed
    theor = {}
    theoretical_set = {}
    reshaped = {}
    for c in {effcharges[i] for i in idx}:
        theor[c], theoretical_set[c] = theor_spectrum(
            seqm, maxcharge=c, aa_mass=kwargs['aa_mass'], reshape=False,
            acc_frag=kwargs['acc_frag'], nterm_mass=nterm_mass,
            cterm_mass=cterm_mass, nm=m)
        reshaped[c] = False

    results = []
    for i in idx:
        fc = effcharges[i]
        s = spectra[i]
        st = titles[i]
        if kwargs['score_fast']:
            # cheap prefilter: matched-fragment count and quick score bound
            hf = kwargs['score_fast_basic'](s['fastset'], s['idict'], theoretical_set[fc], min_matched)
            if not hf[0] or -hf[1] > best_res.get(st, 0):
                continue
            if kwargs['fast first stage']:
                # first stage keeps the fast score and a stub info dict
                sc = hf[1]
                score = {'match': [], 'sumI': 1, 'dist': [], 'total_matched': 999, 'score_std': 0}
                if -sc <= best_res.get(st, 0) and score.pop('total_matched') >= min_matched:
                    results.append((sc, st, charges[i], score))
                continue
        # full scoring path (shared by fast-prefiltered and non-fast modes)
        if not reshaped[fc]:
            theor[fc] = reshape_theor_spectrum(theor[fc])
            reshaped[fc] = True
        score = kwargs['score'](s, theor[fc], kwargs['acc_frag'], kwargs['acc_frag_ppm'], position=aachange_pos)  # FIXME (?)
        sc = score.pop('score')
        if -sc <= best_res.get(st, 0) and score.pop('total_matched') >= min_matched:
            results.append((sc, st, charges[i], score))

    if results:
        return seqm, m, snp_label, results
def get_fragment_mass_tol(spectrum, peptide, settings, charge_state):
    """Collect fragment-ion mass deviations of *peptide* against *spectrum*.

    Matches the theoretical fragments (within the configured 'product
    accuracy', in Da) to spectrum peaks and returns a dict with:
      'fmt'         -- matched deviations in ppm,
      'fmt_neutral' -- the same deviations in Da,
      'bions'/'yions' -- boolean match masks for singly-charged b/y ions.
    Used downstream to derive an optimal fragment mass tolerance.
    """
    acc = settings.getfloat('search', 'product accuracy')
    int_array = spectrum['intensity array']
    int_array = int_array / int_array.max() * 100

    # Highest fragment charge: bounded by the config cap (if set) and by
    # precursor charge - 1, but never below 1.
    fcharge = settings.getint('scoring', 'maximum fragment charge')
    maxfrag_charge = max(1, min(fcharge, charge_state-1) if fcharge else charge_state-1)

    cterm_mass = settings.getfloat('modifications', 'protein cterm cleavage')
    nterm_mass = settings.getfloat('modifications', 'protein nterm cleavage')
    m = custom_mass(peptide, aa_mass=get_aa_mass(settings), nterm_mass = nterm_mass, cterm_mass = cterm_mass)

    theor, _ = theor_spectrum(peptide, maxcharge=maxfrag_charge, reshape=True, aa_mass=get_aa_mass(settings), acc_frag=acc,
                              nterm_mass = nterm_mass, cterm_mass=cterm_mass, nm=m)
    # Cache a 1-D KD-tree over the peak m/z values on the spectrum itself.
    if '__KDTree' not in spectrum:
        spectrum['__KDTree'] = cKDTree(spectrum['m/z array'].reshape((spectrum['m/z array'].size, 1)))

    dist_total, int_array_total = np.array([]), np.array([])
    dist_total_tmp = np.array([])
    match2 = {}
    for ion, fragments in theor.items():
        n = fragments.size
        # Nearest peak per theoretical fragment; unmatched -> dist == inf.
        dist, ind = spectrum['__KDTree'].query(fragments.reshape((n, 1)), distance_upper_bound=acc)
        mask = (dist != np.inf)

        logger.debug('%s %s %s', spectrum['intensity array'].size, ind.size, ind[mask])
        int_array_total = np.append(int_array_total, spectrum['intensity array'][ind[mask]])

        dist_total = np.append(dist_total, dist[mask] / spectrum['m/z array'][ind[mask]] * 1e6)  # ppm
        dist_total_tmp = np.append(dist_total_tmp, dist[mask])  # Da
        match2[ion] = mask

    yions = match2[('y', 1)]
    bions = match2[('b', 1)]
    new_params = {}
    if dist_total.size:
        new_params['fmt'] = dist_total
        new_params['fmt_neutral'] = dist_total_tmp
        new_params['bions'] = bions
        new_params['yions'] = yions
    else:
        # Nothing matched: return empty containers so callers can still
        # index the keys.
        new_params['fmt'] = []
        new_params['fmt_neutral'] = []
        new_params['bions'] = []
        new_params['yions'] = []
    return new_params
def get_fragment_mass_tol_ppm(spectrum, peptide, settings, charge_state, acc_ppm):
    """ppm-tolerance variant of get_fragment_mass_tol.

    Same output dict ('fmt' in ppm, 'fmt_neutral' in Da, 'bions'/'yions'
    masks), but matches are accepted within `acc_ppm` ppm; the Da search
    window is derived from acc_ppm at m/z 1500.
    """
    # Da window wide enough to hold acc_ppm at the top of the m/z range.
    acc = acc_ppm * 1500 * 1e-6
    int_array = spectrum['intensity array']
    int_array = int_array / int_array.max() * 100

    # Highest fragment charge: bounded by the config cap (if set) and by
    # precursor charge - 1, but never below 1.
    fcharge = settings.getint('scoring', 'maximum fragment charge')
    maxfrag_charge = max(1, min(fcharge, charge_state-1) if fcharge else charge_state-1)

    cterm_mass = settings.getfloat('modifications', 'protein cterm cleavage')
    nterm_mass = settings.getfloat('modifications', 'protein nterm cleavage')
    m = custom_mass(peptide, aa_mass=get_aa_mass(settings), nterm_mass = nterm_mass, cterm_mass = cterm_mass)
    theor, _ = theor_spectrum(peptide, maxcharge=maxfrag_charge, reshape=True, aa_mass=get_aa_mass(settings), acc_frag=acc,
                              nterm_mass = nterm_mass, cterm_mass=cterm_mass, nm=m)
    if '__KDTree' not in spectrum:
        spectrum['__KDTree'] = cKDTree(spectrum['m/z array'].reshape((spectrum['m/z array'].size, 1)))

    dist_total, int_array_total = np.array([]), np.array([])
    dist_total_tmp = np.array([])
    match2 = {}
    for ion, fragments in theor.items():
        n = fragments.size
        dist, ind = spectrum['__KDTree'].query(fragments.reshape((n, 1)), distance_upper_bound=acc)
        mask = (dist != np.inf)

        # Unmatched queries get ind == n; clip before indexing, then drop
        # everything whose relative deviation exceeds acc_ppm.
        ind = ind.clip(max=spectrum['m/z array'].size-1)
        nacc = np.where(dist / spectrum['m/z array'][ind] * 1e6 > acc_ppm)[0]
        mask[nacc] = False

        logger.debug('%s %s %s', spectrum['intensity array'].size, ind.size, ind[mask])
        int_array_total = np.append(int_array_total, spectrum['intensity array'][ind[mask]])

        dist_total = np.append(dist_total, dist[mask] / spectrum['m/z array'][ind[mask]] * 1e6)  # ppm
        dist_total_tmp = np.append(dist_total_tmp, dist[mask])  # Da
        match2[ion] = mask

    yions = match2[('y', 1)]
    bions = match2[('b', 1)]
    new_params = {}
    if dist_total.size:
        new_params['fmt'] = dist_total
        new_params['fmt_neutral'] = dist_total_tmp
        new_params['bions'] = bions
        new_params['yions'] = yions
    else:
        new_params['fmt'] = []
        new_params['fmt_neutral'] = []
        new_params['bions'] = []
        new_params['yions'] = []
    return new_params
def morpheusscore_fast(spectrum_fastset, spectrum_idict, theoretical_set, min_matched):
    """Cheap Morpheus-style pre-score on binned fragment masses.

    spectrum_fastset -- set of binned experimental fragment keys
    spectrum_idict   -- binned key -> normalized intensity
    theoretical_set  -- {'b': ..., 'y': ...} binned theoretical fragments
    Returns (matched_count, matched_count + matched_intensity_sum),
    or (0, 0) when fewer than min_matched fragments match.
    """
    hits_b = spectrum_fastset.intersection(theoretical_set['b'])
    hits_y = spectrum_fastset.intersection(theoretical_set['y'])
    n_hits = len(hits_b) + len(hits_y)
    if n_hits < min_matched:
        return 0, 0
    intensity_sum = sum(spectrum_idict[fr] for fr in hits_b)
    intensity_sum += sum(spectrum_idict[fr] for fr in hits_y)
    return n_hits, n_hits + intensity_sum
def morpheusscore(spectrum, theoretical, acc, acc_ppm=False, position=False):
    """Morpheus-like score: matched-peak count plus matched intensity fraction.

    spectrum    -- dict with 'm/z array', 'intensity array', 'Isum';
                   '__KDTree' and 'norm' are cached on it here
    theoretical -- {(ion_type, charge): fragment m/z array}
    acc/acc_ppm -- absolute (Da) and optional relative (ppm) tolerance
    position    -- optional 1-based substitution position that must be
                   supported by matched ions, else a zero result is returned
    """
    def _empty():
        # Result returned whenever nothing acceptable matched.
        return {'score': 0, 'match': None, 'sumI': 0, 'dist': [],
                'total_matched': 0, 'score_std': 0}

    if 'norm' not in spectrum:
        spectrum['norm'] = spectrum['Isum']
    mz = spectrum['m/z array']
    if '__KDTree' not in spectrum:
        spectrum['__KDTree'] = cKDTree(mz.reshape((mz.size, 1)))

    tree = spectrum['__KDTree']
    intensities = spectrum['intensity array']
    norm = spectrum['norm']

    score = 0
    total_matched = 0
    sumI = 0
    deviations = []
    match, match2 = {}, {}
    for ion, fragments in theoretical.items():
        dist, ind = tree.query(fragments, distance_upper_bound=acc)
        found = (dist != np.inf)
        if acc_ppm:
            # keep only matches within the relative tolerance
            within = (dist[found] / mz[ind[found]] * 1e6 <= acc_ppm)
        else:
            within = np.ones_like(dist[found], dtype=bool)
        nmatched = within.sum()
        if nmatched:
            total_matched += nmatched
            ion_intensity = intensities[ind[found][within]].sum()
            sumI += ion_intensity
            score += ion_intensity / norm
            deviations.extend(dist[found][within])
        match[ion] = within
        match2[ion] = found

    if not total_matched:
        return _empty()

    if position:
        yions = match2[('y', 1)]
        bions = match2[('b', 1)]
        plen = len(yions) + 1
        if position == 1:
            if not bions[0]:
                return _empty()
        elif position == plen:
            if not yions[0]:
                return _empty()
        elif not (yions[plen - position] and yions[plen - position - 1]) or (bions[position - 1] and bions[position - 2]):
            return _empty()

    score += total_matched
    sumI = np.log10(sumI)
    return {'score': score, 'match': match, 'sumI': sumI, 'dist': deviations,
            'total_matched': total_matched, 'score_std': 0}
def hyperscore_fast(spectrum_fastset, spectrum_idict, theoretical_set, min_matched):
    """Cheap hyperscore-style pre-score on binned fragment masses.

    Returns (matched_count, b!-and-y!-weighted intensity score), or (0, 0)
    when fewer than min_matched fragments match.
    """
    hits_b = spectrum_fastset.intersection(theoretical_set['b'])
    hits_y = spectrum_fastset.intersection(theoretical_set['y'])
    n_b = len(hits_b)
    n_y = len(hits_y)
    n_hits = n_b + n_y
    if n_hits < min_matched:
        return 0, 0
    intensity_sum = sum(spectrum_idict[fr] for fr in hits_b)
    intensity_sum += sum(spectrum_idict[fr] for fr in hits_y)
    # factorial weighting per ion series, as in the hyperscore family
    return n_hits, factorial(n_b) * 100 * intensity_sum * (n_b + n_y) * factorial(n_y)
def hyperscore(spectrum, theoretical, acc, acc_ppm=False, position=False):
    """X!Tandem-like hyperscore for one candidate peptide.

    spectrum    -- dict with 'm/z array' and 'intensity array';
                   '__KDTree' and 'norm' are cached on it here
    theoretical -- {(ion_type, charge): fragment m/z array}
    acc/acc_ppm -- absolute (Da) and optional relative (ppm) tolerance
    position    -- optional 1-based substitution position that must be
                   supported by matched ions, else a zero result is returned
    Returns a dict with 'score', 'match', 'sumI', 'dist', 'total_matched',
    'score_std'.
    """
    if 'norm' not in spectrum:
        spectrum['norm'] = spectrum['intensity array'].max() / 100.
    mz_array = spectrum['m/z array']
    score = 0
    mult = []
    match = {}
    match2 = {}
    total_matched = 0
    sumI = 0
    if '__KDTree' not in spectrum:
        spectrum['__KDTree'] = cKDTree(mz_array.reshape((mz_array.size, 1)))

    dist_all = []
    for ion, fragments in theoretical.items():
        dist, ind = spectrum['__KDTree'].query(fragments, distance_upper_bound=acc)
        mask1 = (dist != np.inf)
        if acc_ppm:
            # Unmatched queries get ind == n; clip before indexing, then
            # drop matches outside the relative tolerance.
            ind = ind.clip(max=mz_array.size-1)
            nacc = np.where(dist / mz_array[ind] * 1e6 > acc_ppm)[0]
            mask2 = mask1.copy()
            mask2[nacc] = False
        else:
            # BUGFIX: was np.ones_like(dist[mask1], dtype=bool), a boolean
            # mask of *match-count* length that was then used to index the
            # full-length ind/dist arrays below -- an IndexError whenever
            # any fragment was unmatched. Use the full-length match mask
            # (same convention as RNHS).
            mask2 = mask1
        nmatched = mask2.sum()
        if nmatched:
            total_matched += nmatched
            mult.append(factorial(nmatched))
            sumi = spectrum['intensity array'][ind[mask2]].sum()
            sumI += sumi
            score += sumi / spectrum['norm']
            dist_all.extend(dist[mask2])
        match[ion] = mask2
        match2[ion] = mask1
    if not total_matched:
        return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0}
    if position:
        yions = match2[('y', 1)]
        bions = match2[('b', 1)]
        plen = len(yions) + 1
        if position == 1:
            if not bions[0]:
                return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0}
        elif position == plen:
            if not yions[0]:
                return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0}
        else:
            if not (yions[plen - position] and yions[plen - position - 1]) or (bions[position - 1] and bions[position - 2]):
                return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0}

    # hyperscore: intensity fraction multiplied by per-series factorials
    for m in mult:
        score *= m
    sumI = np.log10(sumI)

    return {'score': score, 'score_std': 0, 'match': match, 'sumI': sumI, 'dist': dist_all, 'total_matched': total_matched}
def RNHS_ultrafast(spectrum_idict, theoretical_set, min_matched, nm, best_res, allowed_idx, max_v, prec_acc_Da):
    """Ultra-fast pre-filter: which spectra can this peptide still beat?

    spectrum_idict  -- precursor-mass bin -> {binned fragment: [spectrum idx, ...]}
    theoretical_set -- {'b': ..., 'y': ...} binned theoretical fragments
    nm/prec_acc_Da  -- neutral mass and precursor bin width (Da)
    best_res        -- spectrum idx -> current best (negated) score
    allowed_idx     -- candidate spectrum indices to consider
    max_v           -- unused here, kept for interface compatibility
    Returns a set of spectrum indices worth full scoring, or None when the
    precursor bin is empty / too few fragments match overall.
    """
    bucket = spectrum_idict.get(int(nm / prec_acc_Da), None)
    if not bucket:
        return None

    total = 0
    hits_b = dict()
    hits_y = dict()

    # Count, per spectrum index, how many b and y fragments hit it.
    for ion in theoretical_set['b']:
        if ion in bucket:
            for idx in bucket[ion]:
                hits_b[idx] = hits_b.get(idx, 0) + 1
                total += 1

    for ion in theoretical_set['y']:
        if ion in bucket:
            for idx in bucket[ion]:
                hits_y[idx] = hits_y.get(idx, 0) + 1
                total += 1

    if total < min_matched:
        return None

    candidates = set()
    for idx in allowed_idx:
        num_b = hits_b.get(idx, 0)
        num_y = hits_y.get(idx, 0)
        if num_b + num_y < min_matched:
            continue
        # Keep the spectrum if it has no score yet, or if the (negated)
        # factorial-product bound can still beat the current best.
        current_best = best_res.get(idx, 0)
        if not current_best or -factorial(num_b) * factorial(num_y) <= current_best:
            candidates.add(idx)
    return candidates
def RNHS_fast(spectrum_fastset, spectrum_idict, theoretical_set, min_matched):
    """Fast RNHS approximation on binned fragment masses.

    spectrum_fastset is unused (kept for interface compatibility with the
    other *_fast scorers); matching is done against spectrum_idict keys.
    Returns (matched_count, score), or (0, 0) below min_matched.
    """
    def _series_score(ions):
        # Count matches and accumulate intensity for one ion series,
        # weighting the intensity sum by the factorial of the match count.
        n = 0
        total = 0
        for ion in ions:
            if ion in spectrum_idict:
                n += 1
                total += spectrum_idict[ion]
        return n, total * factorial(n)

    n_b, score_b = _series_score(theoretical_set['b'])
    n_y, score_y = _series_score(theoretical_set['y'])
    nmatched = n_b + n_y
    if nmatched < min_matched:
        return 0, 0
    return nmatched, score_b + score_y
def RNHS(spectrum, theoretical, acc, acc_ppm=False, position=False):
    """RNHS score for one candidate peptide.

    Matches theoretical fragments to spectrum peaks within `acc` (Da) or
    `acc_ppm` (ppm) and combines the matched-intensity fraction with the
    factorials of per-series match counts.

    spectrum    -- dict with 'm/z array', 'intensity array', 'Isum';
                   '__KDTree' and 'norm' are cached on it here
    theoretical -- {(ion_type, charge): fragment m/z array}
    position    -- optional 1-based substitution position (SNP search)
    Returns a dict with 'score' plus diagnostics; all-zero on no match.
    """
    if 'norm' not in spectrum:
        spectrum['norm'] = spectrum['Isum']
    mz_array = spectrum['m/z array']
    score = 0
    mult = []
    match = {}
    match2 = {}
    total_matched = 0
    sumI = 0
    if '__KDTree' not in spectrum:
        spectrum['__KDTree'] = cKDTree(mz_array.reshape((mz_array.size, 1)))

    dist_all = []
    for ion, fragments in theoretical.items():
        # nearest peak per theoretical fragment; unmatched -> dist == inf
        dist, ind = spectrum['__KDTree'].query(fragments, distance_upper_bound=acc)
        mask1 = (dist != np.inf)
        if acc_ppm:
            # unmatched queries get ind == n; clip before indexing, then
            # drop matches outside the relative tolerance
            ind = ind.clip(max=mz_array.size-1)
            nacc = np.where(dist / mz_array[ind] * 1e6 > acc_ppm)[0]
            mask2 = mask1.copy()
            mask2[nacc] = False
        else:
            mask2 = mask1
        nmatched = mask2.sum()
        if nmatched:
            total_matched += nmatched
            mult.append(factorial(nmatched))
            sumi = spectrum['intensity array'][ind[mask2]].sum()
            sumI += sumi
            score += sumi
            dist_all.extend(dist[mask2])
        match[ion] = mask2
        match2[ion] = mask2

    # normalize the accumulated intensity by the spectrum total
    score = score / spectrum['norm']

    if not total_matched:
        return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0, 'IPGF': 0, 'IPGF2': 0, 'RNHS': 0}
    if position:
        yions = match2[('y', 1)]
        bions = match2[('b', 1)]
        plen = len(yions)
        if position > plen + 1:
            # substitution position outside the peptide
            return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0, 'IPGF': 0, 'IPGF2': 0, 'RNHS': 0}
        if position == 1:
            if not bions[0]:
                return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0, 'IPGF': 0, 'IPGF2': 0, 'RNHS': 0}
        elif position == plen + 1:
            if not yions[0]:
                return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0, 'IPGF': 0, 'IPGF2': 0, 'RNHS': 0}
        else:
            if not (yions[plen - position + 1] and yions[plen - position]):
                return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0, 'IPGF': 0, 'IPGF2': 0, 'RNHS': 0}

            # NOTE(review): this unconditional return zeroes every
            # non-terminal `position` (the sibling scorers and RNHS2 do not
            # have it) -- looks like leftover/debug code; confirm whether
            # it is intended before relying on positional RNHS scores.
            return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0, 'IPGF': 0, 'IPGF2': 0, 'RNHS': 0}


    for m in mult:
        score *= m

    sumI = np.log10(sumI)

    outscore = score

    return {'score': outscore, 'match': match, 'sumI': sumI, 'dist': dist_all, 'total_matched': total_matched,
            'score_std': 0, 'RNHS': score}
def rank_cor(theoretical_list, experimental_list):
    """Spearman-style rank correlation of two equally ranked lists.

    Returns 0 when fewer than two elements are given.
    """
    n = len(theoretical_list)
    if n <= 1:
        return 0
    top = 6 * sum((float(z1 - z2))**2 for z1, z2 in zip(theoretical_list, experimental_list))
    bottom = n * (n**2 - 1)
    return 1 - top/bottom

import math
def cos_correlation(theoretical_list, experimental_list):
    """Cosine similarity of two intensity vectors.

    Returns 0 for vectors of length <= 1 or when either vector is all-zero.
    """
    top = 0
    if len(theoretical_list) <= 1:
        return 0
    bottom = math.sqrt(sum([numb * numb for numb in theoretical_list])) * \
        math.sqrt(sum([numb * numb for numb in experimental_list]))
    if not bottom:
        return 0

    for i1, i2 in zip(theoretical_list, experimental_list):
        top += i1 * i2

    return top / bottom

def RNHS2_ultrafast(spectrum_idict, theoretical_set, min_matched, nm, best_res, allowed_idx, max_v=None, prec_acc_Da=None):
    # BUGFIX: this wrapper previously forwarded only 6 arguments while
    # RNHS_ultrafast requires 8 (max_v, prec_acc_Da were missing), so every
    # call raised TypeError. Accept and forward them, with defaults for
    # backward compatibility with any shorter call sites.
    return RNHS_ultrafast(spectrum_idict, theoretical_set, min_matched, nm, best_res, allowed_idx, max_v, prec_acc_Da)

def RNHS2_fast(spectrum_fastset, spectrum_idict, theoretical_set, min_matched):
    # thin alias: RNHS2 uses the same fast pre-score as RNHS
    return RNHS_fast(spectrum_fastset, spectrum_idict, theoretical_set, min_matched)
def RNHS2(spectrum, theoretical, acc, acc_ppm=False, position=False):
    """Multi-tolerance RNHS variant.

    Scores the spectrum at 10 progressively tighter fragment tolerances
    (acc/21, acc/19, ..., acc/3; likewise for acc_ppm when given) and
    returns the mean as 'score', the spread as 'score_std', and the
    loosest-tolerance score as 'bions_score_neg'. The diagnostic fields
    ('match', 'sumI', 'dist', 'total_matched') keep the values of the last
    (tightest) iteration.
    """
    mz_array = copy(spectrum['m/z array'])
    # Build the KD-tree on demand like the other scorers do, instead of
    # failing with KeyError when the caller has not cached it yet.
    if '__KDTree' not in spectrum:
        spectrum['__KDTree'] = cKDTree(spectrum['m/z array'].reshape((spectrum['m/z array'].size, 1)))
    KDT = copy(spectrum['__KDTree'])
    s_ia = copy(spectrum['intensity array'])
    s_is = copy(spectrum['Isum'])

    # Query once per ion series at the loosest tolerance; tighter
    # tolerances below only re-filter these cached results.
    query_dict = {}
    for ion, fragments in theoretical.items():
        query_dict[ion] = KDT.query(fragments, distance_upper_bound=acc)

    score_tmp = []
    if not acc_ppm:
        acc_ppm = 0
    for i in range(21, 1, -2):
        accc = acc / i
        accc_ppm = acc_ppm / i
        score = 0
        mult = []
        match = {}
        match2 = {}
        total_matched = 0
        sumI = 0
        dist_all = []
        for ion, fragments in theoretical.items():
            dist, ind = query_dict[ion]
            mask1 = (dist != np.inf)
            if acc_ppm:
                # unmatched queries get ind == n; clip before indexing
                ind = ind.clip(max=mz_array.size-1)
                nacc = np.where(dist / mz_array[ind] * 1e6 > accc_ppm)[0]
                mask2 = mask1.copy()
                mask2[nacc] = False
            else:
                nacc = np.where(dist > accc)[0]
                mask2 = mask1.copy()
                mask2[nacc] = False
            nmatched = mask2.sum()
            if nmatched:
                total_matched += nmatched
                mult.append(factorial(nmatched))
                sumi = s_ia[ind[mask2]].sum()
                sumI += sumi
                score += sumi
                dist_all.extend(dist[mask2])
            match[ion] = mask2
            match2[ion] = mask2
        score = score / s_is
        if total_matched:
            for m in mult:
                score *= m
            sumI = np.log10(sumI)
        score_tmp.append(score)
    # total_matched here is from the tightest tolerance; nothing matched
    # there means no usable identification.
    if not total_matched:
        return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0}
    if position:
        yions = match2[('y', 1)]
        bions = match2[('b', 1)]
        plen = len(yions)
        if position > plen + 1:
            # substitution position outside the peptide
            return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0}
        if position == 1:
            if not bions[0]:
                return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0}
        elif position == plen + 1:
            if not yions[0]:
                return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0}
        else:
            if not (yions[plen - position + 1] and yions[plen - position]):
                return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0}

    score_std = np.std(score_tmp)
    bions_score_neg = score_tmp[0]
    # BUGFIX: removed a stray debug print(score_tmp[0], np.mean(score_tmp))
    # that wrote to stdout for every scored PSM.
    score_tmp = np.mean(score_tmp)
    return {'score': score_tmp, 'score_std': score_std, 'match': match, 'sumI': sumI, 'dist': dist_all, 'total_matched': total_matched,
            'yions_score': 0, 'bions_score': 0, 'yions_score_neg': 0, 'bions_score_neg': bions_score_neg}
0} 557 | else: 558 | if not (yions[plen - position + 1] and yions[plen - position]): 559 | return {'score': 0, 'match': None, 'sumI': 0, 'dist': [], 'total_matched': 0, 'score_std': 0} 560 | 561 | score_std = np.std(score_tmp)# / np.mean(score_tmp) 562 | bions_score_neg = score_tmp[0] 563 | print(score_tmp[0], np.mean(score_tmp)) 564 | score_tmp = np.mean(score_tmp) 565 | return {'score': score_tmp, 'score_std': score_std, 'match': match, 'sumI': sumI, 'dist': dist_all, 'total_matched': total_matched, 566 | 'yions_score': 0, 'bions_score': 0, 'yions_score_neg': 0, 'bions_score_neg': bions_score_neg} 567 | -------------------------------------------------------------------------------- /identipy/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pyteomics import mass, electrochem as ec, auxiliary as aux, fasta, mzml, parser, mgf 3 | import pandas as pd 4 | from itertools import combinations, islice 5 | from collections import defaultdict, Counter 6 | import numpy as np 7 | from multiprocessing import Queue, Process, cpu_count 8 | import string 9 | from copy import copy 10 | try: 11 | from ConfigParser import RawConfigParser 12 | except ImportError: 13 | from configparser import RawConfigParser 14 | import tempfile 15 | import os 16 | import platform 17 | import logging 18 | import itertools as it 19 | try: 20 | from lxml import etree 21 | except ImportError: 22 | etree = None 23 | from time import strftime 24 | from os import path 25 | logger = logging.getLogger(__name__) 26 | 27 | try: 28 | from pyteomics import cmass 29 | except ImportError: 30 | logger.warning('pyteomics.cythonize not found. It is highly recommended for good performance.') 31 | cmass = mass 32 | try: 33 | from . import cparser 34 | except ImportError: 35 | from . 
# Reporter-ion labels and masses for the built-in isobaric tag sets.
# Keys are the names accepted by get_tags(); values map label -> mass (Da).
default_tags = {
    'tmt10plex': {
        'tmt_126': 126.1277261,
        'tmt_127N': 127.1247610,
        'tmt_128C': 128.1344357,
        'tmt_129N': 129.1314706,
        'tmt_130C': 130.1411453,
        'tmt_131': 131.1381802,
        'tmt_127C': 127.1310809,
        'tmt_128N': 128.1281158,
        'tmt_129C': 129.1377905,
        'tmt_130N': 130.1348254
    },
    'tmt11plex': {
        'tmt_126': 126.1277261,
        'tmt_127N': 127.1247610,
        'tmt_127C': 127.1310809,
        'tmt_128N': 128.1281158,
        'tmt_128C': 128.1344357,
        'tmt_129N': 129.1314706,
        'tmt_129C': 129.1377905,
        'tmt_130N': 130.1348254,
        'tmt_130C': 130.1411453,
        'tmt_131': 131.1381802,
        'tmt_131C': 131.144999
    },
    'tmt_pro': {
        'tmt_126': 126.1277261,
        'tmt_127N': 127.1247610,
        'tmt_127C': 127.1310809,
        'tmt_128N': 128.1281158,
        'tmt_128C': 128.1344357,
        'tmt_129N': 129.1314706,
        'tmt_129C': 129.1377905,
        'tmt_130N': 130.1348254,
        'tmt_130C': 130.1411453,
        'tmt_131N': 131.1381802,
        'tmt_131C': 131.1445,
        'tmt_132N': 132.14153,
        'tmt_132C': 132.14785,
        'tmt_133N': 133.14489,
        'tmt_133C': 133.15121,
        'tmt_134N': 134.14824,
    },
    'tmt6plex': {
        'tmt_126': 126.1277261,
        'tmt_127N': 127.1247610,
        'tmt_128C': 128.1344357,
        'tmt_129N': 129.1314706,
        'tmt_130C': 130.1411453,
        'tmt_131': 131.1381802,
    }
}
# TMTpro is marketed as "16plex"; accept both names.
default_tags['tmt16plex'] = default_tags['tmt_pro']


def get_tags(tags):
    """Resolve an isobaric-tag specification to a label -> mass dict.

    `tags` may be the name of a built-in set (see default_tags), or a
    custom 'label:mass,label:mass' string. Falsy input is returned as-is.
    """
    logger.debug('Tags: %s', tags)
    if tags:
        if tags in default_tags:
            return default_tags[tags]
        else:
            ctags = dict()
            for tag in str(tags).split(','):
                # BUGFIX: each comma-separated item is exactly one
                # 'label:mass' pair. The old code iterated over the split
                # result and tried to unpack each *string* into two names,
                # raising ValueError for any custom tag specification.
                lbl, mss = tag.split(':')
                ctags[lbl] = float(mss)
            return ctags
    else:
        return tags
def get_child_for_mods(mods_str, settings, fixed=True, protein=False):
    """Yield pepXML modification elements for a modification spec string.

    mods_str: comma/semicolon-separated modification labels. A '-' at the
        start ("-label") marks a C-terminal modification, at the end
        ("label-") an N-terminal one; '[' / ']' on a residue mod mark
        N-/C-terminal specificity. Mass shifts are looked up in the
        'modifications' section of `settings` under the modification label.
    fixed: True for fixed modifications, False for variable ones.
    protein: if True, bracket specificity refers to the protein terminus
        rather than the peptide terminus.

    Yields lxml 'aminoacid_modification' elements for residue mods and
    'terminal_modification' elements for terminal mods.
    """
    if mods_str:
        for mod in re.split(r'[,;]\s*', mods_str):
            term = False
            if '-' not in mod:
                # Residue modification.
                child_mod = etree.Element('aminoacid_modification')

                t = None
                if '[' in mod:
                    t = 'n'
                elif ']' in mod:
                    t = 'c'
                if t:
                    child_mod.set('protein_terminus' if protein else 'peptide_terminus', t)
                    mod = mod.replace('[', '').replace(']', '')
                mod_label, mod_aa = parser._split_label(mod)
                mod_mass = mass.std_aa_mass.get(mod_aa, 0)
                mod_massdiff = settings.getfloat('modifications', mod_label)

                child_mod.set('aminoacid', mod_aa)
                child_mod.set('massdiff', str(mod_massdiff))
                child_mod.set('mass', str(mod_mass+mod_massdiff))
                child_mod.set('variable', 'Y' if not fixed else 'N')
                yield child_mod
            elif mod[0] == '-':
                # "-label": C-terminal modification.
                term = 'c'
                mod_label = mod[1:]
                mod_term_mass = settings.getfloat('modifications', 'protein cterm cleavage')
            elif mod[-1] == '-':
                # "label-": N-terminal modification.
                term = 'n'
                mod_label = mod[:-1]
                mod_term_mass = settings.getfloat('modifications', 'protein nterm cleavage')

            if term:
                mod_massdiff = settings.getfloat('modifications', mod_label)
                child_mod = etree.Element('terminal_modification')
                child_mod.set('terminus', term)
                child_mod.set('massdiff', str(mod_massdiff))
                # NOTE(review): for fixed terminal mods the reported 'mass'
                # excludes the massdiff itself -- confirm this matches the
                # intended pepXML convention.
                child_mod.set('mass', str((mod_massdiff if not fixed else 0)+mod_term_mass))
                child_mod.set('variable', 'Y' if not fixed else 'N')
                yield child_mod


def custom_mass(sequence, nterm_mass, cterm_mass, **kwargs):
    # Peptide mass with configurable terminal groups: replace the standard
    # H- (1.007825) and -OH (17.002735) contributions assumed by fast_mass
    # with the configured N-/C-terminal masses.
    return cmass.fast_mass(sequence, **kwargs) + (nterm_mass - 1.007825) + (cterm_mass - 17.002735)


def get_RCs(sequences, RTs, lcp=-0.21, term_aa=False, **kwargs):
    # Fit additive retention coefficients (RCs) to peptides and their
    # retention times by multilinear regression (adapted from
    # pyteomics.achrom). `sequences` is expected to be Counter-like dicts
    # (residue -> count); see get_RCs_vary_lcp, which passes them in.
    peptide_lengths = kwargs.get('lengths', np.log([len(peptide) for peptide in sequences]))
    peptide_dicts = sequences#[Counter(peptide) for
peptide in sequences] 165 | 166 | detected_amino_acids = {aa for peptide_dict in peptide_dicts 167 | for aa in peptide_dict} 168 | 169 | # Determine retention coefficients using multidimensional linear 170 | # regression. 171 | composition_array = [] 172 | for idx, pdict in enumerate(peptide_dicts): 173 | loglen = peptide_lengths[idx]#np.log(parser.length(pdict)) 174 | composition_array.append([pdict.get(aa, 0.) * (1. + lcp * loglen) 175 | for aa in detected_amino_acids] + [1.]) 176 | 177 | # Add normalizing conditions for terminal retention coefficients. The 178 | # condition we are using here is quite arbitrary. It implies that the sum 179 | # of N- or C-terminal RCs minus the sum of corresponding internal RCs must 180 | # be equal to zero. 181 | if term_aa: 182 | for term_label in ['nterm', 'cterm']: 183 | normalizing_peptide = [] 184 | for aa in detected_amino_acids: 185 | if aa.startswith(term_label): 186 | normalizing_peptide.append(1.0) 187 | elif (term_label+aa) in detected_amino_acids: 188 | normalizing_peptide.append(-1.0) 189 | else: 190 | normalizing_peptide.append(0.0) 191 | normalizing_peptide.append(0.0) 192 | composition_array.append(normalizing_peptide) 193 | RTs.append(0.0) 194 | 195 | # Use least square linear regression. 196 | RCs, res, rank, s = np.linalg.lstsq(np.array(composition_array), np.array(RTs)) 197 | 198 | # Remove normalizing elements from the RTs vector. 199 | if term_aa: 200 | for term_label in ['nterm', 'cterm']: 201 | RTs.pop() 202 | 203 | # Form output. 204 | RC_dict = {} 205 | RC_dict['aa'] = dict( 206 | zip(list(detected_amino_acids), 207 | RCs[:len(detected_amino_acids)])) 208 | RC_dict['aa'][parser.std_nterm] = 0.0 209 | RC_dict['aa'][parser.std_cterm] = 0.0 210 | RC_dict['const'] = RCs[len(detected_amino_acids)] 211 | RC_dict['lcp'] = lcp 212 | 213 | # Find remaining terminal RCs. 214 | if term_aa: 215 | for term_label in ['nterm', 'cterm']: 216 | # Check if there are terminal RCs remaining undefined. 
217 | undefined_term_RCs = [aa for aa in RC_dict['aa'] 218 | if aa[1:5] != 'term' and term_label + aa not in RC_dict['aa']] 219 | if not undefined_term_RCs: 220 | continue 221 | 222 | # Find a linear relationship between internal and terminal RCs. 223 | defined_term_RCs = [aa for aa in RC_dict['aa'] 224 | if aa[1:5] != 'term' and term_label + aa in RC_dict['aa']] 225 | 226 | a, b, r, stderr = aux.linear_regression( 227 | [RC_dict['aa'][aa] for aa in defined_term_RCs], 228 | [RC_dict['aa'][term_label+aa] for aa in defined_term_RCs]) 229 | 230 | # Define missing terminal RCs using this linear equation. 231 | for aa in undefined_term_RCs: 232 | RC_dict['aa'][term_label + aa] = a * RC_dict['aa'][aa] + b 233 | 234 | return RC_dict 235 | 236 | 237 | def get_RCs_vary_lcp(sequences, RTs, term_aa=False, lcp_range=(-1.0, 1.0), **kwargs): 238 | 239 | labels = kwargs.get('labels') 240 | 241 | best_r = -1.1 242 | best_RC_dict = {} 243 | lcp_accuracy = kwargs.get('lcp_accuracy', 0.1) 244 | 245 | min_lcp = lcp_range[0] 246 | max_lcp = lcp_range[1] 247 | step = (max_lcp - min_lcp) / 10.0 248 | peptide_lengths = np.log([len(peptide) for peptide in sequences]) 249 | peptide_dicts = [Counter(peptide) for peptide in sequences] 250 | while step > lcp_accuracy: 251 | lcp_grid = np.arange(min_lcp, max_lcp, 252 | (max_lcp - min_lcp) / 10.0) 253 | for lcp in lcp_grid: 254 | RC_dict = get_RCs(peptide_dicts, RTs, lcp, term_aa, labels=labels, lengths=peptide_lengths) 255 | regression_coeffs = aux.linear_regression( 256 | RTs, 257 | [calculate_RT(peptide, RC_dict) for peptide in peptide_dicts]) 258 | if regression_coeffs[2] > best_r: 259 | best_r = regression_coeffs[2] 260 | best_RC_dict = dict(RC_dict) 261 | min_lcp = best_RC_dict['lcp'] - step 262 | max_lcp = best_RC_dict['lcp'] + step 263 | step = (max_lcp - min_lcp) / 10.0 264 | 265 | return best_RC_dict 266 | 267 | 268 | def calculate_RT(peptide, RC_dict, raise_no_mod=True): 269 | plen = len(peptide) 270 | peptide_dict = peptide 271 | RT 
= 0.0 272 | for aa in peptide_dict: 273 | if aa not in RC_dict['aa']: 274 | if len(aa) == 1: 275 | raise aux.PyteomicsError('No RC for residue "{}".'.format(aa)) 276 | if (not raise_no_mod) and aa[-1] in RC_dict['aa']: 277 | RT += RC_dict['aa'][aa[-1]] 278 | else: 279 | raise aux.PyteomicsError( 280 | 'Residue "{0}" not found in RC_dict. '.format(aa) + 281 | 'Set raise_no_mod=False to ignore this error ' + 282 | 'and use the RC for "{0}"" instead.'.format(aa[-1])) 283 | else: 284 | RT += RC_dict['aa'][aa] 285 | 286 | length_correction_term = ( 287 | 1.0 + RC_dict.get('lcp', 0) * np.log(plen)) 288 | RT *= length_correction_term 289 | 290 | RT += RC_dict.get('const', 0) 291 | 292 | return RT 293 | 294 | 295 | _modchars = set(string.ascii_lowercase + string.digits) 296 | 297 | 298 | def custom_split_label(mod): 299 | j = 0 300 | while mod[j] in _modchars: 301 | j += 1 302 | if j == 0: 303 | return mod[1:], '-', ']' 304 | if len(mod[j:]) > 1 and '[' in mod: 305 | return mod[:j], mod[j:].replace('[', ''), '[' 306 | elif len(mod[j:]) > 1 and ']' in mod: 307 | return mod[:j], mod[j:].replace(']', ''), ']' 308 | elif len(mod[j:]) == 1: 309 | if mod.startswith('-'): 310 | return mod[:j], '-', ']' 311 | elif mod.endswith('-'): 312 | return mod[:j], '-', '[' 313 | else: 314 | return mod[:j], mod[j:], '' 315 | 316 | 317 | class MS2OnlyMzML(mzml.MzML): 318 | _default_iter_path = '//spectrum[./*[local-name()="cvParam" and @name="ms level" and @value="2"]]' 319 | _use_index = False 320 | _iterative = False 321 | 322 | 323 | def iterate_spectra(fname): 324 | ftype = fname.rsplit('.', 1)[-1].lower() 325 | if ftype == 'mgf': 326 | with mgf.read(fname, read_charges=False, use_index=False) as f: 327 | for x in f: 328 | yield x 329 | elif ftype == 'mzml': 330 | for x in MS2OnlyMzML(source=fname): 331 | yield x 332 | # with mzml.read(fname, use_index=False) as f: 333 | # for x in f: 334 | # if x['ms level'] > 1: 335 | # yield x 336 | else: 337 | raise ValueError('Unrecognized file type: 
{}'.format(ftype))


def get_nprocesses(settings):
    """Return the number of worker processes to use.

    Always 1 on Windows; otherwise the configured
    'performance'/'processes' value, where 0 means autodetect via
    cpu_count() (falling back to 1 if that is unavailable).
    """
    if platform.system() == 'Windows':
        return 1
    n = settings.getint('performance', 'processes')
    if n == 0:
        try:
            n = cpu_count()
        except NotImplementedError:
            n = 1
    return n


def iterate_and_preprocess(fname, params, settings):
    # Stream spectra from `fname` and feed them through preprocess_spectrum
    # on `n` workers.
    # NOTE(review): multimap is called here as multimap(n, func, it,
    # kwargs=params); confirm this matches the signature of the multimap
    # actually in scope (the candidate-search multimap defined later in this
    # file takes a positional `global_data` argument).
    it = iterate_spectra(fname)
    n = get_nprocesses(settings)
    return multimap(n, preprocess_spectrum, it, kwargs=params)


def is_decoy_function(settings):
    """Return a predicate deciding whether a protein description is a decoy.

    Uses 'decoy infix' (substring match) when set, otherwise 'decoy prefix'
    (startswith). When neither label is configured, logs an error and
    implicitly returns None -- callers will fail on the first call.
    """
    prefix = settings.get('input', 'decoy prefix').strip()
    infix = settings.get('input', 'decoy infix').strip()
    if infix:
        return lambda d: infix in d
    if prefix:
        return lambda d: d.startswith(prefix)
    logger.error('No decoy label specified. One of "decoy prefix" or "decoy infix" is needed.')


def peptide_gen(settings, clear_seen_peptides=False):
    """Yield (peptide, terminal-flags) pairs digested from the database.

    Terminal flags: 'n' when the peptide starts its protein, 'c' when it
    ends it ('nc' when both). Optionally clears the module-level
    seen_target/seen_decoy caches before starting.
    """
    if clear_seen_peptides:
        seen_target.clear()
        seen_decoy.clear()
    isdecoy = is_decoy_function(settings)
    enzyme = get_enzyme(settings.get('search', 'enzyme'))
    logger.debug('Using cleavage rule: %s', enzyme)
    semitryptic = settings.getint('search', 'semitryptic')
    mc = settings.getint('search', 'number of missed cleavages')
    minlen = settings.getint('search', 'peptide minimum length')
    maxlen = settings.getint('search', 'peptide maximum length')
    snp = settings.getint('search', 'snp')
    clip_M = settings.getboolean('search', 'clip N-terminal methionine')
    for prot in prot_gen(settings):
        for pep, pos in prot_peptides(prot[1], enzyme, mc, minlen, maxlen,
                is_decoy=isdecoy(prot[0]), snp=snp, desc=prot[0], semitryptic=semitryptic, position=True, clip_M=clip_M):
            # Mark peptides that sit at a protein terminus; this controls
            # which protein-terminal variable mods apply downstream.
            term = ''
            if pos == 0:
                term += 'n'
            if pos + len(pep) == len(prot[1]):
                term += 'c'
            yield pep, term


def peptide_isoforms(settings, clear_seen_peptides=False):
393 | snp = settings.getint('search', 'snp') 394 | maxmods = settings.getint('modifications', 'maximum variable mods') 395 | leg = settings.get('misc', 'legend') 396 | pleg = settings.get('misc', 'plegend') 397 | logger.debug('leg: %s, pleg: %s', leg, pleg) 398 | punct = set(string.punctuation) 399 | nmods = [(p, mod[1], mod[2]) for p, mod in leg.items() if p in punct] 400 | pmods_n, pmods_c = [], [] 401 | for p, mod in pleg.items(): 402 | if p in punct: 403 | if mod[2] == '[': 404 | pmods_n.append((p, mod[1], mod[2])) 405 | if mod[2] == ']': 406 | pmods_c.append((p, mod[1], mod[2])) 407 | logger.debug('nmods: %s', nmods) 408 | logger.debug('pmods_n: %s', pmods_n) 409 | logger.debug('pmods_c: %s', pmods_c) 410 | aa_mass = get_aa_mass(settings) 411 | nterm_mass = settings.getfloat('modifications', 'protein nterm cleavage') 412 | cterm_mass = settings.getfloat('modifications', 'protein cterm cleavage') 413 | for peptide, term in peptide_gen(settings, clear_seen_peptides): 414 | mods = nmods[:] 415 | if 'n' in term: 416 | mods += pmods_n 417 | if 'c' in term: 418 | mods += pmods_c 419 | for form in (custom_isoforms(peptide, variable_mods=mods, maxmods=maxmods, snp=snp) if (nmods and maxmods) else [peptide, ]): 420 | if snp: 421 | if 'snp' not in form: 422 | seqm = form 423 | aachange_pos = False 424 | snp_label = 'wild' 425 | else: 426 | tmp = form.split('snp') 427 | seqm = tmp[0] + tmp[1].split('at')[0].split('to')[-1] + tmp[2] 428 | aachange_pos = len(tmp[0]) + 1 429 | snp_label = tmp[1] 430 | aachange_pos = False 431 | else: 432 | seqm = form 433 | aachange_pos = False 434 | snp_label = False 435 | 436 | m = custom_mass(seqm, aa_mass=aa_mass, nterm_mass=nterm_mass, cterm_mass=cterm_mass) 437 | yield (seqm, aachange_pos, snp_label, m) 438 | 439 | 440 | def prot_gen(settings): 441 | db = settings.get('input', 'database') 442 | # add_decoy = settings.getboolean('input', 'add decoy') 443 | # prefix = settings.get('input', 'decoy prefix') 444 | 445 | with fasta.read(db) 
as f: 446 | for p in f: 447 | yield p 448 | 449 | 450 | def get_peptides(prot_seq, enzyme, mc, minlen, maxlen, semitryptic=False): 451 | peptides = cparser._cleave(prot_seq, enzyme, mc) 452 | for pep, startposition in peptides: 453 | plen = len(pep) 454 | if minlen <= plen <= maxlen: 455 | if not semitryptic: 456 | yield pep, startposition, plen 457 | else: 458 | for i in range(plen-minlen+1): 459 | yield pep[i:], startposition + i, plen - i 460 | for i in range(1, plen-minlen+1, 1): 461 | yield pep[:-i], startposition, plen - i 462 | 463 | 464 | seen_target = set() 465 | seen_decoy = set() 466 | def prot_peptides(prot_seq, enzyme, mc, minlen, maxlen, is_decoy, 467 | dont_use_seen_peptides=False, snp=False, desc=False, position=False, semitryptic=False, clip_M=True): 468 | 469 | dont_use_fast_valid = parser.fast_valid(prot_seq) 470 | methionine_check = (clip_M and prot_seq[0] == 'M') 471 | if snp == 2: 472 | if desc: 473 | try: 474 | tmp = desc.split(' ')[0].split('|') 475 | pos = int(tmp[1]) - 1 476 | aach = tmp[2] 477 | except: 478 | desc = False 479 | # peptides = cparser._cleave(prot_seq, enzyme, mc) 480 | # for pep, startposition in peptides: 481 | # plen = len(pep) 482 | for pep, startposition, plen in get_peptides(prot_seq, enzyme, mc, minlen, maxlen, semitryptic): 483 | loopcnt = 0 484 | if pep not in seen_target and pep not in seen_decoy and (dont_use_fast_valid or parser.fast_valid(pep)): 485 | loopcnt = 1 486 | if methionine_check and startposition == 0: 487 | if minlen <= plen - 2: 488 | loopcnt = 3 489 | elif minlen <= plen - 1: 490 | loopcnt = 2 491 | while loopcnt: 492 | f = pep[loopcnt-1:] 493 | if dont_use_seen_peptides: 494 | if snp == 1: 495 | for ff, seq_new in custom_snp(f, startposition): 496 | if not seq_new: 497 | yield ff if not position else (ff, startposition) 498 | else: 499 | yield ff if not position else (ff, startposition) 500 | else: 501 | yield f if not position else (f, startposition) 502 | else: 503 | if f not in seen_target and f 
not in seen_decoy: 504 | if is_decoy: 505 | seen_decoy.add(f) 506 | else: 507 | seen_target.add(f) 508 | if snp == 1: 509 | for ff, seq_new in custom_snp(f, startposition): 510 | if not seq_new: 511 | yield ff if not position else (ff, startposition) 512 | if seq_new not in seen_decoy and seq_new not in seen_target: 513 | yield ff if not position else (ff, startposition) 514 | elif snp == 2: 515 | if desc and startposition <= pos <= startposition + plen: 516 | if len(aach) == 3 and aach[0] in parser.std_amino_acids and aach[2] in parser.std_amino_acids: 517 | pos_diff = pos - startposition 518 | f = f[:pos_diff] + 'snp%sto%sat%ssnp' % (aach.split('>')[0], aach.split('>')[-1], pos) + f[pos_diff+1:] 519 | yield f if not position else (f, startposition) 520 | else: 521 | yield f if not position else (f, startposition) 522 | else: 523 | yield f if not position else (f, startposition) 524 | loopcnt -= 1 525 | 526 | 527 | def custom_snp(peptide, startposition): 528 | yield peptide, None 529 | j = len(peptide) - 1 530 | while j >= 0: 531 | for aa in parser.std_amino_acids: 532 | if aa != 'L' and aa != peptide[j] and not (aa == 'I' and peptide[j] == 'L'): 533 | aa_label = 'snp%sto%sat%ssnp' % (peptide[j], aa, str(j + startposition)) 534 | out = peptide[:j] + aa_label + peptide[j+1:], peptide[:j] + aa + peptide[j+1:] 535 | yield out 536 | j -= 1 537 | 538 | 539 | def normalize_mods(sequence, settings): 540 | leg = settings.get('misc', 'legend') 541 | if leg: 542 | for char in string.punctuation: 543 | if char in leg: 544 | if leg[char][2] == ']' and leg[char][1] == '-': 545 | sequence = sequence.replace(char, '-' + leg[char][0]) 546 | else: 547 | sequence = sequence.replace(char, ''.join(leg[char][:2])) 548 | return sequence 549 | 550 | 551 | def custom_isoforms(peptide, variable_mods, maxmods=2, nterm=False, cterm=False, snp=False): 552 | if not variable_mods: 553 | yield peptide 554 | else: 555 | to_char = variable_mods[-1][0] 556 | from_char = variable_mods[-1][1] 557 | 
term = variable_mods[-1][2] 558 | sites = [s[0] for s in enumerate(peptide) if (not snp or (s[0] - 4 < 0 or peptide[s[0]-4:s[0]-1] != 'snp')) and (from_char == '-' or s[1] == from_char) and (not term or (term == '[' and s[0] == 0) or (term == ']' and s[0] == len(peptide)-1))] 559 | for m in range(maxmods+1): 560 | for comb in combinations(sites, m): 561 | flag = 0 562 | flag2 = 0 563 | tmpnterm = True if nterm else False 564 | tmpcterm = True if cterm else False 565 | v = '' 566 | cc_prev = 0 567 | for cc in comb: 568 | if from_char == '-': 569 | if term == '[' and not nterm: 570 | flag2 = 1 571 | v += to_char 572 | tmpnterm = True 573 | elif term == ']' and not cterm: 574 | v = v + peptide[cc_prev:cc+1] + to_char 575 | tmpcterm = True 576 | else: 577 | flag = 1 578 | else: 579 | v = v + peptide[cc_prev:cc] + to_char 580 | if not flag2: 581 | cc_prev = cc + 1 582 | if not flag: 583 | v = v + peptide[cc_prev:] 584 | for z in custom_isoforms(v, variable_mods[:-1], maxmods=maxmods - m, nterm=tmpnterm, cterm=tmpcterm, snp=snp): 585 | yield z 586 | 587 | 588 | def remove_precursor(mz_prec, spectrum, acc): 589 | mz = spectrum['m/z array'] 590 | intens = spectrum['intensity array'] 591 | idx = np.full(mz.size, True) 592 | i_l = mz.searchsorted(mz_prec - acc)#mz.size-2 593 | i_r = mz.searchsorted(mz_prec + acc, side='right') 594 | for i in range(i_l, i_r, 1): 595 | idx[i] = False 596 | spectrum['m/z array'] = mz[idx] 597 | spectrum['intensity array'] = intens[idx] 598 | 599 | 600 | def deisotope(spectrum, acc, charge): 601 | # acc = 0.3 602 | mz = spectrum['m/z array'] 603 | intens = spectrum['intensity array'] 604 | 605 | h = 1.0057 606 | i = mz.size-2 607 | skip = set() 608 | add = [] 609 | while i >= 0: 610 | j = min(mz.size-1, mz.searchsorted(mz[i] + 1.5, side='right')) 611 | while j > i: 612 | if intens[i] > intens[j]: 613 | d = mz[j] - mz[i] 614 | if d > 1.5*h: 615 | j -= 1 616 | continue 617 | for z in range(1, charge+1): 618 | if abs(d - 1./z) < acc: 619 | 
skip.add(j) 620 | if z > 1: 621 | # skip.add(i) 622 | add.append((i, z)) 623 | j -= 1 624 | i -= 1 625 | ix = np.delete(np.arange(mz.size, dtype=int), list(skip)) 626 | newmz, newint = [], [] 627 | for i, z in add: 628 | newmz.append(mz[i]*z - (z-1)*h) 629 | newint.append(intens[i]) 630 | # print len(skip), len(add) 631 | mz = np.hstack((mz[ix], newmz)) 632 | intens = np.hstack((intens[ix], newint)) 633 | spectrum['m/z array'] = mz 634 | spectrum['intensity array'] = intens 635 | 636 | 637 | def preprocess_spectrum(spectrum, kwargs): 638 | spectrum = copy(spectrum) 639 | maxpeaks = kwargs['maxpeaks'] 640 | minpeaks = kwargs['minpeaks'] 641 | dynrange = kwargs['dynrange'] 642 | acc = kwargs['acc'] 643 | tags = kwargs['tags'] 644 | 645 | if 'm/z array' not in spectrum: 646 | return None 647 | 648 | _, states = get_expmass(spectrum, kwargs) 649 | if not states: 650 | return None 651 | 652 | if tags: 653 | # TODO optimize performance 654 | max_mass_label_val = max(tags.values()) + 1.0 655 | tmp_idx = np.nonzero(spectrum['m/z array'] <= max_mass_label_val) 656 | tags_res = defaultdict(float) 657 | for tmt_label, tmt_mass in tags.items(): 658 | for t_m, t_i in zip(spectrum['m/z array'][tmp_idx], spectrum['intensity array'][tmp_idx]): 659 | if abs(t_m - tmt_mass) / tmt_mass <= 1e-5: 660 | tags_res[tmt_label] += t_i 661 | for tmt_label, tmt_intensity in tags_res.items(): 662 | spectrum[tmt_label] = tmt_intensity 663 | 664 | if kwargs['deisotope']: 665 | dacc = kwargs['dacc'] 666 | deisotope(spectrum, dacc, states[-1]) 667 | 668 | mz_prec, _ = get_expmass(spectrum, kwargs) 669 | remove_precursor(mz_prec, spectrum, acc) 670 | 671 | mz = spectrum['m/z array'] 672 | 673 | idx = np.nonzero(mz >= kwargs['min_mz']) 674 | spectrum['intensity array'] = spectrum['intensity array'][idx] 675 | mz = mz[idx] 676 | spectrum['intensity array'] = spectrum['intensity array'].astype(np.float32) 677 | 678 | if minpeaks and spectrum['intensity array'].size < minpeaks: 679 | return None 680 | 
681 | spectrum['intensity array'] = spectrum['intensity array'].astype(np.float32) 682 | 683 | if dynrange: 684 | i = spectrum['intensity array'] > spectrum['intensity array'].max( 685 | ) / dynrange 686 | spectrum['intensity array'] = spectrum['intensity array'][i] 687 | mz = mz[i] 688 | 689 | if maxpeaks and minpeaks > maxpeaks: 690 | raise ValueError('minpeaks > maxpeaks: {} and {}'.format( 691 | minpeaks, maxpeaks)) 692 | if maxpeaks and spectrum['intensity array'].size > maxpeaks: 693 | i = np.argsort(spectrum['intensity array'])[-maxpeaks:] 694 | j = np.argsort(mz[i]) 695 | spectrum['intensity array'] = spectrum['intensity array'][i][j] 696 | mz = mz[i][j] 697 | 698 | spectrum['m/z array'] = mz 699 | 700 | if minpeaks and spectrum['intensity array'].size < minpeaks: 701 | return None 702 | 703 | spectrum['Isum'] = spectrum['intensity array'].sum() 704 | 705 | tmp2 = dict() 706 | tmp = spectrum['m/z array'] / acc 707 | tmp = tmp.astype(int) 708 | for idx, mt in enumerate(tmp): 709 | i_val = spectrum['intensity array'][idx] / spectrum['Isum'] 710 | for mz_val_int in (mt-1, mt, mt+1): 711 | if mz_val_int not in tmp2: 712 | tmp2[mz_val_int] = i_val 713 | else: 714 | tmp2[mz_val_int] = max(i_val, tmp2[mz_val_int]) 715 | tmp = np.concatenate((tmp, tmp-1, tmp+1)) 716 | spectrum['fastset'] = set(tmp.tolist()) 717 | spectrum['RT'] = get_RT(spectrum) 718 | spectrum['comp_voltage'] = get_comp_voltage(spectrum) 719 | spectrum['idict'] = tmp2 720 | 721 | spectrum['__KDTree'] = cKDTree(spectrum['m/z array'].reshape((spectrum['m/z array'].size, 1))) 722 | 723 | return spectrum 724 | 725 | 726 | def relative(unit): 727 | if unit == 'ppm': 728 | return True 729 | elif unit in {'Th', 'Da', 'amu'}: 730 | return False 731 | else: 732 | raise ValueError('Unrecognized precursor accuracy unit: ' + unit) 733 | 734 | 735 | def set_mod_dict(settings): 736 | mods = settings.get('modifications', 'variable') 737 | pmods = settings.get('modifications', 'protein variable') 738 | 739 | 
settings.set('modifications', 'variable_original', mods) 740 | settings.set('modifications', 'protein_original', pmods) 741 | i = None 742 | if isinstance(mods, basestring): 743 | mods = mods.strip(' ,') 744 | mod_dict = {} 745 | legend = {} 746 | 747 | if mods: 748 | mods = [custom_split_label(l) for l in re.split(r',\s*', mods)] 749 | mods.sort(key=lambda x: len(x[0]), reverse=True) 750 | for i, (mod, char) in enumerate(zip(mods, string.punctuation), 1): 751 | legend[''.join(mod)] = char 752 | legend[char] = mod 753 | assert all(len(m) == 3 for m in mods), 'unmodified residue given' 754 | for mod, aa, term in mods: 755 | mod_dict.setdefault(mod, []).append(aa) 756 | settings.set('modifications', 'variable', mod_dict) 757 | logger.info('Setting legend: %s', legend) 758 | settings.set('misc', 'legend', legend) 759 | 760 | if isinstance(pmods, basestring): 761 | plegend = {} 762 | pmod_dict = {} 763 | if pmods: 764 | pmods = [custom_split_label(l) for l in re.split(r',\s*', pmods)] 765 | pmods.sort(key=lambda x: len(x[0]), reverse=True) 766 | for mod, char in zip(pmods, string.punctuation[i:]): 767 | plegend[''.join(mod)] = char 768 | plegend[char] = mod 769 | assert all(len(m) == 3 for m in pmods), 'unmodified residue given' 770 | for mod, aa, term in pmods: 771 | pmod_dict.setdefault(mod, []).append(aa) 772 | settings.set('modifications', 'protein variable', pmod_dict) 773 | mod_dict.update(pmod_dict) 774 | settings.set('modifications', 'variable', mod_dict) 775 | settings.set('misc', 'plegend', plegend) 776 | logger.info('Setting plegend: %s', plegend) 777 | 778 | 779 | def get_enzyme(enzyme): 780 | if enzyme in parser.expasy_rules: 781 | return parser.expasy_rules[enzyme] 782 | else: 783 | try: 784 | enzyme = convert_tandem_cleave_rule_to_regexp(enzyme) 785 | return enzyme 786 | except Exception as e: 787 | logger.debug('Exception parsing cleavage rule %s: %s', enzyme, e.args[0]) 788 | return enzyme 789 | 790 | 791 | def 
convert_tandem_cleave_rule_to_regexp(cleavage_rule): 792 | 793 | def get_sense(c_term_rule, n_term_rule): 794 | if '{' in c_term_rule: 795 | return 'N' 796 | elif '{' in n_term_rule: 797 | return 'C' 798 | else: 799 | if len(c_term_rule) <= len(n_term_rule): 800 | return 'C' 801 | else: 802 | return 'N' 803 | 804 | def get_cut(cut, no_cut): 805 | aminoacids = set(parser.std_amino_acids) 806 | cut = ''.join(aminoacids & set(cut)) 807 | if '{' in no_cut: 808 | no_cut = ''.join(aminoacids & set(no_cut)) 809 | return cut, no_cut 810 | else: 811 | no_cut = ''.join(set(parser.std_amino_acids) - set(no_cut)) 812 | return cut, no_cut 813 | 814 | out_rules = [] 815 | for protease in cleavage_rule.split(','): 816 | protease = protease.replace('X', ''.join(parser.std_amino_acids)) 817 | c_term_rule, n_term_rule = protease.split('|') 818 | sense = get_sense(c_term_rule, n_term_rule) 819 | if sense == 'C': 820 | cut, no_cut = get_cut(c_term_rule, n_term_rule) 821 | else: 822 | cut, no_cut = get_cut(n_term_rule, c_term_rule) 823 | 824 | if no_cut: 825 | if sense == 'C': 826 | out_rules.append('([%s](?=[^%s]))' % (cut, no_cut)) 827 | else: 828 | out_rules.append('([^%s](?=[%s]))' % (no_cut, cut)) 829 | else: 830 | if sense == 'C': 831 | out_rules.append('([%s])' % (cut, )) 832 | else: 833 | out_rules.append('(?=[%s])' % (cut, )) 834 | return '|'.join(out_rules) 835 | 836 | 837 | class CustomRawConfigParser(RawConfigParser, object): 838 | def get(self, section, option, **kwargs): 839 | val = super(CustomRawConfigParser, self).get(section, option) 840 | if isinstance(val, basestring): 841 | if section == 'search' and option == 'enzyme': 842 | return val.split('|class')[0] 843 | return val[::-1].split('|', 1)[-1][::-1] 844 | return val 845 | 846 | def get_choices(self, section, option): 847 | val = super(CustomRawConfigParser, self).get(section, option) 848 | if isinstance(val, basestring) and len(val.split('|')) > 1: 849 | return val[::-1].split('|', 1)[0][::-1] 850 | else: 851 | 
return '' 852 | 853 | def copy(self): 854 | new_config = CustomRawConfigParser() 855 | for section in self.sections(): 856 | new_config.add_section(section) 857 | for name, value in self.items(section): 858 | new_config.set(section, name, value) 859 | return new_config 860 | 861 | 862 | def find_nearest(array, value): 863 | return (np.abs(np.array(array) - value)).argmin() 864 | 865 | 866 | def _charge_params(settings): 867 | params = {} 868 | params['maxcharge'] = settings.getint('search', 'maximum charge') or None 869 | params['mincharge'] = settings.getint('search', 'minimum charge') or None 870 | if settings.has_option('search', 'minimum unknown charge') and settings.getint('search', 'minimum unknown charge'): 871 | params['min_ucharge'] = max(settings.getint('search', 'minimum unknown charge'), params['mincharge']) 872 | else: 873 | params['min_ucharge'] = params['mincharge'] 874 | if settings.has_option('search', 'maximum unknown charge') and settings.getint('search', 'maximum unknown charge'): 875 | params['max_ucharge'] = min(settings.getint('search', 'maximum unknown charge'), params['maxcharge']) 876 | else: 877 | params['max_ucharge'] = params['maxcharge'] 878 | return params 879 | 880 | 881 | def get_info(spectrum, result, settings, aa_mass=None): 882 | 'Returns neutral mass, charge state and retention time of the top candidate' 883 | if not aa_mass: 884 | aa_mass = get_aa_mass(settings) 885 | RT = spectrum['RT']#get_RT(spectrum) 886 | comp_voltage = spectrum['comp_voltage'] 887 | 888 | params = _charge_params(settings) 889 | 890 | masses, states = zip(*neutral_masses(spectrum, params)) 891 | # idx = find_nearest(masses, cmass.fast_mass(str(result['candidates'][0][1]), aa_mass=aa_mass)) 892 | 893 | 894 | nterm_mass = settings.getfloat('modifications', 'protein nterm cleavage') 895 | cterm_mass = settings.getfloat('modifications', 'protein cterm cleavage') 896 | 897 | idx = find_nearest(masses, custom_mass(str(result['candidates'][0][1]), 
aa_mass=aa_mass, nterm_mass=nterm_mass, cterm_mass=cterm_mass))
    return (masses[idx], states[idx], RT, comp_voltage)


def reshape_theor_spectrum(peaks):
    # Reshape each theoretical ion m/z list to a column vector (n, 1) so
    # it can be broadcast against rows of experimental m/z values.
    for k in peaks.keys():
        marr = np.array(peaks[k])
        n = marr.size
        peaks[k] = marr.reshape((n, 1))
    return peaks


# Per-series neutral-mass shift subtracted when deriving the largest
# fragment of each ion series from the peptide neutral mass (see
# calc_ions_from_neutral_mass below).
# NOTE(review): values presumably encode the standard a/b/c and x/y/z
# series offsets -- confirm against the fragment-mass definitions in use.
ion_shift_dict = {
    'a': 46.00547930326002,
    'b': 18.010564683699954,
    'c': 0.984015582689949,
    'x': -25.979264555419945,
    'y': 0.0,
    'z': 17.026549101010005,
}


def calc_ions_from_neutral_mass(peptide, nm, ion_type, charge, aa_mass, cterm_mass, nterm_mass):
    """Return the m/z of the largest fragment of `ion_type` at `charge`.

    Starting from the peptide neutral mass `nm`, subtract the residue that
    the largest fragment lacks (the last residue for N-terminal a/b/c
    series, the first for C-terminal series), the series shift, and the
    excess of the opposite terminal group; then protonate.
    """
    if ion_type in 'abc':
        nmi = nm - aa_mass[peptide[-1]] - ion_shift_dict[ion_type] - (cterm_mass - 17.002735)
    else:
        nmi = nm - aa_mass[peptide[0]] - ion_shift_dict[ion_type] - (nterm_mass - 1.007825)
    # 1.0072764667700085 is the proton mass.
    return (nmi + 1.0072764667700085 * charge) / charge


def check_n_term(ion_type):
    # True for N-terminal ion series; only the first character is checked,
    # so variants like 'b2' would also qualify.
    return (ion_type[0] == 'b' or ion_type[0] == 'a' or ion_type[0] == 'c')


def get_n_ions(peptide, maxmass, pl, charge, k_aa_mass):
    # Build the m/z ladder of an N-terminal series: start from the largest
    # fragment and successively drop residues from the C-terminal end.
    tmp = [maxmass, ]
    for i in range(1, pl):
        tmp.append(tmp[-1] - k_aa_mass[peptide[-i-1]]/charge)
    return tmp


def get_c_ions(peptide, maxmass, pl, charge, k_aa_mass):
    # Build the m/z ladder of a C-terminal series: start from the largest
    # fragment and successively drop residues from the N-terminal end.
    tmp = [maxmass, ]
    for i in range(pl-2, -1, -1):
        tmp.append(tmp[-1] - k_aa_mass[peptide[-(i+2)]]/charge)
    return tmp


def theor_spectrum(peptide, acc_frag, nterm_mass, cterm_mass, types=('b', 'y'), maxcharge=None, reshape=False, **kwargs):
    """Build the theoretical fragment spectrum of `peptide`.

    Returns (peaks, theoretical_set): `peaks` maps (ion_type, charge) to a
    list (or (n, 1) array if `reshape`) of fragment m/z values;
    `theoretical_set` maps ion_type to the m/z values integer-binned by
    `acc_frag` for fast set-based matching. The peptide neutral mass may
    be supplied precomputed via kwargs['nm']; `maxcharge` defaults to one
    above the predicted charge at pH 2.
    """
    peaks = {}
    theoretical_set = dict()
    if 'nm' in kwargs:
        nm = kwargs['nm']
    else:
        nm = custom_mass(peptide, aa_mass=kwargs['aa_mass'], nterm_mass = nterm_mass, cterm_mass = cterm_mass)
    pl = len(peptide) - 1
    if not maxcharge:
        maxcharge = 1 + int(ec.charge(peptide, pH=2))
    for charge in range(1, maxcharge + 1):
        for ion_type in
types: 957 | nterminal = check_n_term(ion_type) 958 | if nterminal: 959 | maxmass = calc_ions_from_neutral_mass(peptide, nm, ion_type=ion_type, charge=charge, 960 | aa_mass=kwargs['aa_mass'], cterm_mass=cterm_mass, nterm_mass=nterm_mass) 961 | marr = get_n_ions(peptide, maxmass, pl, charge, kwargs['aa_mass']) 962 | else: 963 | maxmass = calc_ions_from_neutral_mass(peptide, nm, ion_type=ion_type, charge=charge, 964 | aa_mass=kwargs['aa_mass'], cterm_mass=cterm_mass, nterm_mass=nterm_mass) 965 | marr = get_c_ions(peptide, maxmass, pl, charge, kwargs['aa_mass']) 966 | 967 | tmp = [int(x / acc_frag) for x in marr] 968 | if ion_type in theoretical_set: 969 | theoretical_set[ion_type].extend(tmp) 970 | else: 971 | theoretical_set[ion_type] = tmp 972 | 973 | if reshape: 974 | marr = np.array(marr) 975 | n = marr.size 976 | marr = marr.reshape((n, 1)) 977 | peaks[ion_type, charge] = marr 978 | return peaks, theoretical_set 979 | 980 | 981 | def get_expmass(spectrum, kwargs): 982 | maxcharge = kwargs['maxcharge'] or None 983 | mincharge = kwargs['mincharge'] or None 984 | min_ucharge = kwargs['min_ucharge'] 985 | max_ucharge = kwargs['max_ucharge'] 986 | 987 | if 'params' in spectrum: 988 | exp_mass = spectrum['params']['pepmass'][0] 989 | charge = spectrum['params'].get('charge') 990 | else: 991 | ion = spectrum['precursorList']['precursor'][ 992 | 0]['selectedIonList']['selectedIon'][0] 993 | charge = ion.get('charge state') 994 | if charge is not None: charge = [int(charge)] 995 | exp_mass = ion['selected ion m/z'] 996 | 997 | if isinstance(charge, str): 998 | states = [s for s in aux._parse_charge(charge, True) 999 | if (mincharge is None or s >= mincharge) and (maxcharge is None or s <= maxcharge)] 1000 | elif charge is None: 1001 | states = list(range(min_ucharge, 1 + max_ucharge)) 1002 | else: 1003 | states = [c for c in charge if 1004 | (mincharge is None or c >= mincharge) and (maxcharge is None or c <= maxcharge)] 1005 | states.sort() 1006 | return exp_mass, 
states 1007 | 1008 | 1009 | def neutral_masses(spectrum, params): 1010 | exp_mass, states = get_expmass(spectrum, params) 1011 | return zip((c * (exp_mass - mass.nist_mass['H+'][0][0]) 1012 | for c in states), states) 1013 | 1014 | 1015 | @aux.memoize(10) 1016 | def import_(name): 1017 | """Import a function by name: module.function or 1018 | module.submodule.function, etc. By default trying to find 1019 | function name in identipy.scoring module. 1020 | Return the function object.""" 1021 | 1022 | try: 1023 | mod, f = name.rsplit('.', 1) 1024 | return getattr(__import__(mod, fromlist=[f]), f) 1025 | except Exception as e: 1026 | logger.error('%s', e) 1027 | return getattr(__import__('identipy.scoring', fromlist=[name]), name) 1028 | 1029 | 1030 | def get_aa_mass(settings): 1031 | if settings.has_option('misc', 'aa_mass'): 1032 | return settings.get('misc', 'aa_mass') 1033 | aa_mass = mass.std_aa_mass.copy() 1034 | aa_mass['-'] = 0.0 1035 | for k, v in settings.items('modifications'): 1036 | if k not in {'fixed', 'variable', 'variable_original', 'protein variable', 'protein_original'}: 1037 | aa_mass[k] = float(v) 1038 | fmods = settings.get('modifications', 'fixed') 1039 | if fmods: 1040 | for mod in re.split(r'[,;]\s*', fmods): 1041 | if '-' not in mod: 1042 | m, aa = parser._split_label(mod) 1043 | aa_mass[aa] += settings.getfloat('modifications', m) 1044 | vmods = settings.get('modifications', 'variable') 1045 | if vmods: 1046 | leg = settings.get('misc', 'legend') 1047 | for p in string.punctuation: 1048 | if p in leg: 1049 | mod, aa, term = leg[p] 1050 | if term == ']' and aa == '-': 1051 | aa_mass[p] = aa_mass[mod] + aa_mass[aa] 1052 | aa_mass[aa+mod] = aa_mass[mod] + aa_mass[aa] 1053 | else: 1054 | aa_mass[p] = aa_mass[mod] + aa_mass[aa] 1055 | aa_mass[mod+aa] = aa_mass[mod] + aa_mass[aa] 1056 | pmods = settings.get('modifications', 'protein variable') 1057 | if pmods: 1058 | leg = settings.get('misc', 'plegend') 1059 | for p in string.punctuation: 1060 | 
def multimap(n, func, it, global_data, best_res_in=False, best_res_raw_in=False, best_peptides=False, **kw):
    """Run *func* over every candidate in *it*, keeping the best hit per spectrum.

    With n == 1 everything runs in the current process. Otherwise candidates
    are sorted by precursor mass, partitioned by the mass range covered by each
    entry of *global_data*, and dispatched to n worker processes that report
    their improvements back through a Queue.

    Scores are stored negated so that "better" always compares with <=.
    Returns (best_res_raw, best_res): per-spectrum best-hit details and the
    negated best score per spectrum.
    """
    global best_res

    rel = kw['rel']
    nterm_mass = kw.get('nterm_mass')
    cterm_mass = kw.get('cterm_mass')
    acc_l = kw['acc_l']
    acc_r = kw['acc_r']

    shifts_and_pime = kw['sapime']

    if best_res_in:
        best_res = deepcopy(best_res_in)
        best_res_raw = deepcopy(best_res_raw_in)
    else:
        best_res = {}
        best_res_raw = {}
    # BUGFIX: best_res_pep was previously initialized only in the else-branch
    # above, so calling multimap with best_res_in and n > 1 raised NameError
    # at the logger.debug(len(best_res_pep)) calls below. Initialize it always.
    best_res_pep = {}

    if n == 1:
        cnt1 = 0
        for s in it:
            cnt1 += 1
            if cnt1 % 10000 == 0:
                logger.debug(cnt1)
            result = func(s, best_res, global_data[0], **kw)
            if result:
                for x in result:
                    peptide, m, snp_label, res = x

                    for score, spec_t, c, info in res:
                        if -score <= best_res.get(spec_t, 0):
                            best_res_raw[spec_t] = [peptide, m, snp_label, score, spec_t, c, info]
                            best_res[spec_t] = -score
        return best_res_raw, best_res

    else:

        def worker(qout, start, end, global_data_local):
            # Scan the [start, end) slice of the shared, mass-sorted candidate
            # list and send per-spectrum improvements back, terminated by None.
            new_best_res = {}
            new_best_res_raw = {}

            while start < end:
                item = qin[start]
                result = func(item, best_res, global_data_local, **kw)

                if result:
                    for x in result:
                        peptide, m, snp_label, res = x

                        for score, spec_t, c, info in res:
                            if -score <= new_best_res.get(spec_t, best_res.get(spec_t, 0)):
                                new_best_res[spec_t] = -score
                                best_res[spec_t] = -score
                                new_best_res_raw[spec_t] = [peptide, m, snp_label, score, spec_t, c, info]
                start += 1
            qout.put(new_best_res_raw)
            qout.put(None)

        qsize = kw.pop('qsize')
        qout = Queue(qsize)
        count = 0

        global qin

        while True:
            # Process the candidate stream in batches to bound memory.
            qint = list(islice(it, 5000000))
            if not len(qint):
                break

            qin = []
            for seqm, aachange_pos, snp_label, m in qint:
                qin.append((seqm, aachange_pos, snp_label, m))
            qin = sorted(qin, key=lambda x: x[3])
            qin_masses = np.array([z[3] for z in qin])

            procs = []
            for proc_num in range(n):
                # Each worker only sees candidates whose mass can match one of
                # its spectra, given the accuracy window and allowed shifts.
                min_mass = min(global_data[proc_num]['nmasses'])
                max_mass = max(global_data[proc_num]['nmasses'])
                if rel:
                    dm_l = acc_l * max_mass / 1.0e6
                    dm_r = acc_r * max_mass / 1.0e6
                else:
                    dm_l = acc_l
                    dm_r = acc_r
                dm_l -= min(shifts_and_pime)
                dm_r += max(shifts_and_pime)
                start = qin_masses.searchsorted(min_mass + dm_l)
                end = qin_masses.searchsorted(max_mass + dm_r, side='right')

                p = Process(target=worker, args=(qout, start, end, global_data[proc_num]))
                p.start()
                procs.append(p)

            count = len(qin)

            # Drain one None-terminated stream per worker, merging improvements.
            for _ in range(n):
                logger.debug('%s %s', _, len(best_res_pep))
                for item in iter(qout.get, None):
                    for k, v in item.items():
                        if -v[3] <= best_res.get(k, 0):
                            best_res_raw[k] = v
                            best_res[k] = -v[3]
                            best_res_pep[k] = v[0]
                logger.debug('%s %s', _, len(best_res_pep))

            logger.debug('HERE1')

            for p in procs:
                p.join()

            logger.debug('HERE2')

        logger.info(len(best_res_pep))
        return best_res_raw, best_res


def allow_all(*args):
    """Predicate that accepts any spectrum (default filter)."""
    return True


def get_RT(spectrum):
    """Return scan retention time in seconds"""
    # MGF: try RTINSECONDS, then two known TITLE conventions.
    # NOTE: the previous bare 'except:' clauses also swallowed
    # SystemExit/KeyboardInterrupt; narrowed to Exception.
    if 'params' in spectrum:
        try:
            return float(spectrum['params']['rtinseconds'])
        except Exception:
            try:
                return float(spectrum['params']['title'].split(',')[-1].strip().split()[0])
            except Exception:
                try:
                    # '[E]lution from: X to Y period: ...' -> mean of X, Y in minutes.
                    return 60 * np.average([float(x) for x in spectrum['params']['title'].split('lution from: ')[-1].split(' period:')[0].split(' to ')])
                except Exception:
                    return 0
    # mzML: 'scan start time' may carry a unit attribute (minutes vs seconds).
    try:
        rt = spectrum['scanList']['scan'][0]['scan start time']
        try:
            if rt.unit_info == 'second':
                return float(rt)
            else:
                return float(rt * 60)
        except AttributeError:
            return float(rt)
    except KeyError:
        return 0
def get_comp_voltage(spectrum):
    """Return the FAIMS compensation voltage of a scan (0 when unavailable)."""
    if 'params' in spectrum:  # MGF
        try:
            return float(spectrum['params']['FAIMS compensation voltage'])
        except:
            return 0
    try:  # mzML
        return spectrum['FAIMS compensation voltage']
    except:
        return 0


def get_title(spectrum):
    """Return the spectrum identifier: MGF TITLE or mzML id."""
    return spectrum['params']['title'] if 'params' in spectrum else spectrum['id']


def get_precursor_mz(spectrum):
    """Return the precursor m/z from an MGF ('pepmass') or mzML spectrum."""
    try:
        return spectrum['params']['pepmass'][0]
    except:
        ion = spectrum['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]
        return ion['selected ion m/z']


def is_db_target_only(settings):
    """Return True when the FASTA database is not balanced between target
    and decoy entries (i.e. decoys must be generated on the fly)."""
    db = settings.get('input', 'database')
    isdecoy = is_decoy_function(settings)
    targets = 0
    decoys = 0
    for description, _ in fasta.read(db):
        if isdecoy(description):
            decoys += 1
        else:
            targets += 1
    return bool(targets - decoys)


def get_shifts_and_pime(settings):
    """Return all allowed precursor mass shifts, expanded with every
    precursor-isotope mass-error offset (multiples of the C13-C12 delta)."""
    pime = settings.getint('search', 'precursor isotope mass error')
    shifts = [float(x) for x in settings.get('search', 'shifts').split(',')]
    dM = mass.nist_mass['C'][13][0] - mass.nist_mass['C'][12][0]
    expanded = list(shifts)
    for k in range(1, pime + 1):
        expanded.extend(s + k * dM for s in shifts)
    return expanded
def build_pept_prot(settings, results):
    """Map identified peptide sequences back to database proteins.

    Re-digests the whole FASTA database and, for every peptide that occurs in
    *results*, records which proteins contain it, its flanking residues, and
    its number of tryptic termini.

    Returns (pept_prot, prots, pept_neighbors, pept_ntts):
      pept_prot      -- {sequence: [protein dbinfo, ...]}
      prots          -- {dbinfo: full FASTA description}
      pept_neighbors -- {sequence: {dbinfo: (prev_aa, next_aa)}}
      pept_ntts      -- {sequence: {dbinfo: number of tryptic termini}}
    """
    mc = settings.getint('search', 'number of missed cleavages')
    minlen = settings.getint('search', 'peptide minimum length')
    maxlen = settings.getint('search', 'peptide maximum length')
    isdecoy = is_decoy_function(settings)
    clip_M = settings.getboolean('search', 'clip N-terminal methionine')

    snp = settings.getint('search', 'snp')
    pept_prot = {}
    prots = {}
    peptides = set()
    pept_neighbors = {}
    pept_ntts = {}
    enzyme = settings.get('search', 'enzyme')
    semitryptic = settings.getint('search', 'semitryptic')
    # Collect the bare (modification-stripped) sequences seen in the results.
    # NOTE(review): 'range(1 or len(...))' always evaluates to range(1), so only
    # the top candidate of each spectrum is collected — looks intentional
    # (top-hit reporting) but worth confirming.
    for x in results:
        peptides.update(re.sub(r'[^A-Z]', '', normalize_mods(x['candidates'][i][1], settings)) for i in range(
            1 or len(x['candidates'])))
    seen_target.clear()
    seen_decoy.clear()
    enzyme_rule = get_enzyme(enzyme)
    for desc, prot in prot_gen(settings):
        dbinfo = desc.split(' ')[0]
        prots[dbinfo] = desc
        if semitryptic:
            # All enzymatic cleavage positions, plus both protein termini
            # (and position 1 to allow for clipped N-terminal methionine).
            cl_positions = set(z for z in it.chain([x.end() for x in re.finditer(enzyme_rule, prot)],
                [0, 1, len(prot)]))
        for pep, startposition in prot_peptides(prot, enzyme_rule, mc, minlen, maxlen, isdecoy(desc),
                dont_use_seen_peptides=True, snp=snp, desc=desc, position=True, semitryptic=semitryptic, clip_M=clip_M):
            if snp:
                # SNP-encoded peptides look like '<prefix>snp<X>to<Y>at<pos>snp<suffix>';
                # reconstruct the variant sequence before lookup.
                if 'snp' not in pep:
                    seqm = pep
                else:
                    tmp = pep.split('snp')
                    seqm = tmp[0] + tmp[1].split('at')[0].split('to')[-1] + tmp[2]
            else:
                seqm = pep

            if seqm in peptides:
                pept_ntts.setdefault(seqm, {})
                pept_neighbors.setdefault(seqm, {})
                # Flanking residues, '-' at protein termini.
                pept_neighbors[seqm][dbinfo] = (prot[startposition - 1] if startposition != 0 else '-',
                    prot[startposition + len(seqm)] if startposition + len(seqm) < len(prot) else '-')

                if not semitryptic:
                    pept_prot.setdefault(seqm, []).append(dbinfo)
                    pept_ntts[seqm][dbinfo] = 2
                else:
                    # Count how many of the two peptide termini fall on cleavage sites.
                    ntt = (startposition in cl_positions) + ((startposition + len(seqm)) in cl_positions)
                    pept_ntts[seqm][dbinfo] = ntt
                    pept_prot.setdefault(seqm, []).append(dbinfo)

    return pept_prot, prots, pept_neighbors, pept_ntts
def get_outpath(inputfile, settings, suffix):
    """Return the output file path: configured output dir + input basename + suffix."""
    outpath = settings.get('output', 'path')
    filename = os.path.join(outpath, os.path.splitext(os.path.basename(inputfile))[0] + os.path.extsep + suffix)
    return filename


def write_pepxml(inputfile, settings, results):
    """Write accumulated search results to a pepXML file.

    Builds the full pepXML tree (run summary, enzyme, search summary with
    modifications, then one spectrum_query/search_hit per identified spectrum)
    and writes it next to the other outputs as <basename>.pep.xml.
    """
    outpath = settings.get('output', 'path')
    logger.debug('Output path: %s', outpath)

    set_mod_dict(settings)

    enzyme = settings.get('search', 'enzyme')
    search_engine = 'IdentiPy'
    database = settings.get('input', 'database')
    missed_cleavages = settings.getint('search', 'number of missed cleavages')
    fmods = settings.get('modifications', 'fixed')
    snp = settings.getint('search', 'snp')
    nterm_mass = settings.getfloat('modifications', 'protein nterm cleavage')
    cterm_mass = settings.getfloat('modifications', 'protein cterm cleavage')
    tags = get_tags(settings.get('output', 'tags'))

    # Fixed terminal modifications get reported as mod_nterm/cterm_mass.
    nterm_fixed = 0
    cterm_fixed = 0

    for mod in re.split(r'[,;]\s*', fmods):
        if mod.startswith('-'):
            cterm_fixed = settings.getfloat('modifications', 'protein cterm cleavage')
        elif mod.endswith('-'):
            nterm_fixed = settings.getfloat('modifications', 'protein nterm cleavage')

    filename = get_outpath(inputfile, settings, 'pep.xml')
    with open(filename, 'wb') as output:
        logger.info('Writing %s ...', filename)
        # XML declaration / stylesheet header written before the serialized tree.
        line1 = b'\n\
\n'
        output.write(line1)

        base_name, ftype = path.splitext(inputfile)
        ftype = ftype.lower()

        root = etree.Element('msms_pipeline_analysis')
        root.set("date", strftime("%Y:%m:%d:%H:%M:%S"))
        root.set("summary_xml", '')
        root.set("xmlns", 'http://regis-web.systemsbiology.net/pepXML')
        # TODO
        #root.set("xmlns:xsi", 'http://www.w3.org/2001/XMLSchema-instance')
        #root.set("xsi:schemaLocation", 'http://sashimi.sourceforge.net/schema_revision/pepXML/pepXML_v117.xsd')

        child1 = etree.Element('msms_run_summary')
        child1.set("base_name", base_name)
        child1.set("search_engine", search_engine)
        child1.set("raw_data_type", "raw")  # ?

        if ftype == '.mgf':
            child1.set("raw_data", ".mgf")
        elif ftype == '.mzml':
            child1.set("raw_data", ".mzML")
        else:
            child1.set("raw_data", ".?")
        root.append(child1)

        child2 = etree.Element('sample_enzyme')
        child2.set('name', enzyme)
        child1.append(child2)

        child3 = etree.Element('specificity')
        child3.set("cut", "KR")
        child3.set("no_cut", "P")
        child3.set("sense", "C")

        child2.append(child3)

        child4 = etree.Element('search_summary')
        child4.set('base_name', base_name)
        child4.set('search_engine', search_engine)
        child4.set("search_engine_version", get_version())
        child4.set('precursor_mass_type', 'monoisotopic')
        child4.set('fragment_mass_type', 'monoisotopic')
        child4.set('search_id', '1')

        # Declare fixed, variable and protein-terminal modifications.
        for child_mod in get_child_for_mods(settings.get('modifications', 'fixed'), settings, fixed=True):
            child4.append(child_mod)
        for child_mod in get_child_for_mods(settings.get('modifications', 'variable_original'), settings, fixed=False):
            child4.append(child_mod)
        for child_mod in get_child_for_mods(settings.get('modifications', 'protein_original'), settings, fixed=False, protein=True):
            child4.append(child_mod)

        child1.append(child4)

        child5 = etree.Element('search_database')
        child5.set('local_path', database)
        child5.set('type', 'AA')

        child4.append(copy(child5))

        child5 = etree.Element('enzymatic_search_constraint')
        child5.set('enzyme', enzyme)
        child5.set('max_num_internal_cleavages', str(missed_cleavages))
        child5.set('min_number_termini', '2')

        child4.append(copy(child5))

        results = [x for x in results if x['candidates'].size]
        # results = list(get_output(results, settings))
        logger.info('Accumulated results: %s', len(results))
        pept_prot, prots, pept_neighbors, pept_ntts = build_pept_prot(settings, results)
        if settings.has_option('misc', 'aa_mass'):
            aa_mass = settings.get('misc', 'aa_mass')
        else:
            aa_mass = get_aa_mass(settings)
        # All 'mod+aa' / 'aa+mod' spellings of variable modifications.
        vmods = set()
        variablemods = settings.get('modifications', 'variable')
        if variablemods:
            for k, v in variablemods.items():
                for aa in v:
                    vmods.add(k + aa)
                    vmods.add(aa + k)

        leg = {}
        if settings.has_option('misc', 'legend'):
            leg = settings.get('misc', 'legend')
        if settings.has_option('misc', 'plegend'):
            leg.update(settings.get('misc', 'plegend'))

        ntermcleavage = settings.getfloat('modifications', 'protein nterm cleavage')
        ctermcleavage = settings.getfloat('modifications', 'protein cterm cleavage')

        for idx, result in enumerate(results):
            if result['candidates'].size:
                tmp = etree.Element('spectrum_query')
                spectrum = result['spectrum']
                tmp.set('spectrum', get_title(spectrum))
                tmp.set('spectrumNativeID', get_title(spectrum))
                tmp.set('start_scan', str(idx))  # ???
                tmp.set('end_scan', str(idx))  # ???
                tmp.set('index', str(idx))  # ???

                neutral_mass, charge_state, RT, comp_voltage = get_info(spectrum, result, settings, aa_mass)
                tmp.set('precursor_neutral_mass', str(neutral_mass))
                tmp.set('assumed_charge', str(int(charge_state)))
                if RT:
                    tmp.set('retention_time_sec', str(RT))
                if comp_voltage:
                    tmp.set('compensation_voltage', str(comp_voltage))

                tmp2 = etree.Element('search_result')
                result['candidates'] = result['candidates'][:len(result['e-values'])]

                flag = 1
                for i, candidate in enumerate(result['candidates']):
                    match = candidate[4]['match']
                    if match is None:
                        break
                    tmp3 = etree.Element('search_hit')
                    tmp3.set('hit_rank', str(i + 1))
                    mod_sequence = normalize_mods(str(candidate[1]), settings)
                    sequence = re.sub(r'[^A-Z]', '', mod_sequence)
                    if sequence not in pept_prot:
                        # Should not happen: every reported peptide must have been
                        # mapped to a protein by build_pept_prot.
                        flag = 0
                        logger.error('Unaccounted sequence! %s (%s)', sequence, mod_sequence)
                        break
                    else:
                        tmp3.set('peptide', sequence)

                        proteins = pept_prot[re.sub(r'[^A-Z]', '', sequence)]

                        tmp3.set('protein', prots[proteins[0]].split(' ', 1)[0] + (('_' + candidate[7]) if snp else ''))
                        try:
                            protein_descr = prots[proteins[0]].split(' ', 1)[1]
                        except:
                            protein_descr = ''

                        neighbors = pept_neighbors.get(sequence, {}).get(proteins[0], ('-', '-'))

                        tmp3.set('peptide_prev_aa', neighbors[0])
                        tmp3.set('peptide_next_aa', neighbors[1])
                        tmp3.set('protein_descr', protein_descr)

                        num_tot_proteins = len(proteins)
                        tmp3.set('num_tot_proteins', str(num_tot_proteins))
                        tmp3.set('num_matched_ions', str(sum(v.sum() for v in match.values())))
                        tmp3.set('tot_num_ions', str((len(sequence) - 1) * 2))
                        neutral_mass_theor = custom_mass(str(candidate[1]), aa_mass=aa_mass, nterm_mass=nterm_mass, cterm_mass=cterm_mass)
                        # neutral_mass_theor = cmass.fast_mass(sequence, aa_mass=aa_mass)
                        tmp3.set('calc_neutral_pep_mass', str(neutral_mass_theor))
                        tmp3.set('massdiff', str(candidate[4]['mzdiff']['Da']))
                        tmp3.set('num_tol_term', str(pept_ntts.get(sequence, {}).get(proteins[0], '?')))
                        tmp3.set('num_missed_cleavages', str(parser.num_sites(sequence, get_enzyme(enzyme))))
                        tmp3.set('is_rejected', '0')  # ???

                        if num_tot_proteins > 1 and (not snp or 'wild' not in prots[proteins[0]].split(' ', 1)[0]):
                            for prot in proteins[1:]:
                                tmp4 = etree.Element('alternative_protein')
                                tmp4.set('protein', prots[prot].split(' ', 1)[0] + (('_' + candidate[7]) if snp else ''))
                                try:
                                    protein_descr = prots[prot].split(' ', 1)[1]
                                except:
                                    protein_descr = ''
                                tmp4.set('protein_descr', protein_descr)
                                neighbors = pept_neighbors.get(sequence, {}).get(prot, ('-', '-'))
                                tmp4.set('peptide_prev_aa', neighbors[0])
                                tmp4.set('peptide_next_aa', neighbors[1])
                                tmp4.set('num_tol_term', str(pept_ntts.get(sequence, {}).get(prot, '?')))
                                tmp3.append(copy(tmp4))

                        labels = parser.std_labels + [la[:-1] if la[-1] == '[' else '-' + la[:-2] if la[-1] == ']' else la for la in leg if len(la) > 1]
                        # logger.debug('Known labels: %s', labels)
                        try:
                            aalist = parser.parse(mod_sequence, labels=labels)
                        except Exception as e:
                            # Fallback: parse the reversed sequence, then restore order.
                            logger.debug('Problematic sequence: %s\n%s', mod_sequence, e)
                            aalist = [a[::-1] for a in parser.parse(mod_sequence[::-1], labels=labels)][::-1]
                        tmp4 = etree.Element('modification_info')
                        ntermmod = 0

                        if nterm_fixed:
                            tmp4.set('mod_nterm_mass', str(nterm_fixed))
                        if cterm_fixed:
                            tmp4.set('mod_cterm_mass', str(cterm_fixed))

                        # NOTE(review): this inner loop shadows the outer 'idx'
                        # (results index); harmless here since the outer idx is
                        # not used afterwards, but fragile.
                        for idx, aminoacid in enumerate(aalist):
                            if aminoacid in fmods or aminoacid in vmods:
                                if aminoacid.endswith('-') and idx == 0:
                                    ntermmod = 1
                                    tmp4.set('mod_nterm_mass', str(str(aa_mass.get(aminoacid) + ntermcleavage)))
                                elif aminoacid.startswith('-') and idx == len(aalist) - 1:
                                    tmp4.set('mod_cterm_mass', str(aa_mass.get(aminoacid) + ctermcleavage))
                                else:
                                    tmp5 = etree.Element('mod_aminoacid_mass')
                                    tmp5.set('position', str(idx + 1 - ntermmod))
                                    tmp5.set('mass', str(aa_mass.get(aminoacid)))
                                    tmp4.append(copy(tmp5))
                        tmp3.append(copy(tmp4))

                        if 'RNHS' in candidate[4]:
                            tmp4 = etree.Element('search_score')
                            tmp4.set('name', 'hyperscore')
                            tmp4.set('value', str(candidate[4]['RNHS']))
                            tmp3.append(copy(tmp4))

                            tmp4 = etree.Element('search_score')
                            tmp4.set('name', 'expect')
                            tmp4.set('value', str(1./candidate[4]['RNHS']))
                            tmp3.append(copy(tmp4))

                        else:
                            tmp4 = etree.Element('search_score')
                            tmp4.set('name', 'hyperscore')
                            tmp4.set('value', str(candidate[0]))
                            tmp3.append(copy(tmp4))

                            tmp4 = etree.Element('search_score')
                            tmp4.set('name', 'expect')
                            tmp4.set('value', str(result['e-values'][i]))
                            tmp3.append(copy(tmp4))

                        tmp4 = etree.Element('search_score')
                        tmp4.set('name', 'sumI')
                        tmp4.set('value', str(candidate[5]))
                        tmp3.append(copy(tmp4))

                        tmp4 = etree.Element('search_score')
                        tmp4.set('name', 'fragmentMT')
                        tmp4.set('value', str(candidate[6]))
                        tmp3.append(copy(tmp4))

                        tmp4 = etree.Element('search_score')
                        tmp4.set('name', 'nextscore_std')
                        tmp4.set('value', str(candidate[8]))
                        tmp3.append(copy(tmp4))

                        # Optional per-spectrum metrics propagated from the MGF header.
                        if 'params' in spectrum:
                            if 'isowidthdiff' in spectrum['params']:
                                tmp4 = etree.Element('search_score')
                                tmp4.set('name', 'ISOWIDTHDIFF')
                                tmp4.set('value', str(spectrum['params'].get('isowidthdiff', 0)))
                                tmp3.append(copy(tmp4))

                            if 'rtwidth' in spectrum['params']:
                                tmp4 = etree.Element('search_score')
                                tmp4.set('name', 'RTwidth')
                                tmp4.set('value', str(spectrum['params'].get('rtwidth', 0)))
                                tmp3.append(copy(tmp4))

                            if 'ms1intensity' in spectrum['params']:
                                tmp4 = etree.Element('search_score')
                                tmp4.set('name', 'MS1Intensity')
                                tmp4.set('value', str(spectrum['params'].get('ms1intensity', 0)))
                                tmp3.append(copy(tmp4))

                            if 'pif' in spectrum['params']:
                                tmp4 = etree.Element('search_score')
                                tmp4.set('name', 'PIF')
                                tmp4.set('value', str(spectrum['params'].get('pif', -3)))
                                tmp3.append(copy(tmp4))

                            if 'sulfur' in spectrum['params']:
                                tmp4 = etree.Element('search_score')
                                tmp4.set('name', 'sulfur')
                                tmp4.set('value', str(spectrum['params'].get('sulfur', -1)))
                                tmp3.append(copy(tmp4))

                            if 'ionmobility' in spectrum['params']:
                                tmp4 = etree.Element('search_score')
                                tmp4.set('name', 'ionmobility')
                                tmp4.set('value', str(spectrum['params'].get('ionmobility', 0)))
                                tmp3.append(copy(tmp4))

                        if tags:
                            for tag_label in tags.keys():
                                tmp4 = etree.Element('search_score')
                                tmp4.set('name', 'tag_' + tag_label)
                                tmp4.set('value', str(spectrum.get(tag_label, 0)))
                                tmp3.append(copy(tmp4))

                        # Per-ion-series matched-ion counts.
                        for k, v in match.items():
                            tmp4 = etree.Element('search_score')
                            tmp4.set('name', 'matched_{}{}_ions'.format(*k))
                            tmp4.set('value', str(v.sum()))
                            tmp3.append(copy(tmp4))

                    tmp2.append(copy(tmp3))
                if flag:
                    tmp.append(copy(tmp2))
                    child1.append(copy(tmp))

        s = etree.tostring(root, pretty_print=True)
        output.write(s)


def write_csv(inputfile, settings, results):
    """Write results as CSV/TSV via dataframe(); skips writing when empty."""
    df = dataframe(inputfile, settings, results)
    if df is None:
        logger.info('No results to write. File not created.')
        return

    sep = settings.get('output', 'separator')
    of = settings.get('output', 'format').lower()
    if not sep:
        # Default separator follows the output format.
        sep = ',' if of == 'csv' else '\t'
    fname = get_outpath(inputfile, settings, of)
    logger.info('Writing %s ...', fname)
    df.to_csv(fname, index=False, sep=sep)
def dataframe(inputfile, settings, results):
    """Convert accumulated results into a pandas DataFrame (one row per
    identified spectrum), or None if there are no results.

    NOTE(review): 'row' keeps being appended to inside the candidate loop but
    is only saved once per spectrum, and the column list matches exactly one
    candidate's worth of fields — presumably only the top candidate ever
    reaches this point; confirm against the caller.
    """
    # results = list(get_output(results, settings))
    results = list(results)
    if not results:
        return None

    logger.info('Accumulated results: %s', len(results))
    # ensure_decoy(settings)
    set_mod_dict(settings)
    fmods = settings.get('modifications', 'fixed')
    pept_prot, prots, pept_neighbors, pept_ntts = build_pept_prot(settings, results)
    if settings.has_option('misc', 'aa_mass'):
        aa_mass = settings.get('misc', 'aa_mass')
    else:
        aa_mass = get_aa_mass(settings)

    nterm_mass = settings.getfloat('modifications', 'protein nterm cleavage')
    cterm_mass = settings.getfloat('modifications', 'protein cterm cleavage')

    # All 'mod+aa' / 'aa+mod' spellings of variable modifications.
    vmods = set()
    variablemods = settings.get('modifications', 'variable')
    if variablemods:
        for k, v in variablemods.items():
            for aa in v:
                vmods.add(k + aa)
                vmods.add(aa + k)

    leg = {}
    if settings.has_option('misc', 'legend'):
        leg = settings.get('misc', 'legend')

    enzyme = settings.get('search', 'enzyme')
    snp = settings.getint('search', 'snp')
    columns = ['Title', 'Assumed charge', 'RT', 'compensation_voltage', 'Rank', 'Matched ions', 'Total ions', 'Calculated mass',
               'Mass difference', 'Missed cleavages', 'Proteins', '# proteins', 'Sequence', 'Modified sequence',
               'Hyperscore', 'Expect', 'sumI', 'fragmentMT']
    rows = []
    for result in results:
        if result['candidates'].size:
            row = []
            spectrum = result['spectrum']
            row.append(get_title(spectrum))
            neutral_mass, charge_state, RT, comp_voltage = get_info(spectrum, result, settings, aa_mass)
            row.append(charge_state)
            row.append(RT)
            row.append(comp_voltage)
            result['candidates'] = result['candidates'][:len(result['e-values'])]

            flag = 1
            for i, candidate in enumerate(result['candidates'], 1):
                match = candidate[4]['match']
                if match is None: break
                row.append(i)
                mod_sequence = normalize_mods(candidate[1], settings)

                sequence = re.sub(r'[^A-Z]', '', mod_sequence)
                if sequence not in pept_prot:
                    flag = 0
                    logger.error('Unaccounted sequence! %s (%s)', sequence, mod_sequence)
                    break
                else:
                    allproteins = pept_prot[sequence]

                row.append(sum(v.sum() for v in match.values()))
                row.append((len(sequence) - 1) * 2)
                neutral_mass_theor = custom_mass(candidate[1], aa_mass=aa_mass, nterm_mass = nterm_mass, cterm_mass = cterm_mass)
                row.append(neutral_mass_theor)
                row.append(candidate[4]['mzdiff']['Da'])
                row.append(parser.num_sites(sequence, get_enzyme(enzyme)))

                # For SNP searches, keep only wild-type alternatives when any exist.
                proteins = [allproteins[0]]
                if len(allproteins) > 1:
                    if snp:
                        wilds = any('wild' in prots[p].split(' ', 1)[0] for p in allproteins)
                    for prot in allproteins[1:]:
                        d = prots[prot].split(' ', 1)[0]
                        if (not snp or not wilds or 'wild' in d):
                            proteins.append(prot)

                row.append(';'.join(proteins))
                row.append(len(proteins))

                row.append(sequence)
                # Re-insert fixed-modification labels into the displayed sequence.
                if fmods:
                    for mod in re.split(r'[,;]\s*', fmods):
                        if '-' not in mod:
                            m, aa = parser._split_label(mod)
                            mod_sequence = mod_sequence.replace(aa, m+aa)
                        elif mod[0] == '-':
                            mod_sequence = mod_sequence + mod
                        elif mod[-1] == '-':
                            mod_sequence = mod + mod_sequence
                row.append(mod_sequence)

                row.append(candidate[0])
                row.append(result['e-values'][i-1])
                row.append(candidate[5])
                row.append(candidate[6])

            rows.append(row)
    df = pd.DataFrame(rows)
    df.columns = columns
    return df
def write_pickle(inputfile, settings, results):
    """Serialize (inputfile, settings, results) to a .pickle output file."""
    collected = list(results)
    logger.info('Accumulated results: %s', len(collected))
    try:
        import cPickle as pickle  # Python 2 compatibility
    except ImportError:
        import pickle
    out_name = get_outpath(inputfile, settings, 'pickle')
    with open(out_name, 'wb') as fh:
        pickle.dump((inputfile, settings, collected), fh, -1)


def write_output(inputfile, settings, results):
    """Dispatch results to the writer selected by the output format setting,
    creating the output directory first when necessary."""
    writers = {'pepxml': write_pepxml, 'csv': write_csv, 'tsv': write_csv, 'pickle': write_pickle}
    fmt_key = re.sub(r'[^a-z]', '', settings.get('output', 'format').lower())
    writer = writers[fmt_key]

    if settings.has_option('output', 'path'):
        outd = settings.get('output', 'path')
        if not os.path.isdir(outd):
            logger.info('Creating %s ...', outd)
            os.makedirs(outd)
    else:
        # Default to the directory of the input file.
        settings.set('output', 'path', os.path.dirname(inputfile))

    return writer(inputfile, settings, results)
1845 | if a['ms level'] == 1: 1846 | cur_ms1 = a 1847 | elif a['ms level'] == 2: 1848 | logger.debug('PROCESSING RT: ' + str(a['scanList']['scan'][0]['scan start time']) ) 1849 | if a['ms level'] == 2 and 'm/z array' in a.keys() and len(a['m/z array']) > 0: 1850 | # if : 1851 | if not isolation_window_left: 1852 | isolation_window_left = float(a['precursorList']['precursor'][0]['isolationWindow']['isolation window lower offset']) 1853 | isolation_window_right = float(a['precursorList']['precursor'][0]['isolationWindow']['isolation window upper offset']) 1854 | pepmass = float(a['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['selected ion m/z']) 1855 | RT = float(a['scanList']['scan'][0]['scan start time']) 1856 | # logger.info(a['mean inverse reduced ion mobility array']) 1857 | try: 1858 | ion_mob = float(a['mean inverse reduced ion mobility array'][0]) 1859 | except: 1860 | # logger.info('missing ion mob') 1861 | ion_mob = 0.0 1862 | try: 1863 | ch = int(a['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['charge state']) 1864 | except: 1865 | ch = 0 1866 | 1867 | if calc_PIF: 1868 | if not cur_ms1: 1869 | calc_PIF = False 1870 | # logger.info('Missing MS1 spectra in mzML, turning off PIF calculation') 1871 | elif not isolation_window_left: 1872 | calc_PIF = False 1873 | # logger.info('Missing isolation window info in mzML, turning off PIF calculation') 1874 | else: 1875 | intensity_full_ms2 = 0 1876 | intensity_precursor = 0 1877 | 1878 | idx_l = cur_ms1['m/z array'].searchsorted(pepmass - isolation_window_left) 1879 | idx_r = cur_ms1['m/z array'].searchsorted(pepmass + isolation_window_right, side='right') 1880 | 1881 | if not ch: 1882 | tch = 2 1883 | else: 1884 | tch = ch 1885 | 1886 | abs_error = pepmass * mass_acc * 1e-6 1887 | for mz, intensity in zip(cur_ms1['m/z array'][idx_l:idx_r], cur_ms1['intensity array'][idx_l:idx_r]): 1888 | if any(abs(mz - (pepmass + (k * 1.007825) / tch)) <= abs_error for k in [-2, -1, 
0, 1, 2, 3, 4]): 1889 | intensity_precursor += intensity 1890 | intensity_full_ms2 += intensity 1891 | if intensity_full_ms2: 1892 | PIF = intensity_precursor / intensity_full_ms2 * 100 1893 | else: 1894 | PIF = -1 1895 | a['PIF'] = PIF 1896 | 1897 | 1898 | title = a['id'] 1899 | mzs.append(pepmass) 1900 | RTs.append(RT) 1901 | ionmobs.append(ion_mob) 1902 | chs.append(ch) 1903 | titles.append(title) 1904 | ms2_map[title] = a 1905 | 1906 | 1907 | mzs = np.array(mzs) 1908 | RTs = np.array(RTs) 1909 | ionmobs = np.array(ionmobs) 1910 | chs = np.array(chs) 1911 | titles = np.array(titles) 1912 | idx = np.argsort(mzs) 1913 | mzs = mzs[idx] 1914 | RTs = RTs[idx] 1915 | ionmobs = ionmobs[idx] 1916 | chs = chs[idx] 1917 | titles = titles[idx] 1918 | 1919 | if not df1 is None: 1920 | if 'ion_mobility' not in df1.columns: 1921 | df1['ion_mobility'] = 0 1922 | if 'sulfur' not in df1.columns: 1923 | df1['sulfur'] = 0 1924 | df1['MSMS'] = df1.apply(findMSMS, axis=1, args = (isolation_window_left, isolation_window_right, mzs, RTs, titles, ionmobs)) 1925 | df1['MSMS_accurate'] = df1.apply(findMSMS_accurate, axis=1, args = (mzs, RTs, titles, ionmobs, chs)) 1926 | # print(df1['MSMS_accurate']) 1927 | 1928 | outmgf_name = os.path.splitext(path_to_mzml)[0] + '_identipy' + os.extsep + 'mgf' 1929 | outmgf = open(outmgf_name, 'w') 1930 | 1931 | t_i = 1 1932 | f_i = 0 1933 | 1934 | added_MSMS = set() 1935 | 1936 | if demixing: 1937 | 1938 | for z in df1[['mz', 'rtApex', 'charge', 'intensityApex', 'MSMS', 'MSMS_accurate', 'rtStart', 'rtEnd', 'ion_mobility', 'sulfur']].values: 1939 | mz, RT, ch, Intensity, ttls, ttl_ac, rt_ll, rt_rr, ion_mob, sulfur = z[0], z[1], z[2], z[3], z[4], z[5], z[6], z[7], z[8], z[9] 1940 | if ttls: 1941 | f_i += 1 1942 | for ttl in ttls: 1943 | if ttl in ttl_ac: 1944 | added_MSMS.add(ttl) 1945 | mz_arr, I_arr = ms2_map[ttl]['m/z array'], ms2_map[ttl]['intensity array'] 1946 | PIF = ms2_map[ttl].get('PIF', -2) 1947 | pepmass = 
float(ms2_map[ttl]['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['selected ion m/z']) 1948 | t_i_orig = ms2_map[ttl]['index'] 1949 | outmgf.write('BEGIN IONS\n') 1950 | outmgf.write('TITLE=%s.%d.%d.%d.%d\n' % (basename_mzml, t_i_orig, t_i, t_i, ch)) 1951 | outmgf.write('RTINSECONDS=%f\n' % (RT * 60, )) 1952 | outmgf.write('PEPMASS=%f %f\n' % (mz, Intensity)) 1953 | outmgf.write('CHARGE=%d+\n' % (ch, )) 1954 | outmgf.write('ISOWIDTHDIFF=%f\n' % (mz - pepmass, )) 1955 | outmgf.write('RTwidth=%f\n' % (rt_rr - rt_ll, )) 1956 | outmgf.write('MS1Intensity=%f\n' % (Intensity, )) 1957 | outmgf.write('PIF=%f\n' % (PIF, )) 1958 | outmgf.write('IonMobility=%f\n' % (ion_mob, )) 1959 | outmgf.write('Sulfur=%f\n' % (sulfur, )) 1960 | for mz_val, I_val in zip(mz_arr, I_arr): 1961 | outmgf.write('%f %f\n' % (mz_val, I_val)) 1962 | outmgf.write('END IONS\n\n') 1963 | t_i += 1 1964 | 1965 | for k in ms2_map: 1966 | if k not in added_MSMS: 1967 | f_i += 1 1968 | a = ms2_map[k] 1969 | mz_arr, I_arr = a['m/z array'], a['intensity array'] 1970 | PIF = a.get('PIF', -2) 1971 | mz = float(a['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['selected ion m/z']) 1972 | RT = float(a['scanList']['scan'][0]['scan start time']) 1973 | t_i_orig = a['index'] 1974 | try: 1975 | ch = int(a['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['charge state']) 1976 | except: 1977 | ch = '' 1978 | outmgf.write('BEGIN IONS\n') 1979 | outmgf.write('TITLE=%s.%d.%d.%d.%s\n' % (basename_mzml, t_i_orig, t_i, t_i, str(ch))) 1980 | outmgf.write('RTINSECONDS=%f\n' % (RT * 60, )) 1981 | outmgf.write('PEPMASS=%f %f\n' % (mz, 0)) 1982 | if ch: 1983 | outmgf.write('CHARGE=%d+\n' % (ch, )) 1984 | outmgf.write('ISOWIDTHDIFF=%f\n' % (0.0, )) 1985 | outmgf.write('RTwidth=%f\n' % (0.0, )) 1986 | outmgf.write('MS1Intensity=%f\n' % (0.0, )) 1987 | outmgf.write('PIF=%f\n' % (PIF, )) 1988 | outmgf.write('IonMobility=%f\n' % (0.0, )) 1989 | 
outmgf.write('Sulfur=%f\n' % (-1.0, )) 1990 | for mz_val, I_val in zip(mz_arr, I_arr): 1991 | outmgf.write('%f %f\n' % (mz_val, I_val)) 1992 | outmgf.write('END IONS\n\n') 1993 | t_i += 1 1994 | 1995 | else: 1996 | 1997 | MS2_acc_map = {} 1998 | 1999 | if not df1 is None: 2000 | for z in df1[['mz', 'rtApex', 'charge', 'intensityApex', 'MSMS', 'MSMS_accurate', 'rtStart', 'rtEnd', 'ion_mobility', 'sulfur']].values: 2001 | ttl_ac = z[5] 2002 | for ttl in ttl_ac: 2003 | if ttl not in MS2_acc_map: 2004 | MS2_acc_map[ttl] = z 2005 | else: 2006 | if MS2_acc_map[ttl][3] < z[3]: 2007 | MS2_acc_map[ttl] = z 2008 | 2009 | # print(MS2_acc_map) 2010 | 2011 | for k in ms2_map: 2012 | a = ms2_map[k] 2013 | 2014 | if k in MS2_acc_map: 2015 | # print('HERE, ok') 2016 | z = MS2_acc_map[k] 2017 | mz, RT, ch, Intensity, ttls, ttl_ac, rt_ll, rt_rr, ion_mob, sulfur = z[0], z[1], z[2], z[3], z[4], z[5], z[6], z[7], z[8], z[9] 2018 | else: 2019 | mz = float(a['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['selected ion m/z']) 2020 | RT = float(a['scanList']['scan'][0]['scan start time']) 2021 | try: 2022 | ch = int(a['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['charge state']) 2023 | except: 2024 | ch = 0 2025 | Intensity, ttls, ttl_ac, rt_ll, rt_rr, ion_mob, sulfur = 0, 0, 0, 0, 0, 0, 0 2026 | 2027 | 2028 | mz_arr, I_arr = a['m/z array'], a['intensity array'] 2029 | PIF = a.get('PIF', -2) 2030 | # mz = float(a['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['selected ion m/z']) 2031 | # RT = float(a['scanList']['scan'][0]['scan start time']) 2032 | t_i_orig = a['index'] 2033 | outmgf.write('BEGIN IONS\n') 2034 | outmgf.write('TITLE=%s.%d.%d.%d.%s\n' % (basename_mzml, t_i_orig, t_i, t_i, str(ch))) 2035 | outmgf.write('RTINSECONDS=%f\n' % (RT * 60, )) 2036 | outmgf.write('PEPMASS=%f %f\n' % (mz, Intensity)) 2037 | if ch: 2038 | outmgf.write('CHARGE=%d+\n' % (ch, )) 2039 | outmgf.write('ISOWIDTHDIFF=%f\n' % 0) 2040 | 
outmgf.write('RTwidth=%f\n' % (rt_rr - rt_ll, )) 2041 | outmgf.write('MS1Intensity=%f\n' % (Intensity, )) 2042 | outmgf.write('PIF=%f\n' % (PIF, )) 2043 | outmgf.write('IonMobility=%f\n' % (ion_mob, )) 2044 | outmgf.write('Sulfur=%f\n' % (sulfur, )) 2045 | for mz_val, I_val in zip(mz_arr, I_arr): 2046 | outmgf.write('%f %f\n' % (mz_val, I_val)) 2047 | outmgf.write('END IONS\n\n') 2048 | t_i += 1 2049 | outmgf.close() 2050 | 2051 | return outmgf_name 2052 | 2053 | 2054 | def findMSMS(raw, isolation_window_left, isolation_window_right, mzs, RTs, titles, ionmobs): 2055 | out = [] 2056 | isotope_fix = raw['nIsotopes'] / raw['charge'] 2057 | mz = raw['mz'] 2058 | RT_l = raw['rtStart'] 2059 | RT_r = raw['rtEnd'] 2060 | ion_mob_p = raw['ion_mobility'] 2061 | # There is no error below: -right and +left! 2062 | id_l = mzs.searchsorted(mz - isolation_window_right) 2063 | id_r = mzs.searchsorted(mz + isolation_window_left + isotope_fix, side='right') 2064 | for idx, RT in enumerate(RTs[id_l:id_r]): 2065 | if RT_l <= RT <= RT_r: 2066 | if abs(ionmobs[id_l+idx] - ion_mob_p) <= 0.1: 2067 | out.append(titles[id_l+idx]) 2068 | if len(out): 2069 | return out 2070 | else: 2071 | return None 2072 | 2073 | 2074 | def findMSMS_accurate(raw, mzs, RTs, titles, ionmobs, chs): 2075 | out = set() 2076 | acc = 10 2077 | mz = raw['mz'] 2078 | ch = raw['charge'] 2079 | RT_l = raw['rtStart'] 2080 | RT_r = raw['rtEnd'] 2081 | ion_mob_p = raw['ion_mobility'] 2082 | acc_rel = mz * acc * 1e-6 2083 | id_l = mzs.searchsorted(mz - acc_rel) 2084 | id_r = mzs.searchsorted(mz + acc_rel, side='right') 2085 | for idx, RT in enumerate(RTs[id_l:id_r]): 2086 | if RT_l <= RT <= RT_r: 2087 | if abs(ionmobs[id_l+idx] - ion_mob_p) <= 0.1: 2088 | ch_msms = chs[id_l+idx] 2089 | if not ch_msms or ch_msms == ch: 2090 | out.add(titles[id_l+idx]) 2091 | # return True 2092 | # return False 2093 | return out 2094 | 2095 | 2096 | def generate_database(settings, outname=None): 2097 | add_decoy = 
settings.getboolean('input', 'add decoy') 2098 | prefix = settings.get('input', 'decoy prefix') 2099 | infix = settings.get('input', 'decoy infix') 2100 | if infix and add_decoy: 2101 | if not prefix: 2102 | prefix = infix 2103 | logger.warning('infix is specified with "add decoy" = True. Generated decoys will have PREFIX %s', prefix) 2104 | mode = settings.get('input', 'decoy method') 2105 | db = settings.get('input', 'database') 2106 | target_only = is_db_target_only(settings) 2107 | if add_decoy and target_only: 2108 | gdbname = outname or settings.get('output', 'generated database') 2109 | if gdbname: 2110 | ft = open(gdbname, 'w') 2111 | else: 2112 | ft = tempfile.NamedTemporaryFile(mode='w', delete=False) 2113 | fasta.write_decoy_db(db, ft, mode=mode, prefix=prefix) 2114 | ft.flush() 2115 | settings.set('input', 'database', ft.name) 2116 | settings.set('input', 'add decoy', 'no') 2117 | logger.debug('Generated database: %s (isfile = %s)', ft.name, os.path.isfile(ft.name)) 2118 | return ft.name 2119 | else: 2120 | logger.debug('Skipping database generation. add_decoy = %s, target_only = %s', add_decoy, target_only) 2121 | 2122 | 2123 | def get_version(): 2124 | return pkg_resources.get_distribution('identipy').version 2125 | --------------------------------------------------------------------------------