├── tests
    ├── data
    │   ├── library_targets.pqp
    │   ├── test_transition_list.pqp
    │   ├── Q99536.fasta
    │   ├── results.sage.tsv
    │   ├── matched_fragments.sage.tsv
    │   ├── config.json
    │   └── test_transition_list.tsv
    ├── test_generate_ionseries.py
    ├── _regtest_outputs
    │   ├── test_convertsage.test_convertsage.out
    │   ├── test_insilico_library.test_insilico_library.out
    │   ├── test_targetedfileconverter.test_targeted_file_converter_tsvtopqp.out
    │   ├── test_openswathassay_generator.test_openswath_assay_generator.out
    │   ├── test_targetedfileconverter.test_targeted_file_converter_pqptotsv.out
    │   └── test_openswathdecoy_generator.test_openswath_decoy_generator.out
    ├── test_convert.py
    ├── README.md
    ├── test_openswathdecoy_generator.py
    ├── test_openswathassay_generator.py
    ├── test_targetedfileconverter.py
    ├── test_convertsage.py
    └── test_insilico_library.py
├── easypqp
    ├── __init__.py
    ├── util.py
    ├── unimoddb.py
    ├── openswathdecoygenerator.py
    ├── targetedfileconverter.py
    ├── openswathassaygenerator.py
    ├── library.py
    └── sage.py
├── CONTRIBUTING.md
├── Dockerfile
├── .github
    └── workflows
    │   ├── dockerpublish.yml
    │   ├── ci.yml
    │   ├── pythonpublish.yml
    │   └── changelog.yml
├── pyproject.toml
├── LICENSE
├── .gitignore
├── CHANGELOG.md
├── requirements.txt
└── README.md


/tests/data/library_targets.pqp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/grosenberger/easypqp/HEAD/tests/data/library_targets.pqp


--------------------------------------------------------------------------------
/tests/data/test_transition_list.pqp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/grosenberger/easypqp/HEAD/tests/data/test_transition_list.pqp


--------------------------------------------------------------------------------
/tests/test_generate_ionseries.py:
--------------------------------------------------------------------------------
1 | from easypqp.convert import generate_ionseries
2 | 
3 | print(generate_ionseries(".(UniMod:1)ADQLTEEQIAEFK", 2))
4 | 


--------------------------------------------------------------------------------
/easypqp/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | # Global variable for location of packaged unimod.xml file
4 | location = os.path.dirname(os.path.realpath(__file__))
5 | pkg_unimod_db = os.path.join(location, 'data', 'unimod.xml')


--------------------------------------------------------------------------------
/tests/_regtest_outputs/test_convertsage.test_convertsage.out:
--------------------------------------------------------------------------------
1 | Info: Converting Sage inputs: results.sage.tsv + matched_fragments.sage.tsv
2 | Info: Reading Sage PSMs
3 | Info: Reading Sage matched fragment peaks
4 | Info: Wrote LQSRPAAPPAPGPGQLTLR.psmpkl and LQSRPAAPPAPGPGQLTLR.peakpkl
5 | Info: Total elapsed time 0.00 minutes.
6 | 


--------------------------------------------------------------------------------
/tests/data/Q99536.fasta:
--------------------------------------------------------------------------------
1 | >sp|Q99536|VAT1_HUMAN Synaptic vesicle membrane protein VAT-1 homolog OS=Homo sapiens OX=9606 GN=VAT1 PE=1 SV=2
2 | MSDEREVAEAATGEDASSPPPKTEAASDPQHPAASEGAAAAAASPPLLRCLVLTGFGGYD
3 | KVKLQSRPAAPPAPGPGQLTLRLRACGLNFADLMARQGLYDRLPPLPVTPGMEGAGVVIA
4 | VGEGVSDRKAGDRVMVLNRSGMWQEEVTVPSVQTFLIPEAMTFEEAAALLVNYITAYMVL
5 | FDFGNLQPGHSVLVHMAAGGVGMAAVQLCRTVENVTVFGTASASKHEALKENGVTHPIDY
6 | HTTDYVDEIKKISPKGVDIVMDPLGGSDTAKGYNLLKPMGKVVTYGMANLLTGPKRNLMA
7 | LARTWWNQFSVTALQLLQANRAVCGFHLGYLDGEVELVSGVVARLLALYNQGHIKPHIDS
8 | VWPFEKVADAMKQMQEKKNVGKVLLVPGPEKEN
9 | 


--------------------------------------------------------------------------------
/tests/data/results.sage.tsv:
--------------------------------------------------------------------------------
1 | psm_id	peptide	proteins	num_proteins	filename	scannr	rank	label	expmass	calcmass	charge	peptide_len	missed_cleavages	semi_enzymatic	isotope_error	precursor_ppm	fragment_ppm	hyperscore	delta_next	delta_best	rt	aligned_rt	predicted_rt	delta_rt_model	ion_mobility	predicted_mobility	delta_mobility	matched_peaks	longest_b	longest_y	longest_y_pct	matched_intensity_pct	scored_candidates	poisson	sage_discriminant_score	posterior_error	spectrum_q	peptide_q	protein_q	ms2_intensity
2 | 1	LQSRPAAPPAPGPGQLTLR	sp|Q99536|VAT1_HUMAN	1	LQSRPAAPPAPGPGQLTLR.mzML	controllerType=0 controllerNumber=1 scan=30069	1	1	1926.0815	1926.08	3	19	0	0	0.0	0.8239083	0.503857	72.26591573806016	72.26591573806016	0.0	108.2854	0.993444	0.0	0.993444	0.0	0.0	0.0	22	9	12	0.6315789	64.770966	1	-1.9562811911083433	1.2944585	1.0	1.0	1.0	1.0	72609170.0
3 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | ## Commit Message Convention
 2 | 
 3 | We follow [Conventional Commits](https://www.conventionalcommits.org/):
 4 | 
 5 | ```
 6 | <type>(<scope>): <description>
 7 | 
 8 | [optional body]
 9 | 
10 | [optional footer(s)]
11 | ```
12 | 
13 | ### Types:
14 | - `feat`: New feature
15 | - `fix`: Bug fix
16 | - `docs`: Documentation changes
17 | - `style`: Code style changes (formatting, etc.)
18 | - `refactor`: Code refactoring
19 | - `perf`: Performance improvements
20 | - `test`: Adding or updating tests
21 | - `chore`: Maintenance tasks
22 | 
23 | ### Examples:
24 | ```bash
25 | git commit -m "feat: add support for DuckDB backend"
26 | git commit -m "fix: resolve memory leak in scoring module"
27 | git commit -m "docs: update installation instructions"
28 | git commit -m "chore: update dependencies to latest versions"
29 | ```
30 | 
31 | ### Breaking Changes:
32 | ```bash
33 | git commit -m "feat!: remove deprecated API endpoints
34 | 
35 | BREAKING CHANGE: The /v1/score endpoint has been removed.
36 | Use /v2/score instead."
37 | ```


--------------------------------------------------------------------------------
/tests/data/matched_fragments.sage.tsv:
--------------------------------------------------------------------------------
 1 | psm_id	fragment_type	fragment_ordinals	fragment_charge	fragment_mz_calculated	fragment_mz_experimental	fragment_intensity
 2 | 1	b	2	1	242.14992	242.14989	578440.75
 3 | 1	b	3	1	329.18195	329.18304	33585.195
 4 | 1	b	4	1	485.28305	485.28275	703782.06
 5 | 1	b	5	1	582.3358	582.33417	362622.56
 6 | 1	b	6	1	653.3729	653.3723	678786.2
 7 | 1	b	7	1	724.41003	724.4097	11793456.0
 8 | 1	b	8	1	821.46277	821.4627	1807024.5
 9 | 1	b	9	1	918.5155	918.5154	556689.5
10 | 1	b	10	1	989.5526	989.55237	13877311.0
11 | 1	b	12	1	1143.627	1143.6226	920279.2
12 | 1	y	12	1	1203.6846	1203.6862	1344718.1
13 | 1	y	11	1	1106.6318	1106.6305	3311897.0
14 | 1	y	10	1	1009.57904	1009.57855	1904729.9
15 | 1	y	9	1	938.54193	938.5417	15733808.0
16 | 1	y	8	1	841.4892	841.4886	6905694.0
17 | 1	y	7	1	784.4677	784.46735	5842707.0
18 | 1	y	6	1	687.415	687.4143	2340203.2
19 | 1	y	5	1	630.3935	630.3935	839138.7
20 | 1	y	4	1	502.3349	502.33484	1299269.5
21 | 1	y	3	1	389.25085	389.251	840262.75
22 | 1	y	2	1	288.2032	288.20273	233942.19
23 | 1	y	1	1	175.11914	175.11903	700811.4
24 | 


--------------------------------------------------------------------------------
/tests/test_convert.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | from easypqp.convert import get_scan
 3 | 
 4 | 
 5 | class TestConvert(unittest.TestCase):
 6 | 
 7 |     def test_get_scan(self):
 8 |         self.assertEqual(11, get_scan("controllerType=0 controllerNumber=1 scan=2 demux=0", 11))
 9 |         self.assertEqual(11, get_scan("sample=2 period=3 cycle=4 experiment=5", 11))
10 |         self.assertEqual(11, get_scan("frame=2 scan=3", 11))
11 | 
12 |         self.assertEqual(11, get_scan("controllerType=0 controllerNumber=1 scan=11", 22))
13 |         self.assertEqual(11, get_scan("function=0 process=1 scan=11", 22))
14 |         self.assertEqual(11, get_scan("jobRun=0 spotLabel=asw spectrum=11", 22))
15 |         self.assertEqual(11, get_scan("11", 22))
16 |         self.assertEqual(11, get_scan("scan=11", 22))
17 |         self.assertEqual(11, get_scan("spectrum=11", 22))
18 |         self.assertEqual(11, get_scan("scanId=11", 22))
19 |         self.assertEqual(11, get_scan("index=11", 22))
20 |         self.assertEqual(11, get_scan("frame=11", 22))
21 | 
22 | 
23 | if __name__ == '__main__':
24 |     unittest.main()
25 | 
26 | 
27 | 
28 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # docker build -t easypqp:latest .
 2 | 
 3 | FROM python:3.11-slim
 4 | 
 5 | ENV DEBIAN_FRONTEND=noninteractive
 6 | ENV PYTHONUNBUFFERED=1
 7 | 
 8 | # Minimal build/runtime dependencies. Add or remove system packages as needed
 9 | # if package compilation fails (e.g., pyopenms may require extra libs).
10 | RUN apt-get update \
11 |  && apt-get install -y --no-install-recommends \
12 |    build-essential \
13 |    gcc \
14 |    git \
15 |    cmake \
16 |    swig \
17 |    pkg-config \
18 |    libxml2-dev \
19 |    zlib1g-dev \
20 |    libbz2-dev \
21 |    liblzma-dev \
22 |    libcurl4-openssl-dev \
23 |    libssl-dev \
24 |    # Runtime libraries required by pyopenms
25 |    libglib2.0-0 \
26 |    libgomp1 \
27 |  && rm -rf /var/lib/apt/lists/*
28 | 
29 | # Upgrade packaging tools
30 | RUN pip install --no-cache-dir --upgrade pip setuptools wheel
31 | 
32 | # Copy project into the image
33 | WORKDIR /tmp/easypqp
34 | COPY . /tmp/easypqp
35 | 
36 | # Install EasyPQP with all optional features by default
37 | RUN pip install --no-cache-dir ".[all]"
38 | 
39 | # Cleanup sources
40 | WORKDIR /
41 | RUN rm -rf /tmp/easypqp
42 | 
43 | # Default command prints help
44 | CMD ["easypqp","--help"]
45 | 


--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
 1 | README
 2 | ======
 3 | 
 4 | The scripts should be run with `py.test` (>=3.4.1) with installed plugin `pytest-regest`
 5 | (>=1.0.14 see https://pypi.python.org/pypi/pytest-regtest).
 6 | 
 7 | The plugin allows recording of approved output so that later test runs will check if
 8 | the output is still the same. It is simple to use as you can see in `test_via_regression.py`.
 9 | 
10 | In order to record output you have to use the `regtest` fixture like in the following example.
11 | This `regtest` behaves like a file handle, so you can write to it as usual:
12 | 
13 | ````
14 |     def test_0(regtest):
15 |         print >> regtest, "this is the recorded output"
16 | ````
17 | 
18 | If you now create a new test function `test_0` in a file `test_xyz.py`, first run
19 | 
20 | ````
21 |     $ py.test tests/test_xyz.py::test_0
22 | ````
23 | 
24 | which will show you the yet not approved output. You can approve this output using
25 | 
26 | ````
27 |     $ py.test --regtest-reset tests/test_xyz.py::test_0
28 | ````
29 | 
30 | Which will create a file in `tests/_regtest_outputs/test_xyz.test_0.out` which you should not forget to
31 | commit with `git`.
32 | 
33 | 
34 | Later runs like
35 | ````
36 |     $ py.test tests/test_xyz.py
37 | ````
38 | 
39 | will then check if the recorded output is still the same.
40 | 
41 | 


--------------------------------------------------------------------------------
/.github/workflows/dockerpublish.yml:
--------------------------------------------------------------------------------
 1 | name: Upload Docker image
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [published]
 6 | 
 7 | jobs:
 8 |   push_to_registries:
 9 |     name: Push Docker image to multiple registries
10 |     runs-on: ubuntu-latest
11 |     permissions:
12 |       packages: write
13 |       contents: read
14 |     steps:
15 |       - name: Check out the repo
16 |         uses: actions/checkout@v3
17 |       
18 |       - name: Log in to Docker Hub
19 |         uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
20 |         with:
21 |           username: ${{ secrets.DOCKER_USERNAME }}
22 |           password: ${{ secrets.DOCKER_PASSWORD }}
23 |       
24 |       - name: Log in to the Container registry
25 |         uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
26 |         with:
27 |           registry: ghcr.io
28 |           username: ${{ github.actor }}
29 |           password: ${{ secrets.GITHUB_TOKEN }}
30 |       
31 |       - name: Extract metadata (tags, labels) for Docker
32 |         id: meta
33 |         uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38
34 |         with:
35 |           images: |
36 |             grosenberger/easypqp
37 |             ghcr.io/${{ github.repository }}
38 |       
39 |       - name: Build and push Docker images
40 |         uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
41 |         with:
42 |           context: .
43 |           push: true
44 |           tags: ${{ steps.meta.outputs.tags }}
45 |           labels: ${{ steps.meta.outputs.labels }}
46 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools>=64.0.0", "wheel"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [project]
 6 | name = "easypqp"
 7 | version = "0.1.54"
 8 | description = "EasyPQP: Simple library generation for OpenSWATH"
 9 | readme = "README.md"
10 | requires-python = ">=3.9"
11 | authors = [
12 |   { name = "George Rosenberger", email = "gr2578@cumc.columbia.edu" },
13 | ]
14 | license = "BSD-3-Clause"
15 | # Core dependencies required for basic EasyPQP functionality
16 | dependencies = [
17 |   "numba",
18 |   "Click>=8.0.0",
19 |   "numpy==1.26.4",
20 |   "scipy",
21 |   "scikit-learn",
22 |   "statsmodels",
23 |   "pandas>=1.5.0",
24 |   "biopython",
25 |   "pyopenms>=3.3.0",
26 |   "matplotlib>=3.5.1",
27 |   "seaborn",
28 |   "tqdm",
29 | ]
30 | 
31 | [project.urls]
32 | Homepage = "https://github.com/grosenberger/easypqp"
33 | 
34 | [project.scripts]
35 | easypqp = "easypqp.main:cli"
36 | 
37 | [project.optional-dependencies]
38 | 
39 | # PyProphet integration
40 | # Install with: pip install easypqp[pyprophet]
41 | pyprophet = ["pyprophet"]
42 | 
43 | # Rust backend for in-silico library generation
44 | # Install with: pip install easypqp[rust]
45 | rust = ["easypqp_rs>=0.1.5"]
46 | 
47 | # All optional features
48 | # Install with: pip install easypqp[all]
49 | all = [
50 |   "pyprophet",
51 |   "easypqp_rs>=0.1.5"
52 | ]
53 | 
54 | [tool.setuptools]
55 | include-package-data = true
56 | 
57 | [tool.setuptools.packages.find]
58 | where = ["."]
59 | include = ["easypqp*"]
60 | 
61 | [tool.setuptools.package-data]
62 | easypqp = ["data/unimod.xml"]


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2019, George Rosenberger
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the copyright holder nor the names of its
17 |    contributors may be used to endorse or promote products derived from
18 |    this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/tests/data/config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "database": {
 3 |         "enzyme": {
 4 |             "missed_cleavages": 1,
 5 |             "min_len": 7,
 6 |             "max_len": 50,
 7 |             "cleave_at": "KR",
 8 |             "restrict": "P",
 9 |             "c_terminal": true,
10 |             "semi_enzymatic": false
11 |         },
12 |         "peptide_min_mass": 500.0,
13 |         "peptide_max_mass": 5000.0,
14 |         "static_mods": {
15 |             "C": 57.0215
16 |         },
17 |         "variable_mods": {},
18 |         "max_variable_mods": 2,
19 |         "decoy_tag": "rev_",
20 |         "generate_decoys": true,
21 |         "fasta": "tests/data/Q99536.fasta"
22 |     },
23 |     "insilico_settings": {
24 |         "precursor_charge": [
25 |             2,
26 |             4
27 |         ],
28 |         "max_fragment_charge": 1,
29 |         "min_transitions": 6,
30 |         "max_transitions": 6,
31 |         "fragmentation_model": "cid",
32 |         "allowed_fragment_types": [
33 |             "b",
34 |             "y"
35 |         ],
36 |         "rt_scale": 100.0
37 |     },
38 |     "dl_feature_generators": {
39 |         "device": "cpu",
40 |         "fine_tune_config": {
41 |             "fine_tune": false,
42 |             "train_data_path": "",
43 |             "batch_size": 256,
44 |             "epochs": 3,
45 |             "learning_rate": 0.001,
46 |             "save_model": true
47 |         },
48 |         "instrument": "QE",
49 |         "nce": 20.0,
50 |         "batch_size": 64
51 |     },
52 |     "peptide_chunking": 0,
53 |     "output_file": "tests/data/easypqp_insilico_library.tsv",
54 |     "write_report": true,
55 |     "parquet_output": false
56 | }


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | .idea


--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ master, main ]
 6 |   pull_request:
 7 |     branches: [ master, main ]
 8 | 
 9 | # Ensure only one run per branch/PR is active at a time; cancel previous runs when new commits arrive
10 | concurrency:
11 |   group: ${{ github.workflow }}-${{ github.ref }}
12 |   cancel-in-progress: true
13 | 
14 | jobs:
15 |   test:
16 |     name: Run tests
17 |     runs-on: ubuntu-latest
18 |     strategy:
19 |       fail-fast: false
20 |       matrix:
21 |         python-version: [ '3.10', '3.11', '3.12', '3.13' ]
22 | 
23 |     steps:
24 |       - name: Checkout repository
25 |         uses: actions/checkout@v4
26 | 
27 |       - name: Set up Python
28 |         uses: actions/setup-python@v4
29 |         with:
30 |           python-version: ${{ matrix.python-version }}
31 | 
32 |       - name: Install system dependencies
33 |         run: |
34 |           sudo apt-get update
35 |           sudo apt-get install -y --no-install-recommends \
36 |             build-essential gcc git cmake swig pkg-config libxml2-dev zlib1g-dev libbz2-dev liblzma-dev libcurl4-openssl-dev libssl-dev libglib2.0-0 libgomp1
37 | 
38 |       - name: Upgrade pip and setuptools
39 |         run: pip install --upgrade pip setuptools wheel
40 | 
41 |       - name: Cache pip
42 |         uses: actions/cache@v4
43 |         with:
44 |           path: ~/.cache/pip
45 |           key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('**/pyproject.toml') }}
46 |           restore-keys: |
47 |             ${{ runner.os }}-pip-${{ matrix.python-version }}-
48 | 
49 |       - name: Install package with all extras
50 |         run: pip install --no-cache-dir ".[all]"
51 | 
52 |       - name: Install test dependencies
53 |         run: python -m pip install pytest pytest-regtest
54 | 
55 |       - name: Run pytest
56 |         run: pytest -q
57 | 


--------------------------------------------------------------------------------
/.github/workflows/pythonpublish.yml:
--------------------------------------------------------------------------------
 1 | name: Upload Python Package
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [published]
 6 |   workflow_dispatch:
 7 |     inputs:
 8 |       tag:
 9 |         description: 'Tag to publish (e.g. 0.1.53). If omitted on manual dispatch the workflow will try to use the latest release tag.'
10 |         required: false
11 |         type: string
12 | 
13 | jobs:
14 |   deploy:
15 |     runs-on: ubuntu-latest
16 |     steps:
17 |       - name: Checkout repository (full history)
18 |         uses: actions/checkout@v4
19 |         with:
20 |           fetch-depth: 0
21 | 
22 |       - name: Checkout requested tag (workflow_dispatch)
23 |         if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' }}
24 |         run: |
25 |           git fetch --tags --force
26 |           git checkout tags/${{ github.event.inputs.tag }} -b publish-${{ github.event.inputs.tag }}
27 | 
28 |       - name: Checkout release tag (release event)
29 |         if: ${{ github.event_name == 'release' }}
30 |         run: |
31 |           git fetch --tags --force
32 |           git checkout tags/${{ github.event.release.tag_name }} -b publish-${{ github.event.release.tag_name }}
33 |       - name: Set up Python
34 |         uses: actions/setup-python@v4
35 |         with:
36 |           python-version: '3.x'
37 |       - name: Install build tools
38 |         run: |
39 |           python -m pip install --upgrade pip
40 |           python -m pip install --upgrade build twine
41 |       - name: Build and publish
42 |         env:
43 |           TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
44 |           TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
45 |         run: |
46 |           # Build sdist and wheel using PEP 517 build backend (reads pyproject.toml)
47 |           python -m build --sdist --wheel
48 |           # Upload artifacts with twine
49 |           python -m twine upload dist/*
50 |         
51 | 


--------------------------------------------------------------------------------
/tests/test_openswathdecoy_generator.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | 
 3 | import os
 4 | import subprocess
 5 | import shutil
 6 | import sys
 7 | 
 8 | import pandas as pd
 9 | import sqlite3
10 | 
11 | import pytest
12 | 
13 | pd.options.display.expand_frame_repr = False
14 | pd.options.display.precision = 4
15 | pd.options.display.max_columns = None
16 | 
17 | DATA_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
18 | 
19 | def _run_cmdline(cmdline):
20 |     stdout = cmdline + "\n"
21 |     try:
22 |         stdout += str(subprocess.check_output(cmdline, shell=True,
23 |                                           stderr=subprocess.STDOUT))
24 |     except subprocess.CalledProcessError as error:
25 |         print(error, end="", file=sys.stderr)
26 |         raise
27 |     return stdout
28 | 
29 | def _run_openswath_decoy_generator(regtest, temp_folder):
30 |     os.chdir(temp_folder)
31 |     data_path = os.path.join(DATA_FOLDER, "library_targets.pqp")
32 |     shutil.copy(data_path, temp_folder)
33 |     cmdline = "easypqp openswath-decoy-generator  --in library_targets.pqp --out library.pqp --method pseudo-reverse"
34 | 
35 |     stdout = _run_cmdline(cmdline)
36 | 
37 |     conn = sqlite3.connect("library.pqp")
38 |     protein_table = pd.read_sql_query("SELECT * FROM PROTEIN", conn)
39 |     peptide_table = pd.read_sql_query("SELECT * FROM PEPTIDE", conn)
40 |     precursor_table = pd.read_sql_query("SELECT * FROM PRECURSOR", conn)
41 |     transition_table = pd.read_sql_query("SELECT * FROM TRANSITION", conn)
42 |     conn.close()
43 | 
44 |     print(protein_table.sort_values("ID"),file=regtest)
45 |     print(peptide_table.sort_values("ID"),file=regtest)
46 |     print(precursor_table.sort_values("ID"),file=regtest)
47 |     print(transition_table.sort_values("ID"),file=regtest)
48 | 
49 | def test_openswath_decoy_generator(tmpdir, regtest):
50 |     _run_openswath_decoy_generator(regtest, tmpdir.strpath)


--------------------------------------------------------------------------------
/tests/test_openswathassay_generator.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | 
 3 | import os
 4 | import subprocess
 5 | import shutil
 6 | import sys
 7 | 
 8 | import pandas as pd
 9 | import sqlite3
10 | 
11 | import pytest
12 | 
13 | pd.options.display.expand_frame_repr = False
14 | pd.options.display.precision = 4
15 | pd.options.display.max_columns = None
16 | 
17 | DATA_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
18 | 
19 | def _run_cmdline(cmdline):
20 |     stdout = cmdline + "\n"
21 |     try:
22 |         stdout += str(subprocess.check_output(cmdline, shell=True,
23 |                                           stderr=subprocess.STDOUT))
24 |     except subprocess.CalledProcessError as error:
25 |         print(error, end="", file=sys.stderr)
26 |         raise
27 |     return stdout
28 | 
29 | def _run_openswath_assay_generator(regtest, temp_folder):
30 |     os.chdir(temp_folder)
31 |     data_path = os.path.join(DATA_FOLDER, "test_transition_list.tsv")
32 |     shutil.copy(data_path, temp_folder)
33 |     cmdline = "easypqp openswath-assay-generator  --in test_transition_list.tsv --out library_targets.pqp"
34 | 
35 |     stdout = _run_cmdline(cmdline)
36 | 
37 |     conn = sqlite3.connect("library_targets.pqp")
38 |     protein_table = pd.read_sql_query("SELECT * FROM PROTEIN", conn)
39 |     peptide_table = pd.read_sql_query("SELECT * FROM PEPTIDE", conn)
40 |     precursor_table = pd.read_sql_query("SELECT * FROM PRECURSOR", conn)
41 |     transition_table = pd.read_sql_query("SELECT * FROM TRANSITION", conn)
42 |     conn.close()
43 | 
44 |     print(protein_table.sort_values("ID"),file=regtest)
45 |     print(peptide_table.sort_values("ID"),file=regtest)
46 |     print(precursor_table.sort_values("ID"),file=regtest)
47 |     print(transition_table.sort_values("ID"),file=regtest)
48 | 
49 | def test_openswath_assay_generator(tmpdir, regtest):
50 |     _run_openswath_assay_generator(regtest, tmpdir.strpath)


--------------------------------------------------------------------------------
/tests/_regtest_outputs/test_insilico_library.test_insilico_library.out:
--------------------------------------------------------------------------------
 1 | Generated library contains 492 transitions
 2 | Number of unique precursors: 41
 3 | Number of unique peptides: 41
 4 | 
 5 | Columns: ['PrecursorMz', 'ProductMz', 'PrecursorCharge', 'ProductCharge', 'LibraryIntensity', 'NormalizedRetentionTime', 'PeptideSequence', 'ModifiedPeptideSequence', 'PeptideGroupLabel', 'LabelType', 'CompoundName', 'SumFormula', 'SMILES', 'Adducts', 'ProteinId', 'UniprotId', 'GeneName', 'FragmentType', 'FragmentSeriesNumber', 'Annotation', 'CollisionEnergy', 'PrecursorIonMobility', 'TransitionGroupId', 'TransitionId', 'Decoy', 'DetectingTransition', 'IdentifyingTransition', 'QuantifyingTransition', 'Peptidoforms']
 6 | 
 7 | First 5 transitions (deterministic columns only):
 8 |    PrecursorMz  ProductMz  PrecursorCharge  ProductCharge PeptideSequence ModifiedPeptideSequence ProteinId UniprotId    GeneName FragmentType  FragmentSeriesNumber Annotation  TransitionGroupId  TransitionId  Decoy
 9 | 0     393.7187   228.1343                2              1         NLMALAR                 NLMALAR    Q99536    Q99536  VAT1_HUMAN            b                     2       b2^1                  0             2      0
10 | 1     393.7187   246.1561                2              1         NLMALAR                 NLMALAR    Q99536    Q99536  VAT1_HUMAN            y                     2       y2^1                  0            20      0
11 | 2     393.7187   359.1748                2              1         NLMALAR                 NLMALAR    Q99536    Q99536  VAT1_HUMAN            b                     3       b3^1                  0             6      0
12 | 3     393.7187   359.2401                2              1         NLMALAR                 NLMALAR    Q99536    Q99536  VAT1_HUMAN            y                     3       y3^1                  0            16      0
13 | 4     393.7187   430.2772                2              1         NLMALAR                 NLMALAR    Q99536    Q99536  VAT1_HUMAN            y                     4       y4^1                  0            12      0
14 | 
15 | Statistics:
16 |   Precursor charge range: 2-3
17 |   Fragment types: ['b', 'y']
18 |   Contains decoys: False
19 |   Number of targets: 492
20 |   Number of decoys: 0
21 | 


--------------------------------------------------------------------------------
/tests/test_targetedfileconverter.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | 
 3 | import os
 4 | import subprocess
 5 | import shutil
 6 | import sys
 7 | 
 8 | import pandas as pd
 9 | import sqlite3
10 | 
11 | import pytest
12 | 
13 | pd.options.display.expand_frame_repr = False
14 | pd.options.display.precision = 4
15 | pd.options.display.max_columns = None
16 | 
17 | DATA_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
18 | 
19 | def _run_cmdline(cmdline):
20 |     stdout = cmdline + "\n"
21 |     try:
22 |         stdout += str(subprocess.check_output(cmdline, shell=True,
23 |                                           stderr=subprocess.STDOUT))
24 |     except subprocess.CalledProcessError as error:
25 |         print(error, end="", file=sys.stderr)
26 |         raise
27 |     return stdout
28 | 
29 | def _run_targetedfileconverter(regtest, temp_folder, infile, outfile):
30 |     os.chdir(temp_folder)
31 |     data_path = os.path.join(DATA_FOLDER, infile)
32 |     shutil.copy(data_path, temp_folder)
33 |     cmdline = f"easypqp targeted-file-converter  --in {infile} --out {outfile}"
34 | 
35 |     stdout = _run_cmdline(cmdline)
36 | 
37 |     if outfile.split(".")[1] == "pqp":
38 |         conn = sqlite3.connect(outfile)
39 |         protein_table = pd.read_sql_query("SELECT * FROM PROTEIN", conn)
40 |         peptide_table = pd.read_sql_query("SELECT * FROM PEPTIDE", conn)
41 |         precursor_table = pd.read_sql_query("SELECT * FROM PRECURSOR", conn)
42 |         transition_table = pd.read_sql_query("SELECT * FROM TRANSITION", conn)
43 |         conn.close()
44 | 
45 |         print(protein_table.sort_values("ID"),file=regtest)
46 |         print(peptide_table.sort_values("ID"),file=regtest)
47 |         print(precursor_table.sort_values("ID"),file=regtest)
48 |         print(transition_table.sort_values("ID"),file=regtest)
49 |     elif outfile.split(".")[1] == "tsv":
50 |         print(pd.read_csv(outfile, sep="\t", nrows=100).sort_index(axis=1),file=regtest)
51 | 
52 | 
53 | def test_targeted_file_converter_tsvtopqp(tmpdir, regtest):
54 |     _run_targetedfileconverter(regtest, tmpdir.strpath, "test_transition_list.tsv", "test_transition_list.pqp")
55 | 
56 | def test_targeted_file_converter_pqptotsv(tmpdir, regtest):
57 |     _run_targetedfileconverter(regtest, tmpdir.strpath, "test_transition_list.pqp", "test_transition_list.tsv")


--------------------------------------------------------------------------------
/easypqp/util.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | import json
 3 | import tempfile
 4 | from typing import Union
 5 | import click
 6 | 
 7 | 
 8 | def timestamped_echo(message):
 9 |     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
10 |     click.echo(f"{timestamp} - {message}")
11 | 
12 | 
13 | 
14 | 
15 | def create_json_config(as_bytes: bool = False) -> Union[str, bytes]:
16 |     """
17 |     Create a JSON configuration file for EasyPQP In-silico library generation.
18 |     """
19 |     config = {
20 |         "version": "0.1.0",
21 |         "database": {
22 |             "enzyme": {
23 |                 "missed_cleavages": 1,
24 |                 "min_len": None,
25 |                 "max_len": None,
26 |                 "cleave_at": "KR",
27 |                 "restrict": "P",
28 |                 "c_terminal": None,
29 |                 "semi_enzymatic": None
30 |             },
31 |             "peptide_min_mass": 500.0,
32 |             "peptide_max_mass": 5000.0,
33 |             "static_mods": {
34 |                 "C": 57.0215
35 |             },
36 |             "variable_mods": {},
37 |             "max_variable_mods": 2,
38 |             "decoy_tag": "rev_",
39 |             "generate_decoys": True,
40 |             "fasta": ""
41 |         },
42 |         "insilico_settings": {
43 |             "precursor_charge": [2, 4],
44 |             "max_fragment_charge": 1,
45 |             "min_transitions": 6,
46 |             "max_transitions": 6,
47 |             "fragmentation_model": "cid",
48 |             "allowed_fragment_types": ["b", "y"],
49 |             "rt_scale": 100.0
50 |         },
51 |         "dl_feature_generators": {
52 |             "device": "cpu",
53 |             "fine_tune_config": {
54 |                 "fine_tune": False,
55 |                 "train_data_path": "",
56 |                 "batch_size": 256,
57 |                 "epochs": 3,
58 |                 "learning_rate": 0.001,
59 |                 "save_model": True
60 |             },
61 |             "instrument": "QE",
62 |             "nce": 20.0,
63 |             "batch_size": 64
64 |         },
65 |         "peptide_chunking": 0,
66 |         "output_file": "./easypqp_insilico_library.tsv",
67 |         "write_report": True,
68 |         "parquet_output": False
69 |     }
70 | 
71 |     json_str = json.dumps(config, indent=2)
72 | 
73 |     if as_bytes:
74 |         return json_str.encode('utf-8')
75 |     else:
76 |         with tempfile.NamedTemporaryFile('w+', suffix=".json", delete=False) as tmp:
77 |             tmp.write(json_str)
78 |             tmp.flush()
79 |             return tmp.name
80 | 


--------------------------------------------------------------------------------
/tests/test_convertsage.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import subprocess
 3 | import shutil
 4 | import sys
 5 | 
 6 | import pandas as pd
 7 | import re
 8 | 
 9 | DATA_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
10 | 
11 | 
12 | def _run_cmdline(cmdline):
13 |     try:
14 |         out = subprocess.check_output(cmdline, shell=True, stderr=subprocess.STDOUT)
15 |         return out.decode(errors="replace")
16 |     except subprocess.CalledProcessError as error:
17 |         out = error.output.decode() if error.output else ""
18 |         print(out, file=sys.stderr)
19 |         raise
20 | 
21 | 
22 | def _run_convertsage(temp_folder, regtest):
23 |     os.chdir(temp_folder)
24 | 
25 |     # Copy test files to temp directory
26 |     shutil.copy(os.path.join(DATA_FOLDER, "results.sage.tsv"), temp_folder)
27 |     shutil.copy(os.path.join(DATA_FOLDER, "matched_fragments.sage.tsv"), temp_folder)
28 | 
29 |     cmdline = (
30 |         "easypqp convertsage --sage_psm results.sage.tsv "
31 |         "--sage_fragments matched_fragments.sage.tsv"
32 |     )
33 | 
34 |     out = _run_cmdline(cmdline)
35 |     # Strip leading timestamps of the form 'YYYY-MM-DD HH:MM:SS - ' and
36 |     # filter out pyopenms environment warnings which are non-deterministic
37 |     cleaned_lines = []
38 |     for line in out.splitlines():
39 |         line = re.sub(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - ", "", line)
40 |         # Remove pyopenms/openms data-path warnings which leak local site-packages paths
41 |         if re.search(r"pyopenms", line, flags=re.IGNORECASE) or re.search(
42 |             r"OPENMS_DATA_PATH", line
43 |         ):
44 |             continue
45 |         cleaned_lines.append(line)
46 |     cleaned = "\n".join(cleaned_lines)
47 |     print(cleaned, file=regtest)
48 | 
49 |     # Expect output files for run 'LQSRPAAPPAPGPGQLTLR'
50 |     run_stem = "LQSRPAAPPAPGPGQLTLR"
51 |     psmpkl = f"{run_stem}.psmpkl"
52 |     peakpkl = f"{run_stem}.peakpkl"
53 | 
54 |     assert os.path.exists(psmpkl), f"Missing expected output {psmpkl}"
55 |     assert os.path.exists(peakpkl), f"Missing expected output {peakpkl}"
56 | 
57 |     # Verify pickles load and contain expected columns
58 |     psms = pd.read_pickle(psmpkl)
59 |     peaks = pd.read_pickle(peakpkl)
60 | 
61 |     assert not psms.empty, "psmpkl is empty"
62 |     assert not peaks.empty, "peakpkl is empty"
63 | 
64 |     # Basic schema checks
65 |     assert "run_id" in psms.columns
66 |     assert "scan_id" in psms.columns
67 |     assert "run_id" in peaks.columns
68 |     assert "product_mz" in peaks.columns or "fragment" in peaks.columns
69 | 
70 | 
71 | def test_convertsage(tmpdir, regtest):
72 |     _run_convertsage(tmpdir.strpath, regtest)
73 | 


--------------------------------------------------------------------------------
/.github/workflows/changelog.yml:
--------------------------------------------------------------------------------
 1 | # filepath: .github/workflows/changelog.yml
 2 | name: Generate Changelog
 3 | 
 4 | on:
 5 |   workflow_dispatch:  # Manual trigger
 6 |     inputs:
 7 |       version:
 8 |         description: 'Version tag (e.g., 3.0.4 or v3.0.4). If omitted the latest tag will be used.'
 9 |         required: false
10 |         type: string
11 |   push:
12 |     # Trigger on any tag so releases without a 'v' prefix (e.g. '0.1.53') also run
13 |     tags:
14 |       - '*'
15 | 
16 | concurrency:
17 |   group: ${{ github.workflow }}-${{ github.ref }}
18 |   cancel-in-progress: true
19 | 
20 | jobs:
21 |   changelog:
22 |     name: Generate Changelog
23 |     runs-on: ubuntu-latest
24 |     permissions:
25 |       contents: write
26 |       pull-requests: write
27 |     
28 |     steps:
29 |       - name: Checkout code
30 |         uses: actions/checkout@v4
31 |         with:
32 |           fetch-depth: 0  # Fetch all history for changelog generation
33 | 
34 |       - name: Install git-cliff
35 |         run: |
36 |           wget https://github.com/orhun/git-cliff/releases/download/v2.7.0/git-cliff-2.7.0-x86_64-unknown-linux-gnu.tar.gz
37 |           tar -xzf git-cliff-2.7.0-x86_64-unknown-linux-gnu.tar.gz
38 |           sudo mv git-cliff-2.7.0/git-cliff /usr/local/bin/
39 |           chmod +x /usr/local/bin/git-cliff
40 | 
41 |       - name: Generate full CHANGELOG
42 |         run: |
43 |           git-cliff --output CHANGELOG.md
44 | 
45 |       - name: Generate release notes for latest tag
46 |         if: startsWith(github.ref, 'refs/tags/')
47 |         run: |
48 |           # Get the latest tag
49 |           LATEST_TAG=$(git describe --tags --abbrev=0)
50 |           
51 |           # Generate changelog for this release only
52 |           git-cliff --latest --strip header > RELEASE_NOTES.md
53 |           
54 |           echo "Release notes for ${LATEST_TAG}:"
55 |           cat RELEASE_NOTES.md
56 | 
57 |       - name: Commit and push CHANGELOG
58 |         if: github.event_name == 'workflow_dispatch' || startsWith(github.ref, 'refs/tags/')
59 |         run: |
60 |           git config user.name "github-actions[bot]"
61 |           git config user.email "github-actions[bot]@users.noreply.github.com"
62 |           git add CHANGELOG.md
63 |           
64 |           if git diff --staged --quiet; then
65 |             echo "No changes to CHANGELOG.md"
66 |           else
67 |             git commit -m "chore: update CHANGELOG.md"
68 |             git push origin HEAD:master || git push origin HEAD:main
69 |           fi
70 | 
71 |       - name: Create/Update Release with Changelog
72 |         if: startsWith(github.ref, 'refs/tags/')
73 |         uses: softprops/action-gh-release@v1
74 |         with:
75 |           body_path: RELEASE_NOTES.md
76 |           draft: false
77 |           prerelease: false
78 |         env:
79 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}


--------------------------------------------------------------------------------
/tests/_regtest_outputs/test_targetedfileconverter.test_targeted_file_converter_tsvtopqp.out:
--------------------------------------------------------------------------------
 1 |    ID PROTEIN_ACCESSION  DECOY
 2 | 4   0            Q04637      0
 3 | 3   1            Q2M2I8      0
 4 | 2   2            Q86WB0      0
 5 | 1   3            Q8WWI1      0
 6 | 0   4            Q92890      0
 7 |    ID       UNMODIFIED_SEQUENCE                         MODIFIED_SEQUENCE  DECOY
 8 | 4   0      AGQTQPNPGILPIQPALTPR           AGQTQPNPGILPIQPALT(UniMod:21)PR      0
 9 | 3   1  ATLSSTSGLDLMSESGEGEISPQR       ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR      0
10 | 2   2               EAALPPVSPLK                    EAALPPVS(UniMod:21)PLK      0
11 | 1   3           GVEPSPSPIKPGDIK                GVEPSPS(UniMod:21)PIKPGDIK      0
12 | 0   4        SMGTGDTPGLEVPSSPLR  SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR      0
13 |    ID                                  TRAML_ID GROUP_LABEL  PRECURSOR_MZ  CHARGE LIBRARY_INTENSITY  LIBRARY_RT  LIBRARY_DRIFT_TIME  DECOY
14 | 3   0           AGQTQPNPGILPIQPALT(Phospho)PR_2                 1075.0619       2              None     70.6096                -1.0      0
15 | 4   1       ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2                 1266.5622       2              None     67.4251                -1.0      0
16 | 0   2                    EAALPPVS(Phospho)PLK_2                  601.3151       2              None     48.0082                -1.0      0
17 | 1   3                GVEPSPS(Phospho)PIKPGDIK_2                  800.9028       2              None     32.9698                -1.0      0
18 | 2   4  SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2                  948.9241       2              None     52.9017                -1.0      0
19 |     ID TRAML_ID  PRODUCT_MZ  CHARGE TYPE ANNOTATION  ORDINAL  DETECTING  IDENTIFYING  QUANTIFYING  LIBRARY_INTENSITY  DECOY
20 | 0    0     2345    260.1969       1    y       y2^1        2          1            0            1            81.9345      0
21 | 1    1     2346    409.2146       2    y       y7^2        7          1            0            1          2965.7283      0
22 | 2    2     2347    465.7566       2    y       y8^2        8          1            0            1           132.8395      0
23 | 3    3     2348    623.3164       1    y       y5^1        5          1            0            1           101.3607      0
24 | 4    4     2349    720.3692       1    y       y6^1        6          1            0            1          1580.4800      0
25 | ..  ..      ...         ...     ...  ...        ...      ...        ...          ...          ...                ...    ...
26 | 73  73    42472   1264.6090       1    b      b13^1       13          1            0            1           890.7413      0
27 | 74  74    42473   1268.5155       1    y      y11^1       11          1            0            1          1830.4344      0
28 | 75  75    42474   1355.5475       1    y      y12^1       12          1            0            1          2691.2388      0
29 | 76  76    42475   1393.6515       1    b      b14^1       14          1            0            1           870.2799      0
30 | 77  77    42476   1486.5880       1    y      y13^1       13          1            0            1           851.3514      0
31 | 
32 | [78 rows x 12 columns]
33 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | 
 3 | All notable changes to this project will be documented in this file.
 4 | 
 5 | ## [0.1.54] - 2025-12-12
 6 | 
 7 | ### 🚀 Features
 8 | 
 9 | - Add Sage PSM and fragment parsers for EasyPQP conversion
10 | - Add Sage input options for library generation
11 | - Enhance SagePSMParser with protein token parsing and unique accessions handling
12 | - Add max delta mass option for UniMod annotation in library generation
13 | - Implement table reading function for TSV and Parquet files in SagePSMParser
14 | - Add mz precision option to SagePSMParser and conversion function
15 | - Add convertsage cli command
16 | - Add DataFrame parsing methods for Sage PSM and fragment data to support streaming
17 | - Add streaming options to convertsage for improved performance with large inputs
18 | - Enhance convert_sage_streaming for improved memory efficiency and processing speed
19 | - Update streaming threshold to 2GB
20 | - Add EasyPQP In-Silico Library Generation command
21 | - Add EasyPQP In-Silico Library Generation command
22 | - Enhance insilico_library options with RT scaling, report generation, Parquet output, and threading support
23 | - Add insilico library generation test and configuration files
24 | - Add matched_fragments and results data files for testing
25 | - Implement optional in-silico library generation with rust backend support
26 | - Add CI workflow for testing across multiple Python versions
27 | - Add concurrency configuration to CI workflow for improved efficiency
28 | 
29 | ### 🐛 Bug Fixes
30 | 
31 | - Correct documentation for output columns in SagePSMParser
32 | - Handle import for transform_pi0_lambda across PyProphet versions
33 | - Precursor mz calculcation
34 | - Enhance _basename_wo_ext to handle common compression extensions
35 | - Improve logging messages for streaming conversion in convert_sage
36 | - Add missing easypqp_rs dependency in pyproject.toml
37 | - Remove timestamps from convertsage output for deterministic testing
38 | - Enhance output cleaning in _run_convertsage for deterministic testing
39 | - Pyprophet import error
40 | - Update version to 0.1.54 in pyproject.toml
41 | - Update changelog workflow to allow any tag and improve version input description
42 | 
43 | ### 💼 Other
44 | 
45 | - Parameter transformation function for Click options
46 | 
47 | ### 📚 Documentation
48 | 
49 | - Update README to reflect support for Sage
50 | - Update README with CLI commands for easypqp
51 | - Add doc for _get_first_existing function to retrieve existing DataFrame columns with optional casting
52 | - Update README to include in-silico library generation command and details
53 | - Add information about standalone portable rust binary in README
54 | - Update README to clarify installation of optional features and in-silico library generation
55 | 
56 | ### 🧪 Testing
57 | 
58 | - Add test for convertsage functionality with output validation
59 | 
60 | ### ⚙️ Miscellaneous Tasks
61 | 
62 | - Add requirements.txt for dependency management
63 | - Add changelog generation workflow and update CONTRIBUTING guidelines
64 | - Update pyproject.toml to clarify optional dependencies and remove redundant entries
65 | - Update Dockerfile to use Python 3.11-slim
66 | 
67 | <!-- generated by git-cliff -->
68 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
  1 | #
  2 | # This file is autogenerated by pip-compile with Python 3.10
  3 | # by the following command:
  4 | #
  5 | #    pip-compile --all-extras --output-file=requirements.txt
  6 | #
  7 | biopython==1.85
  8 |     # via easypqp (pyproject.toml)
  9 | click==8.3.0
 10 |     # via
 11 |     #   easypqp (pyproject.toml)
 12 |     #   pyprophet
 13 | contourpy==1.3.2
 14 |     # via matplotlib
 15 | cycler==0.12.1
 16 |     # via matplotlib
 17 | cython==3.1.4
 18 |     # via pyprophet
 19 | duckdb==1.3.2
 20 |     # via
 21 |     #   duckdb-extension-sqlite-scanner
 22 |     #   duckdb-extensions
 23 |     #   pyprophet
 24 | duckdb-extension-sqlite-scanner==1.3.2
 25 |     # via pyprophet
 26 | duckdb-extensions==1.3.2
 27 |     # via pyprophet
 28 | fonttools==4.60.2
 29 |     # via matplotlib
 30 | joblib==1.5.2
 31 |     # via scikit-learn
 32 | kiwisolver==1.4.9
 33 |     # via matplotlib
 34 | llvmlite==0.45.1
 35 |     # via numba
 36 | loguru==0.7.3
 37 |     # via pyprophet
 38 | lxml==6.0.2
 39 |     # via pyprophet
 40 | matplotlib==3.10.7
 41 |     # via
 42 |     #   easypqp (pyproject.toml)
 43 |     #   pyopenms
 44 |     #   pyprophet
 45 |     #   seaborn
 46 | numba==0.62.1
 47 |     # via easypqp (pyproject.toml)
 48 | numexpr==2.14.1
 49 |     # via pyprophet
 50 | numpy==1.26.4
 51 |     # via
 52 |     #   biopython
 53 |     #   contourpy
 54 |     #   easypqp (pyproject.toml)
 55 |     #   matplotlib
 56 |     #   numba
 57 |     #   numexpr
 58 |     #   pandas
 59 |     #   patsy
 60 |     #   pyopenms
 61 |     #   pyprophet
 62 |     #   scikit-learn
 63 |     #   scipy
 64 |     #   seaborn
 65 |     #   statsmodels
 66 |     #   xgboost
 67 | nvidia-nccl-cu12==2.28.3
 68 |     # via xgboost
 69 | packaging==25.0
 70 |     # via
 71 |     #   matplotlib
 72 |     #   statsmodels
 73 | pandas==2.3.3
 74 |     # via
 75 |     #   easypqp (pyproject.toml)
 76 |     #   pyopenms
 77 |     #   pyprophet
 78 |     #   seaborn
 79 |     #   statsmodels
 80 | patsy==1.0.1
 81 |     # via statsmodels
 82 | pillow==12.0.0
 83 |     # via matplotlib
 84 | polars==1.34.0
 85 |     # via pyprophet
 86 | polars-runtime-32==1.34.0
 87 |     # via polars
 88 | psutil==7.1.0
 89 |     # via pyprophet
 90 | pyarrow==21.0.0
 91 |     # via pyprophet
 92 | pyopenms==3.4.0
 93 |     # via
 94 |     #   easypqp (pyproject.toml)
 95 |     #   pyprophet
 96 | pyparsing==3.2.5
 97 |     # via matplotlib
 98 | pypdf==6.4.0
 99 |     # via pyprophet
100 | pyprophet==3.0.2
101 |     # via easypqp (pyproject.toml)
102 | python-dateutil==2.9.0.post0
103 |     # via
104 |     #   matplotlib
105 |     #   pandas
106 | pytz==2025.2
107 |     # via pandas
108 | scikit-learn==1.7.2
109 |     # via
110 |     #   easypqp (pyproject.toml)
111 |     #   pyprophet
112 | scipy==1.15.3
113 |     # via
114 |     #   easypqp (pyproject.toml)
115 |     #   pyprophet
116 |     #   scikit-learn
117 |     #   statsmodels
118 |     #   xgboost
119 | seaborn==0.13.2
120 |     # via
121 |     #   easypqp (pyproject.toml)
122 |     #   pyprophet
123 | six==1.17.0
124 |     # via python-dateutil
125 | statsmodels==0.14.5
126 |     # via
127 |     #   easypqp (pyproject.toml)
128 |     #   pyprophet
129 | tabulate==0.9.0
130 |     # via pyprophet
131 | threadpoolctl==3.6.0
132 |     # via scikit-learn
133 | tqdm==4.67.1
134 |     # via easypqp (pyproject.toml)
135 | typing-extensions==4.15.0
136 |     # via pypdf
137 | tzdata==2025.2
138 |     # via pandas
139 | xgboost==3.1.0
140 |     # via pyprophet
141 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | EasyPQP: Simple library generation for OpenSWATH
 2 | ================================================
 3 | 
 4 | [![CI](https://github.com/grosenberger/easypqp/actions/workflows/ci.yml/badge.svg)](https://github.com/grosenberger/easypqp/actions/workflows/ci.yml)
 5 | 
 6 | EasyPQP is a Python package that provides simplified and fast peptide query parameter generation for OpenSWATH. It can process input from MSFragger, Sage or other database search engines in pepXML/idXML/tsv format. Statistical validation can be conducted either using PyProphet or PeptideProphet/iProphet. Retention times and ion mobilities are calibrated using an internal or external standard. In addition to a cumulative library, run-specific libraries are generated for non-linear RT alignment in OpenSWATH. For generation of PTM specific libraries that utilizes a unimod.xml database, you can further restrict the unimod.xml database file for modifications and site-specificities of interest. It also supports in-silico library generation.
 7 | 
 8 | Installation
 9 | ============
10 | 
11 | We strongly advice to install EasyPQP in a Python [*virtualenv*](https://virtualenv.pypa.io/en/stable/). EasyPQP is compatible with Python 3.
12 | 
13 | Install the development version of *easypqp* from GitHub:
14 | 
15 | ````
16 |     $ pip install git+https://github.com/grosenberger/easypqp.git@master
17 | ````
18 | 
19 | ### Full Installation
20 | 
21 | To install all optional features:
22 | 
23 | ````
24 |     $ pip install easypqp[all]
25 | ````
26 | 
27 | This will install the `easypqp_rs` package, which provides the in-silico library generation feature and pyprophet for statistical validation.
28 | 
29 | Running EasyPQP
30 | ===============
31 | 
32 | *EasyPQP* is not only a Python package, but also a command line tool:
33 | 
34 | ````
35 |    $ easypqp --help
36 | ````
37 | 
38 | or:
39 | 
40 | ````
41 |    $ easypqp convert --help
42 |    $ easypqp convertpsm --help
43 |    $ easypqp convertsage --help
44 |    $ easypqp library --help
45 |    $ easypqp insilico-library --help
46 |    $ easypqp reduce --help
47 |    $ easypqp filter-unimod --help
48 |    $ easypqp openswath-assay-generator --help
49 |    $ easypqp openswath-decoy-generator --help
50 |    $ easypqp targeted-file-converter --help
51 | ````
52 | 
53 | Generating an *In-Silico* Library
54 | =================================
55 | 
56 | The in-silico library generation feature is included if you installed EasyPQP with the `[all]`  or `[rust]` extras (to install the `easypqp_rs` package).
57 | 
58 | To generate an in-silico library, you can use the `insilico-library` command. For example:
59 | 
60 | ````
61 |    $ easypqp insilico-library --fasta your_proteome.fasta --output_file insilico_library.tsv
62 | ````
63 | 
64 | For more information on the parameters and JSON configuration file, see the [Configuration Reference](https://github.com/singjc/easypqp-rs?tab=readme-ov-file#configuration-reference)
65 | 
66 | > [!NOTE]
67 | > If no `retention_time`, `ion_mobility`, or `ms2_intensity` fields are provided under `dl_feature_generators` in the config, pretrained models will be automatically downloaded and used. The current default pretrained models used are:
68 | > - RT: `rt_cnn_tf` - A CNN-Transformer model trained on the [ProteomicsML repository RT dataset](https://proteomicsml.org/datasets/retentiontime/ProteomeTools_RT.html). This model is based on AlphaPeptDeep's CNN-LSTM implementation, with the biLSTM replaced by a Transformer encoder.
69 | > - CCS: `ccs_cnn_tf` - A CNN-Transformer model trained on the [ProteomicsML repository CCS dataset](https://proteomicsml.org/datasets/ionmobility/Meier_TIMS.html). This model is also based on AlphaPeptDeep's CNN-LSTM implementation, with the biLSTM replaced by a Transformer encoder.
70 | > - MS2: `ms2_bert` - A BERT-based model retreived from AlphaPeptDeep's pretrained models.
71 | 
72 | If you want just a standalone portable rust binary, you can download one from the [easypqp-rs releases page](https://github.com/singjc/easypqp-rs/releases).
73 | 
74 | Docker
75 | ======
76 | 
77 | EasyPQP is also available from Docker (automated builds):
78 | 
79 | Pull the development version of *easypqp* from DockerHub (synced with GitHub):
80 | 
81 | ````
82 |     $ docker pull grosenberger/easypqp:latest
83 | ````
84 | 


--------------------------------------------------------------------------------
/tests/_regtest_outputs/test_openswathassay_generator.test_openswath_assay_generator.out:
--------------------------------------------------------------------------------
 1 |    ID PROTEIN_ACCESSION  DECOY
 2 | 4   0            Q04637      0
 3 | 3   1            Q2M2I8      0
 4 | 2   2            Q86WB0      0
 5 | 1   3            Q8WWI1      0
 6 | 0   4            Q92890      0
 7 |    ID       UNMODIFIED_SEQUENCE                         MODIFIED_SEQUENCE  DECOY
 8 | 4   0      AGQTQPNPGILPIQPALTPR           AGQTQPNPGILPIQPALT(UniMod:21)PR      0
 9 | 3   1  ATLSSTSGLDLMSESGEGEISPQR       ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR      0
10 | 2   2               EAALPPVSPLK                    EAALPPVS(UniMod:21)PLK      0
11 | 1   3           GVEPSPSPIKPGDIK                GVEPSPS(UniMod:21)PIKPGDIK      0
12 | 0   4        SMGTGDTPGLEVPSSPLR  SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR      0
13 |    ID                                  TRAML_ID GROUP_LABEL  PRECURSOR_MZ  CHARGE LIBRARY_INTENSITY  LIBRARY_RT  LIBRARY_DRIFT_TIME  DECOY
14 | 3   0           AGQTQPNPGILPIQPALT(Phospho)PR_2                 1075.0619       2              None     70.6096                -1.0      0
15 | 4   1       ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2                 1266.5622       2              None     67.4251                -1.0      0
16 | 0   2                    EAALPPVS(Phospho)PLK_2                  601.3151       2              None     48.0082                -1.0      0
17 | 1   3                GVEPSPS(Phospho)PIKPGDIK_2                  800.9028       2              None     32.9698                -1.0      0
18 | 2   4  SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2                  948.9241       2              None     52.9017                -1.0      0
19 |     ID TRAML_ID  PRODUCT_MZ  CHARGE TYPE ANNOTATION  ORDINAL  DETECTING  IDENTIFYING  QUANTIFYING  LIBRARY_INTENSITY  DECOY
20 | 0    0    31640    486.2307       1    b       b5^1        5          1            0            1         10000.0000      0
21 | 1    1    31641    697.3264       1    b       b7^1        7          1            0            1          7081.5693      0
22 | 2    2    31643    734.3597       1    y       y6^1        6          1            0            1          8579.1080      0
23 | 3    3    31644    832.4502       2    y      y15^2       15          1            0            1          2923.7356      0
24 | 4    4    31646    964.4847       1    b      b10^1       10          1            0            1          3234.0083      0
25 | 5    5    31647   1072.5551       1    y       y9^1        9          1            0            1          3853.8560      0
26 | 6    6    42446    400.2303       1    y       y3^1        3          1            0            1          7762.3594      0
27 | 7    7    42450    567.2287       1    y       y4^1        4          1            0            1          8796.0370      0
28 | 8    8    42457    818.4254       1    b       b9^1        9          1            0            1          5399.1875      0
29 | 9    9    42458    866.3768       1    y       y7^1        7          1            0            1          6659.5240      0
30 | 10  10    42459    933.4524       1    b      b10^1       10          1            0            1          6236.0680      0
31 | 11  11    42468   1139.4729       1    y      y10^1       10          1            0            1          3636.2630      0
32 | 12  12     2346    409.2146       2    y       y7^2        7          1            0            1          2965.7283      0
33 | 13  13     2347    465.7566       2    y       y8^2        8          1            0            1           132.8395      0
34 | 14  14     2349    720.3692       1    y       y6^1        6          1            0            1          1580.4800      0
35 | 15  15     2350    817.4219       1    y       y7^1        7          1            0            1         10000.0000      0
36 | 16  16     2352    930.5060       1    y       y8^1        8          1            0            1           417.7673      0
37 | 17  17     2353   1001.5431       1    y       y9^1        9          1            0            1           278.9014      0
38 | 18  18    12006    375.2238       1    y       y3^1        3          1            0            1          1621.3933      0
39 | 19  19    12009    529.2980       1    y       y5^1        5          1            0            1         10000.0000      0
40 | 20  20    12010    566.2941       2    y      y10^2       10          1            0            1          3326.0842      0
41 | 21  21    12012    657.3930       1    y       y6^1        6          1            0            1          1924.5614      0
42 | 22  22    12013    658.3365       2    y      y12^2       12          1            0            1          4464.7860      0
43 | 23  23    12015    867.5298       1    y       y8^1        8          1            0            1          5222.4050      0
44 | 24  24    21489    385.2558       1    y       y3^1        3          1            0            1          5179.5244      0
45 | 25  25    21490    393.1438       1    b       b4^1        4          1            0            1          2606.7244      0
46 | 26  26    21491    565.1923       1    b       b6^1        6          1            0            1          3256.2622      0
47 | 27  27    21493    666.2399       1    b       b7^1        7          1            0            1          3735.2622      0
48 | 28  28    21494    736.3389       1    y       y6^1        6          1            0            1         10000.0000      0
49 | 29  29    21496    835.4073       1    y       y7^1        7          1            0            1          3901.4023      0
50 | 


--------------------------------------------------------------------------------
/tests/_regtest_outputs/test_targetedfileconverter.test_targeted_file_converter_pqptotsv.out:
--------------------------------------------------------------------------------
 1 |     Adducts Annotation  CollisionEnergy  CompoundName  Decoy  DetectingTransition  FragmentSeriesNumber FragmentType GeneName  IdentifyingTransition  LabelType  LibraryIntensity              ModifiedPeptideSequence  NormalizedRetentionTime  PeptideGroupLabel           PeptideSequence  Peptidoforms  PrecursorCharge  PrecursorIonMobility  PrecursorMz  ProductCharge  ProductMz ProteinId  QuantifyingTransition  SMILES  SumFormula                    TransitionGroupId  TransitionId  UniprotId
 2 | 0       NaN       y2^1             -1.0           NaN      0                    1                     2            y   EIF4G1                      0        NaN           81.9345               EAALPPVS(UniMod:21)PLK                  48.0082                NaN               EAALPPVSPLK           NaN                2                  -1.0     601.3151              1   260.1969    Q04637                      1     NaN         NaN               EAALPPVS(Phospho)PLK_2          2345        NaN
 3 | 1       NaN       y7^2             -1.0           NaN      0                    1                     7            y   EIF4G1                      0        NaN         2965.7283               EAALPPVS(UniMod:21)PLK                  48.0082                NaN               EAALPPVSPLK           NaN                2                  -1.0     601.3151              2   409.2146    Q04637                      1     NaN         NaN               EAALPPVS(Phospho)PLK_2          2346        NaN
 4 | 2       NaN       y8^2             -1.0           NaN      0                    1                     8            y   EIF4G1                      0        NaN          132.8395               EAALPPVS(UniMod:21)PLK                  48.0082                NaN               EAALPPVSPLK           NaN                2                  -1.0     601.3151              2   465.7566    Q04637                      1     NaN         NaN               EAALPPVS(Phospho)PLK_2          2347        NaN
 5 | 3       NaN       y5^1             -1.0           NaN      0                    1                     5            y   EIF4G1                      0        NaN          101.3607               EAALPPVS(UniMod:21)PLK                  48.0082                NaN               EAALPPVSPLK           NaN                2                  -1.0     601.3151              1   623.3164    Q04637                      1     NaN         NaN               EAALPPVS(Phospho)PLK_2          2348        NaN
 6 | 4       NaN       y6^1             -1.0           NaN      0                    1                     6            y   EIF4G1                      0        NaN         1580.4800               EAALPPVS(UniMod:21)PLK                  48.0082                NaN               EAALPPVSPLK           NaN                2                  -1.0     601.3151              1   720.3692    Q04637                      1     NaN         NaN               EAALPPVS(Phospho)PLK_2          2349        NaN
 7 | ..      ...        ...              ...           ...    ...                  ...                   ...          ...      ...                    ...        ...               ...                                  ...                      ...                ...                       ...           ...              ...                   ...          ...            ...        ...       ...                    ...     ...         ...                                  ...           ...        ...
 8 | 73      NaN      b13^1             -1.0           NaN      0                    1                    13            b     LMO7                      0        NaN          890.7413  ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR                  67.4251                NaN  ATLSSTSGLDLMSESGEGEISPQR           NaN                2                  -1.0    1266.5622              1  1264.6090    Q8WWI1                      1     NaN         NaN  ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2         42472        NaN
 9 | 74      NaN      y11^1             -1.0           NaN      0                    1                    11            y     LMO7                      0        NaN         1830.4344  ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR                  67.4251                NaN  ATLSSTSGLDLMSESGEGEISPQR           NaN                2                  -1.0    1266.5622              1  1268.5155    Q8WWI1                      1     NaN         NaN  ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2         42473        NaN
10 | 75      NaN      y12^1             -1.0           NaN      0                    1                    12            y     LMO7                      0        NaN         2691.2388  ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR                  67.4251                NaN  ATLSSTSGLDLMSESGEGEISPQR           NaN                2                  -1.0    1266.5622              1  1355.5475    Q8WWI1                      1     NaN         NaN  ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2         42474        NaN
11 | 76      NaN      b14^1             -1.0           NaN      0                    1                    14            b     LMO7                      0        NaN          870.2799  ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR                  67.4251                NaN  ATLSSTSGLDLMSESGEGEISPQR           NaN                2                  -1.0    1266.5622              1  1393.6515    Q8WWI1                      1     NaN         NaN  ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2         42475        NaN
12 | 77      NaN      y13^1             -1.0           NaN      0                    1                    13            y     LMO7                      0        NaN          851.3514  ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR                  67.4251                NaN  ATLSSTSGLDLMSESGEGEISPQR           NaN                2                  -1.0    1266.5622              1  1486.5880    Q8WWI1                      1     NaN         NaN  ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2         42476        NaN
13 | 
14 | [78 rows x 29 columns]
15 | 


--------------------------------------------------------------------------------
/easypqp/unimoddb.py:
--------------------------------------------------------------------------------
  1 | import click
  2 | from tqdm import tqdm
  3 | 
  4 | # Unimod parsing
  5 | import xml.etree.cElementTree as ET
  6 | 
  7 | 
  8 | def site_validation(site_input):
  9 |     """
 10 |     Perform a check to ensure inputs are valid
 11 |     Arguments:
 12 |         site_input: (list) list of amino acid residues, or terminal notation, or wild card notation (*).
 13 |     Returns:
 14 |         Nothing is returned. An error is raised if the input contains a non-valid site.
 15 |     """
 16 |     acceptable_sites = ["A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V", 'U', 'O', '[', ']', 'n', 'c', '*']
 17 |     site_check = [site not in acceptable_sites for site in site_input]
 18 |     if any(site_check):
 19 |         raise click.ClickException( f"Incorrect site specificity input, site(s) {', '.join([i for (i, v) in zip(site_input, site_check) if v])} is not valid. Acceptable sites: {', '.join(acceptable_sites)}")
 20 | 
 21 | def site_specificity_transform(site_input):
 22 |     """
 23 |     Transform input site to return the site and position. Transforms terminal notation to site notation in unimod.xml and whether its any terminal site or a protein terminal site.
 24 |     Arguments:
 25 |         site_input: (list) list of amino acid residues, or terminal notation, or wild card notation (*).
 26 |     Returns:
 27 |         Returns a tuple of list of sites and list of positions
 28 |     """
 29 |     # Site and Position Mapping
 30 |     terminal_map = {'[':'N-term', ']':'C-term', 'n':'N-term', 'c':'C-term'}
 31 |     site_position_map = {'[':'Protein N-term', ']':'Protein C-term', 'n':'Any N-term', 'c':'Any C-term'}
 32 |     # Split sites
 33 |     site_input = [site for site in site_input]
 34 |     site_validation(site_input)
 35 |     sites=[]; positions=[]
 36 |     for site in site_input:
 37 |         if site in terminal_map.keys():
 38 |             sites.append(terminal_map[site])
 39 |             positions.append(site_position_map[site])
 40 |         elif site=="*":
 41 |             sites.append("*")
 42 |             positions.append("*")
 43 |         else:
 44 |             sites.append(site)
 45 |             positions.append("Anywhere")
 46 |     return sites, positions
 47 | 
 48 | def unimod_filter(unimod_file, out_file, accession_ids, site_specificity):
 49 |     """
 50 |     Filter an input unimod to restrict for specific modifications and site specificities
 51 |     Arguments:
 52 |         unimod_file: (str) path/filename of input unimod.xml file.
 53 |         out_file: (str) path/filename to write out new filtered unimod.xml file
 54 |         accession_ids: (list) list of unimod accession ids to restrict for. i.e. ['1','21','35]
 55 |         site_specificity: (list) list of site specificties to further restrict corresponding unimod for. i.e. ['n','STY','M], will restrict acetylation for any N-Term, phosphorylation for serine, threonine, and tyrosine, and oxidation for methionine.
 56 |     Returns:
 57 |         Nothing is returned. The restricted unimod database is written to the out_file.
 58 |     """
 59 |     # Register Namespace
 60 |     ET.register_namespace('umod', 'http://www.unimod.org/xmlns/schema/unimod_2')
 61 | 
 62 |     # Read in unimod XML database
 63 |     click.echo(f"INFO: Loading XML data from {unimod_file}")
 64 |     tree = ET.parse(unimod_file)
 65 |     root = tree.getroot()
 66 | 
 67 |     # Namespace
 68 |     ns = {'umod':'http://www.unimod.org/xmlns/schema/unimod_2'}
 69 | 
 70 |     # Generate root for new filtered unimod XML
 71 |     root_out = ET.Element(root.tag, root.attrib)
 72 | 
 73 |     # Append elements subelements
 74 |     umod_elements = root.findall("umod:elements", ns)
 75 |     root_out.append(umod_elements[0])
 76 | 
 77 |     # Append desired modifications
 78 |     mod_entries = root.findall('umod:modifications', ns)[0]
 79 |     mod_out = ET.Element(mod_entries.tag, mod_entries.attrib)
 80 |     i=0
 81 |     pbar = tqdm(accession_ids)
 82 |     pbar_desc = "INFO: Restricting"
 83 |     for record_id in pbar:
 84 |         add_unimod_entry = mod_entries.findall(f"./umod:mod/[@record_id='{record_id}']", ns)[0]
 85 |         if site_specificity is not None:
 86 |             site, position = site_specificity_transform(site_specificity[i])
 87 |             # Update progess bar description
 88 |             pbar_desc = f"INFO: Restricting..{add_unimod_entry.attrib.get('title')}({','.join(site)})"
 89 |             pbar.set_description(pbar_desc)
 90 |             if site != "*":
 91 |                 for unimod_site in add_unimod_entry.findall(f"./umod:specificity", ns):
 92 |                     if unimod_site.attrib['site'] in site and unimod_site.attrib['position'] in position:
 93 |                         # If current specificity element is a requested one, continue on
 94 |                         continue
 95 |                     else:
 96 |                         # Remove specificities that do not match requested specificities
 97 |                         add_unimod_entry.remove(unimod_site)
 98 |         else:
 99 |             # Update progess bar description
100 |             pbar_desc = f"INFO: Restricting..{add_unimod_entry.attrib.get('title')}"
101 |             pbar.set_description(pbar_desc)
102 |         # click.echo(f"INFO: Appending to filtered unimod XML - title={add_unimod_entry.attrib.get('title')} with record_id={add_unimod_entry.attrib.get('record_id')}")
103 |         mod_out.append( add_unimod_entry )
104 |         i+=1
105 |     root_out.append(mod_out)
106 | 
107 |     # Append amino acids
108 |     umod_amino_acids = root.findall("umod:amino_acids", ns)
109 |     root_out.append(umod_amino_acids[0])
110 | 
111 |     # Append mod bricks
112 |     umod_mod_bricks = root.findall("umod:mod_bricks", ns)
113 |     root_out.append(umod_mod_bricks[0])
114 | 
115 |     # Generate element hierarchy to write out to xml
116 |     tree_out = ET.ElementTree(root_out)
117 |     # For Pretty-Printing
118 |     ET.indent(tree_out, '  ')
119 |     # Write out filtered unimod xml database
120 |     click.echo(f"INFO: Writing out filtered unimod XML file to {out_file}")
121 |     tree_out.write(out_file, encoding="UTF-8", xml_declaration=True, method="xml")
122 | 
123 |     # Insert Top Comment
124 |     # TODO: This may not be the best way to add the top level comment in standard unimod.xml database files. Might be able to use lxml instead, requiring an additional dependency
125 |     with open(out_file, 'r+', encoding="utf-8") as file_handle:
126 |         lines = file_handle.readlines()     
127 |         lines.insert(1, "<!--Copyright (C) 2002-2006 Unimod; this information may be copied, distributed and/or-->\n<!--modified under certain conditions, but it comes WITHOUT ANY WARRANTY; see the-->\n<!--accompanying Design Science License for more details-->\n")  # you can use any index if you know the line index
128 |         file_handle.seek(0)                 
129 |         file_handle.writelines(lines)       


--------------------------------------------------------------------------------
/tests/test_insilico_library.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | 
  3 | import os
  4 | import subprocess
  5 | import shutil
  6 | import sys
  7 | 
  8 | import pandas as pd
  9 | import pytest
 10 | 
 11 | pd.options.display.expand_frame_repr = False
 12 | pd.options.display.precision = 4
 13 | pd.options.display.max_columns = None
 14 | 
 15 | DATA_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
 16 | 
 17 | # Check if insilico feature is available
 18 | try:
 19 |     from easypqp_rs import generate_insilico_library  # noqa: F401
 20 | 
 21 |     HAS_RUST_BACKEND = True
 22 | except ImportError:
 23 |     HAS_RUST_BACKEND = False
 24 | 
 25 | 
 26 | def _run_cmdline(cmdline):
 27 |     stdout = cmdline + "\n"
 28 |     try:
 29 |         stdout += str(
 30 |             subprocess.check_output(cmdline, shell=True, stderr=subprocess.STDOUT)
 31 |         )
 32 |     except subprocess.CalledProcessError as error:
 33 |         print(error, end="", file=sys.stderr)
 34 |         print(
 35 |             "Command output:",
 36 |             error.output.decode() if error.output else "No output",
 37 |             file=sys.stderr,
 38 |         )
 39 |         raise
 40 |     return stdout
 41 | 
 42 | 
 43 | def _run_insilico_library(regtest, temp_folder):
 44 |     os.chdir(temp_folder)
 45 |     config_path = os.path.join(DATA_FOLDER, "config.json")
 46 |     fasta_path = os.path.join(DATA_FOLDER, "Q99536.fasta")
 47 | 
 48 |     # Copy test files to temp directory
 49 |     shutil.copy(config_path, temp_folder)
 50 |     shutil.copy(fasta_path, temp_folder)
 51 | 
 52 |     # Update config to use local paths in temp folder
 53 |     import json
 54 | 
 55 |     with open("config.json", "r") as f:
 56 |         config = json.load(f)
 57 | 
 58 |     # Update paths to be relative to temp folder
 59 |     config["database"]["fasta"] = "Q99536.fasta"
 60 |     config["output_file"] = "easypqp_insilico_library.tsv"
 61 | 
 62 |     with open("config.json", "w") as f:
 63 |         json.dump(config, f, indent=2)
 64 | 
 65 |     cmdline = "easypqp insilico-library --config config.json"
 66 | 
 67 |     _run_cmdline(cmdline)
 68 | 
 69 |     # Read and verify the output TSV file
 70 |     output_file = "easypqp_insilico_library.tsv"
 71 |     assert os.path.exists(output_file), f"Output file {output_file} was not created"
 72 | 
 73 |     library_df = pd.read_csv(output_file, sep="\t")
 74 | 
 75 |     # Print basic statistics about the generated library
 76 |     print(f"Generated library contains {len(library_df)} transitions", file=regtest)
 77 | 
 78 |     # Use TransitionGroupId for unique precursors if available, otherwise compute from other columns
 79 |     if "TransitionGroupId" in library_df.columns:
 80 |         num_precursors = library_df["TransitionGroupId"].nunique()
 81 |     else:
 82 |         # Compute unique precursors from PrecursorMz, PrecursorCharge, and PeptideSequence
 83 |         num_precursors = library_df.groupby(
 84 |             ["PrecursorMz", "PrecursorCharge", "PeptideSequence"]
 85 |         ).ngroups
 86 | 
 87 |     print(f"Number of unique precursors: {num_precursors}", file=regtest)
 88 | 
 89 |     # Use appropriate column for peptide count
 90 |     peptide_col = (
 91 |         "ModifiedPeptideSequence"
 92 |         if "ModifiedPeptideSequence" in library_df.columns
 93 |         else "PeptideSequence"
 94 |     )
 95 |     print(
 96 |         f"Number of unique peptides: {library_df[peptide_col].nunique()}", file=regtest
 97 |     )
 98 | 
 99 |     # Print column names
100 |     print(f"\nColumns: {list(library_df.columns)}", file=regtest)
101 | 
102 |     # Round LibraryIntensity to make test more stable (DL predictions can vary slightly)
103 |     # Keep only deterministic columns for display
104 |     display_df = library_df.head().copy()
105 |     if "LibraryIntensity" in display_df.columns:
106 |         display_df["LibraryIntensity"] = display_df["LibraryIntensity"].round(0)
107 | 
108 |     # Sort by ProductMz to ensure consistent ordering across runs
109 |     display_df = display_df.sort_values("ProductMz").reset_index(drop=True)
110 | 
111 |     # Print a sample of the data (first few rows) - excluding non-deterministic columns
112 |     print("\nFirst 5 transitions (deterministic columns only):", file=regtest)
113 |     deterministic_cols = [
114 |         "PrecursorMz",
115 |         "ProductMz",
116 |         "PrecursorCharge",
117 |         "ProductCharge",
118 |         "PeptideSequence",
119 |         "ModifiedPeptideSequence",
120 |         "ProteinId",
121 |         "UniprotId",
122 |         "GeneName",
123 |         "FragmentType",
124 |         "FragmentSeriesNumber",
125 |         "Annotation",
126 |         "TransitionGroupId",
127 |         "TransitionId",
128 |         "Decoy",
129 |     ]
130 |     available_cols = [col for col in deterministic_cols if col in display_df.columns]
131 |     print(display_df[available_cols].to_string(), file=regtest)
132 | 
133 |     # Verify core columns exist (using actual column names from the output)
134 |     core_columns = [
135 |         "PrecursorMz",
136 |         "ProductMz",
137 |         "PrecursorCharge",
138 |         "ProductCharge",
139 |         "LibraryIntensity",
140 |         "PeptideSequence",
141 |         "ProteinId",
142 |         "FragmentType",
143 |         "FragmentSeriesNumber",
144 |         "Annotation",
145 |     ]
146 | 
147 |     missing_columns = [col for col in core_columns if col not in library_df.columns]
148 |     if missing_columns:
149 |         print(f"\nWarning: Missing core columns: {missing_columns}", file=regtest)
150 | 
151 |     # Print some statistics
152 |     print("\nStatistics:", file=regtest)
153 |     print(
154 |         f"  Precursor charge range: {library_df['PrecursorCharge'].min()}-{library_df['PrecursorCharge'].max()}",
155 |         file=regtest,
156 |     )
157 |     print(
158 |         f"  Fragment types: {sorted(library_df['FragmentType'].unique())}", file=regtest
159 |     )
160 | 
161 |     # Check for decoys using the Decoy column
162 |     if "Decoy" in library_df.columns:
163 |         print(f"  Contains decoys: {library_df['Decoy'].sum() > 0}", file=regtest)
164 |         print(f"  Number of targets: {(library_df['Decoy'] == 0).sum()}", file=regtest)
165 |         print(f"  Number of decoys: {(library_df['Decoy'] == 1).sum()}", file=regtest)
166 |     else:
167 |         print("  Decoy column not found in output", file=regtest)
168 | 
169 |     # Verify LibraryIntensity values are reasonable (not in regtest output due to variance)
170 |     if "LibraryIntensity" in library_df.columns:
171 |         intensity_stats = library_df["LibraryIntensity"].describe()
172 |         # Only assert, don't print to regtest to avoid flakiness
173 |         assert intensity_stats["min"] >= 0, "LibraryIntensity should be non-negative"
174 |         assert intensity_stats["max"] <= 10001, (
175 |             "LibraryIntensity should be normalized to ~10000"
176 |         )
177 | 
178 | 
179 | @pytest.mark.skipif(
180 |     not HAS_RUST_BACKEND,
181 |     reason="In-silico feature not installed (easypqp_rs package missing - reinstall easypqp)",
182 | )
183 | def test_insilico_library(tmpdir, regtest):
184 |     _run_insilico_library(regtest, tmpdir.strpath)
185 | 


--------------------------------------------------------------------------------
/easypqp/openswathdecoygenerator.py:
--------------------------------------------------------------------------------
  1 | import pyopenms as po
  2 | import click
  3 | from typing import Union, Tuple
  4 | 
  5 | from .targetedfileconverter import TargetedExperiment
  6 | 
  7 | def check_argument_values(arg_name: str, arg_value: any, expected_type: Tuple[Union[type, None], Union[Tuple, None]]) -> None:
  8 |     """
  9 |     Check if the given argument value is of the expected type and value range (if applicable).
 10 |     Raise a TypeError or ValueError if the value is invalid.
 11 |     """
 12 |     expected_type, expected_range = expected_type
 13 |     if isinstance(expected_type, list) and None in expected_type:
 14 |         pass
 15 |     elif not isinstance(arg_value, expected_type):
 16 |         raise TypeError(f"{arg_name} should be of type {expected_type.__name__} not type {arg_value.__class__}.")
 17 |     if expected_range is not None:
 18 |         # Handle numeric range
 19 |         if isinstance(expected_range, tuple) and len(expected_range) == 2:
 20 |             if not (expected_range[0] <= arg_value <= expected_range[1]):
 21 |                 raise ValueError(f"{arg_name} should be within the range {expected_range}, cannot except {arg_value}.")
 22 |         elif isinstance(expected_range, list) and arg_value not in expected_range:
 23 |             raise ValueError(f"{arg_name} should be one of {expected_range}, cannot except '{arg_value}'.")
 24 | 
 25 | class OpenSwathDecoyGenerator(TargetedExperiment):
 26 |     def __init__(self, 
 27 |                  infile: str, 
 28 |                  outfile: str="library.pqp", 
 29 |                  in_type: Union[str, None]=None, 
 30 |                  out_type: Union[str, None]=None, 
 31 |                  method: str="shuffle",
 32 |                  decoy_tag: str="DECOY_",
 33 |                  min_decoy_fraction: float=0.8,
 34 |                  aim_decoy_fraction: float=1.0,
 35 |                  shuffle_max_attempts: int=30,
 36 |                  shuffle_sequence_identity_threshold: float=0.5,
 37 |                  shift_precursor_mz_shift: float=0.0,
 38 |                  shift_product_mz_shift: float=20.0,
 39 |                  product_mz_threshold: float=0.025,
 40 |                  allowed_fragment_types: str="b,y",
 41 |                  allowed_fragment_charges: str="1,2,3,4",
 42 |                  enable_detection_specific_losses: bool=False,
 43 |                  enable_detection_unspecific_losses: bool=False,
 44 |                  switchKR: bool=True,
 45 |                  separate: bool=False) -> None:
 46 |         super().__init__(True)
 47 | 
 48 |         # Valdiate arguments
 49 |         check_argument_values("infile", infile, (str, None))
 50 |         check_argument_values("outfile", outfile, (str, None))
 51 |         # Handle types
 52 |         if in_type is None:
 53 |             in_type = self._get_file_type(infile)
 54 |         if out_type is None:
 55 |             out_type = self._get_file_type(outfile)
 56 |         check_argument_values("in_type", in_type, ([str, None], ['tsv', 'mrm', 'pqp', 'TraML']))
 57 |         check_argument_values("out_type", out_type, ([str, None], ['tsv', 'pqp', 'TraML']))
 58 |         check_argument_values("method", method, (str, ['shuffle', 'pseudo-reverse', 'reverse', 'shift']))
 59 |         check_argument_values("decoy_tag", decoy_tag, (str, None))
 60 |         check_argument_values("min_decoy_fraction", min_decoy_fraction, (float, (0, 1)))
 61 |         check_argument_values("aim_decoy_fraction", aim_decoy_fraction, (float, (0, 1)))
 62 |         check_argument_values("shuffle_max_attempts", shuffle_max_attempts, (int, None))
 63 |         check_argument_values("shuffle_sequence_identity_threshold", shuffle_sequence_identity_threshold, (float, (0, 1)))
 64 |         check_argument_values("shift_precursor_mz_shift", shift_precursor_mz_shift, (float, None))
 65 |         check_argument_values("shift_product_mz_shift", shift_product_mz_shift, (float, None))
 66 |         check_argument_values("product_mz_threshold", product_mz_threshold, (float, None))
 67 |         check_argument_values("allowed_fragment_types", allowed_fragment_types, (str, None)) # TODO: Add value check, to ensure valid fragment types
 68 |         check_argument_values("allowed_fragment_charges", allowed_fragment_charges, (str, None)) # TODO: Add value check to ensure ints are in string of charges
 69 |         check_argument_values("enable_detection_specific_losses", enable_detection_specific_losses, (bool, None))
 70 |         check_argument_values("enable_detection_unspecific_losses", enable_detection_unspecific_losses, (bool, None))
 71 |         check_argument_values("switchKR", switchKR, (bool, None))
 72 |         check_argument_values("separate", separate, (bool, None))
 73 | 
 74 |         # TODO: Move this up before argument validation for specific arg?
 75 |         # Transform string
 76 |         allowed_fragment_types = allowed_fragment_types.split(",")
 77 |         allowed_fragment_types = [s.encode('utf-8') for s in allowed_fragment_types]
 78 |         allowed_fragment_charges = allowed_fragment_charges.split(",")
 79 |         allowed_fragment_charges = [int(charge) for charge in allowed_fragment_charges]
 80 | 
 81 |         # Assign values to self
 82 |         for name, value in locals().items():
 83 |             if name != 'self':
 84 |                 # print(f"Info: Setting {name} = {value}")
 85 |                 setattr(self, name, value)
 86 | 
 87 |         # Load target experiment
 88 |         self.load_library(self.infile, self.in_type)
 89 | 
 90 |     def generate_decoys(self) -> None:
 91 |         # Initiate decoy experiment
 92 |         self.tr_decoy = po.TargetedExperiment()
 93 | 
 94 |         # Generate decoys
 95 |         decoys = po.MRMDecoy()
 96 |         decoys.generateDecoys(self.tr_exp, self.tr_decoy, self.method, self.aim_decoy_fraction, self.switchKR, self.decoy_tag, self.shuffle_max_attempts, self.shuffle_sequence_identity_threshold, self.shift_precursor_mz_shift, self.shift_product_mz_shift, self.product_mz_threshold, self.allowed_fragment_types, self.allowed_fragment_charges, self.enable_detection_specific_losses, self.enable_detection_unspecific_losses, -4)
 97 | 
 98 |         click.echo(f"Info: Number of target peptides: {len(self.tr_exp.getPeptides())}")
 99 |         click.echo(f"Info: Number of decoy peptides: {len(self.tr_decoy.getPeptides())}")
100 |         click.echo(f"Info: Number of target proteins: {len(self.tr_exp.getProteins())}")
101 |         click.echo(f"Info: Number of decoy proteins: {len(self.tr_decoy.getProteins())}")
102 |         
103 |         if len(self.tr_decoy.getPeptides()) / len(self.tr_exp.getPeptides()) < self.min_decoy_fraction or len(self.tr_decoy.getProteins()) / len(self.tr_exp.getProteins()) < self.min_decoy_fraction:
104 |             raise click.ClickException(f"The number of decoys for peptides or proteins is below the threshold of {(self.min_decoy_fraction * 100)}% of the number of targets.")
105 |         
106 |         if self.separate:
107 |             click.echo(f"Info: Writing only decoys to file: {self.outfile}")
108 |             self.tr_exp = self.tr_decoy
109 |         else:
110 |             click.echo(f"Info: Writing targets and decoys to file: {self.outfile}")
111 |             self.tr_exp += self.tr_decoy
112 | 
113 |         self.write_library(self.outfile, self.out_type)


--------------------------------------------------------------------------------
/easypqp/targetedfileconverter.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import os
  3 | import pandas as pd
  4 | import pyopenms as po
  5 | import ctypes
  6 | import click
  7 | from typing import Union
  8 | 
  9 | class TargetedExperiment:
 10 |     """
 11 |     Class to load and write a OpenMS TargetedExperiment
 12 |     """
 13 |     def __init__(self, legacy_traml_id: bool=True) -> None:
 14 |         self.legacy_traml_id = legacy_traml_id
 15 |         self.tr_exp = po.TargetedExperiment()
 16 |         self.file_types = po.FileTypes()
 17 | 
 18 |     def _validate_type(self, file: str, file_type: str) -> None:
 19 |         """Method to ensure filetype is a known OpenMS compatible transition list file type."""
 20 |         if self.file_types.nameToType(file_type) == po.FileType.UNKNOWN and file_type!='parquet':
 21 |             click.FileError(filename=file, hint=f"Error: Could not determine file type! {file}")
 22 |     
 23 |     def _get_file_type(self, infile) -> str:
 24 |         """Method to get file type extension from file."""
 25 |         return os.path.splitext(infile)[-1].split('.')[-1]
 26 | 
 27 |     def _get_file_type_id(self, file_type: str) -> int:
 28 |         """Method to get file type id as annotated in OpenMS filetype database."""
 29 |         return self.file_types.nameToType(file_type)-1
 30 |     
 31 |     def load_library(self, infile: str, in_type: Union[str, None]=None) -> None:
 32 |         """
 33 |         Method to load data from input transition list into an OpenMS TargetedExperiment Object
 34 | 
 35 |         Parameters:
 36 |             infile: (str) input transition list file to load
 37 |             in_type: (str|None) input file type. Default: None. Will be inferred from infile
 38 |         """
 39 |         if in_type is None:
 40 |             in_type = self._get_file_type(infile)
 41 |         self._validate_type(infile, in_type)
 42 |         # Convert infile str to ctype c char
 43 |         c_in_file = ctypes.create_string_buffer(infile.encode())
 44 |         if self._get_file_type_id(in_type) == po.FileType.TSV or self._get_file_type_id(in_type) == po.FileType.MRM:
 45 |             click.echo("Info: Reading TSV transition list data...")
 46 |             tsv_reader = po.TransitionTSVFile()
 47 |             tsv_reader.convertTSVToTargetedExperiment(c_in_file.value, self._get_file_type_id(in_type), self.tr_exp)
 48 |             tsv_reader.validateTargetedExperiment(self.tr_exp)
 49 |         
 50 |         elif self._get_file_type_id(self.in_type) == po.FileType.PQP:
 51 |             click.echo("Info: Reading PQP transition list data...")
 52 |             pqp_reader = po.TransitionPQPFile()
 53 |             pqp_reader.convertPQPToTargetedExperiment(c_in_file.value, self.tr_exp, self.legacy_traml_id)
 54 |             pqp_reader.validateTargetedExperiment(self.tr_exp)
 55 | 
 56 |         elif self._get_file_type_id(in_type) == po.FileType.TRAML:
 57 |             click.echo("Info: Reading TraML transition list data...")
 58 |             traml_reader = po.TraMLFile()
 59 |             traml_reader.load(c_in_file.value, self.tr_exp)
 60 |         
 61 |         click.echo(f"Info: Loaded {len(self.tr_exp.getCompounds())} Compounds, {len(self.tr_exp.getProteins()) } Proteins, {len(self.tr_exp.getPeptides())} Peptides, and {len(self.tr_exp.getTransitions())} Transitions")
 62 |     
 63 |     def write_library(self, outfile: str, out_type: Union[str, None]=None) -> None:
 64 |         """
 65 |         Method to write data from an OpenMS TargetedExperiment Object to disk
 66 | 
 67 |         Parameters:
 68 |             outfile: (str) output transition list file to load
 69 |             out_type: (str|None) output file type Default: None. Will be inferred from infile
 70 |         """
 71 |         if out_type is None:
 72 |             out_type = self._get_file_type(outfile)
 73 |         self._validate_type(outfile, out_type)
 74 |         # Convert outfile str to ctype c char
 75 |         c_out_file = ctypes.create_string_buffer(outfile.encode())
 76 |         if self._get_file_type_id(out_type) == po.FileType.TSV:
 77 |             click.echo("Info: Writing TSV transition list data to disk...")
 78 |             tsv_reader = po.TransitionTSVFile()
 79 |             self.tr_exp.getPeptides()
 80 |             tsv_reader.convertTargetedExperimentToTSV(c_out_file.value, self.tr_exp)
 81 | 
 82 |         elif self._get_file_type_id(out_type) == po.FileType.PQP:
 83 |             click.echo("Info: Writing PQP transition list data to disk...")
 84 |             pqp_reader = po.TransitionPQPFile()
 85 |             pqp_reader.convertTargetedExperimentToPQP(c_out_file.value, self.tr_exp)
 86 | 
 87 |         elif self._get_file_type_id(out_type) == po.FileType.TRAML:
 88 |             click.echo("Info: Writing TraML transition list data to disk...")
 89 |             traml_reader = po.TraMLFile()
 90 |             traml_reader.store(c_out_file.value, self.tr_exp)
 91 | 
 92 | 
 93 | 
 94 | class TargetedFileConverter(TargetedExperiment):
 95 |     '''
 96 |     TargetedFileConverter
 97 | 
 98 |     Converts different spectral libraries / transition files for targeted proteomics and metabolomics analysis.
 99 |   
100 |     Can convert multiple formats to and from TraML (standardized transition format). The following formats are supported:
101 |     
102 |         - @ref OpenMS::TraMLFile "TraML" 
103 |         - @ref OpenMS::TransitionTSVFile "OpenSWATH TSV transition lists" 
104 |         - @ref OpenMS::TransitionPQPFile "OpenSWATH PQP SQLite files" 
105 |         - SpectraST MRM transition lists 
106 |         - Skyline transition lists 
107 |         - Spectronaut transition lists 
108 |         - Parquet transition lists 
109 |     '''
110 | 
111 |     def __init__(self, infile: str, outfile: str="library.pqp", in_type: Union[str, None]=None, out_type: Union[str, None]=None, legacy_traml_id: bool=True) -> None:
112 |         super().__init__(legacy_traml_id)
113 |         self.infile = infile
114 |         self.outfile = outfile
115 | 
116 |         # Handle types
117 |         if in_type is None:
118 |             in_type = self._get_file_type(self.infile)
119 |         self.in_type = in_type
120 |         if out_type is None:
121 |             out_type = self._get_file_type(self.outfile)
122 |         self.out_type = out_type
123 | 
124 |     def convert(self) -> None:
125 |         """Method for converting between spectral library formats"""
126 |         # If input is parquet, need to write out a temporary tsv to consume for conversion
127 |         if self.in_type == 'parquet':
128 |             tr_list = pd.read_parquet(self.infile)
129 |             # Write out a temp tsv file for loading into a TargetedExperiment Object
130 |             temp_in_tsv = f"{os.path.splitext(self.infile)[0]}.tsv"
131 |             tr_list.to_csv(temp_in_tsv, sep="\t")
132 |             # Save org infile information
133 |             self.infile_parquet = self.infile
134 |             self.in_type_parquet = self.in_type
135 |             # Overwrite org infile information with TSV information
136 |             self.infile = temp_in_tsv
137 |             self.in_type = "tsv"    
138 | 
139 |         # Read Input into TargetedExperiment
140 |         self.load_library(self.infile, self.in_type)
141 | 
142 |         # Write TargetedExperiment to Output
143 |         self.write_library(self.outfile, self.out_type)
144 | 
145 |         # Clean Up
146 |         if 'in_type_parquet' in dir(self) and self.out_type!='tsv':
147 |             os.remove(self.infile)
148 |             self.infile = self.infile_parquet
149 | 
150 |         click.echo(f"Info: Finished converting {self.infile} to {self.outfile}")


--------------------------------------------------------------------------------
/easypqp/openswathassaygenerator.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import pandas as pd
  3 | import pyopenms as po
  4 | import ctypes
  5 | import click
  6 | from typing import Union, Tuple
  7 | 
  8 | from .targetedfileconverter import TargetedExperiment
  9 | 
 10 | def check_argument_values(arg_name: str, arg_value: any, expected_type: Tuple[Union[type, None], Union[Tuple, None]]) -> None:
 11 |     """
 12 |     Check if the given argument value is of the expected type and value range (if applicable).
 13 |     Raise a TypeError or ValueError if the value is invalid.
 14 |     """
 15 |     expected_type, expected_range = expected_type
 16 |     if isinstance(expected_type, list) and None in expected_type:
 17 |         pass
 18 |     elif not isinstance(arg_value, expected_type):
 19 |         raise TypeError(f"{arg_name} should be of type {expected_type.__name__} not type {arg_value.__class__}.")
 20 |     if expected_range is not None:
 21 |         # Handle numeric range
 22 |         if isinstance(expected_range, tuple) and len(expected_range) == 2:
 23 |             if not (expected_range[0] <= arg_value <= expected_range[1]):
 24 |                 raise ValueError(f"{arg_name} should be within the range {expected_range}, cannot except {arg_value}.")
 25 |         elif isinstance(expected_range, list) and arg_value not in expected_range:
 26 |             raise ValueError(f"{arg_name} should be one of {expected_range}, cannot except '{arg_value}'.")
 27 | 
 28 | def check_fragment_type(input_str: str):
 29 |     possible_fragment_types = ['b','y','a','x','c','z']
 30 |     if input_str not in possible_fragment_types:
 31 |         raise ValueError(f"{input_str} is not one of the possible fragment types {possible_fragment_types}")
 32 | 
 33 | def string_to_list(input_str: str, output_type: type):
 34 |     str_list = input_str.split(",")
 35 |     ret_list = []
 36 |     for s in str_list:
 37 |         if (output_type == bytes):
 38 |             check_fragment_type(s)
 39 |             convert = bytes(s,encoding='utf-8' )
 40 |         else:
 41 |             convert = int(s)
 42 |         ret_list.append(convert) 
 43 | 
 44 |     return ret_list
 45 | 
 46 | def read_swath_file(file: str):
 47 |     click.echo("Validate provided Swath windows file:")
 48 |     swath_window_loader = po.SwathWindowLoader()
 49 |     swath_prec_lower = []
 50 |     swath_prec_upper = []
 51 |     ret_val = []
 52 |     swath_window_loader.readSwathWindows(file, swath_prec_lower, swath_prec_upper)
 53 |     click.echo("Read Swath maps file with %s windows" % str(len(swath_prec_lower)))
 54 |     for idx, s in enumerate(swath_prec_lower):
 55 |         current_win = []
 56 |         current_win.append(s)
 57 |         current_win.append(swath_prec_upper[idx])
 58 |         click.echo("Read lower swath window %s and upper window %s" % (s, swath_prec_upper[idx]))
 59 |         ret_val.append(current_win)
 60 |     return ret_val
 61 | 
 62 | def read_unimod_file(unimod_file):
 63 |     ### TODO
 64 |     return None
 65 |     # mods_database = po.ModificationsDB(unimod_file)
 66 |     
 67 |     # click.echo("Unimod XML: %s modification types and residue specificities imported from file: %s" % (mods_database.getNumberOfModifications(), unimod_file))
 68 | 
 69 | class OpenSwathAssayGenerator(TargetedExperiment):
 70 |     def __init__(self, infile, in_type, outfile, out_type, min_transitions, max_transitions, allowed_fragment_type, allowed_fragment_charges, enable_detection_specific_losses, enable_detection_unspecific_losses, precursor_mz_threshold, precursor_lower_mz_limit,
 71 |                  precursor_upper_mz_limit, product_mz_threshold, product_lower_mz_limit, product_upper_mz_limit, swath_windows_file, unimod_file, enable_ipf, max_num_alternative_localizations, disable_identification_ms2_precursors, disable_identification_specific_losses, enable_identification_unspecific_losses, enable_swath_specifity) -> None:
 72 |         super().__init__(True)
 73 | 
 74 |         self.infile = infile 
 75 |         self.in_type = in_type
 76 | 
 77 |         self.outfile = outfile
 78 |         self.out_type = out_type 
 79 | 
 80 |         self.min_transitions = min_transitions
 81 |         self.max_transitions = max_transitions
 82 | 
 83 |         self.allowed_fragment_type = string_to_list(allowed_fragment_type, bytes) 
 84 | 
 85 |         self.allowed_fragment_charges = string_to_list(allowed_fragment_charges, int)  ### TODO: check valid fragment charges
 86 | 
 87 |         self.enable_detection_specific_losses = enable_detection_specific_losses
 88 |         self.enable_detection_unspecific_losses = enable_detection_unspecific_losses
 89 |         self.precursor_mz_threshold = precursor_mz_threshold
 90 |         self.precursor_lower_mz_limit = precursor_lower_mz_limit
 91 |         self.precursor_upper_mz_limit = precursor_upper_mz_limit
 92 |         self.product_mz_threshold = product_mz_threshold
 93 |         self.product_lower_mz_limit = product_lower_mz_limit
 94 |         self.product_upper_mz_limit = product_upper_mz_limit
 95 | 
 96 |         self.swathes = list(list()) if swath_windows_file == None else read_swath_file(swath_windows_file)
 97 |        
 98 |        
 99 |         ### TODO: read unimod file 
100 |         self.unimod_file = None if unimod_file == None else read_unimod_file(unimod_file)
101 |         print(self.unimod_file )
102 |         ### TODO: implement enable ipf
103 |         self.enable_ipf = enable_ipf
104 |         self.max_num_alternative_localizations = max_num_alternative_localizations
105 |         self.disable_identification_ms2_precursors = disable_identification_ms2_precursors
106 |         self.disable_identification_specific_losses = disable_identification_specific_losses
107 |         self.enable_identification_unspecific_losses = enable_identification_unspecific_losses
108 |         self.enable_swath_specifity = enable_swath_specifity
109 | 
110 | 
111 | 
112 |         ### check argument
113 |         # # Valdiate arguments
114 |         # check_argument_values("infile", infile, (str, None))
115 |         # check_argument_values("outfile", outfile, (str, None))
116 |         # # Handle types
117 |         # if in_type is None:
118 |         #     in_type = self._get_file_type(infile)
119 |         # if out_type is None:
120 |         #     out_type = self._get_file_type(outfile)
121 |         # check_argument_values("in_type", in_type, ([str, None], ['tsv', 'mrm', 'pqp', 'TraML']))
122 |         # check_argument_values("out_type", out_type, ([str, None], ['tsv', 'pqp', 'TraML']))
123 |         # check_argument_values("product_mz_threshold", product_mz_threshold, (float, None))
124 |         # check_argument_values("allowed_fragment_types", allowed_fragment_types, (str, None)) # TODO: Add value check, to ensure valid fragment types
125 |         # check_argument_values("allowed_fragment_charges", allowed_fragment_charges, (str, None)) # TODO: Add value check to ensure ints are in string of charges
126 |         # check_argument_values("enable_detection_specific_losses", enable_detection_specific_losses, (bool, None))
127 |         # check_argument_values("enable_detection_unspecific_losses", enable_detection_unspecific_losses, (bool, None))
128 | 
129 |     def read_input_file(self) -> None:
130 |         self.load_library(self.infile, self.in_type)
131 |         ### convert to tsv (panda df)
132 | 
133 |         ### get all transtion for specific precursors
134 | 
135 | 
136 | 
137 |     def annotate_transitions(self) -> None:
138 |         click.echo("Info: Annotating transitions")
139 |         assays = po.MRMAssay()
140 |         assays.reannotateTransitions(self.tr_exp, self.precursor_mz_threshold, self.product_mz_threshold, self.allowed_fragment_type, self.allowed_fragment_charges, self.enable_detection_specific_losses, self.enable_detection_unspecific_losses, -4) ### todo convert fragment type to bytes
141 | 
142 |         click.echo("Info: Annotating detecting transitions")
143 |         assays.restrictTransitions(self.tr_exp, self.product_lower_mz_limit, self.product_upper_mz_limit, self.swathes) 
144 |         assays.detectingTransitions(self.tr_exp, self.min_transitions, self.max_transitions)
145 | 
146 |     def write_output_file(self) -> None:
147 |         self.write_library(self.outfile, self.out_type)


--------------------------------------------------------------------------------
/tests/_regtest_outputs/test_openswathdecoy_generator.test_openswath_decoy_generator.out:
--------------------------------------------------------------------------------
 1 |    ID PROTEIN_ACCESSION  DECOY
 2 | 9   0      DECOY_Q04637      1
 3 | 8   1      DECOY_Q2M2I8      1
 4 | 6   2      DECOY_Q86WB0      1
 5 | 7   3      DECOY_Q8WWI1      1
 6 | 4   4      DECOY_Q92890      1
 7 | 3   5            Q04637      0
 8 | 2   6            Q2M2I8      0
 9 | 1   7            Q86WB0      0
10 | 0   8            Q8WWI1      0
11 | 5   9            Q92890      0
12 |    ID       UNMODIFIED_SEQUENCE                         MODIFIED_SEQUENCE  DECOY
13 | 9   0      AGQTQPNPGILPIQPALTPR           AGQTQPNPGILPIQPALT(UniMod:21)PR      0
14 | 8   1  ATLSSTSGLDLMSESGEGEISPQR       ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR      0
15 | 6   2               EAALPPVSPLK                    EAALPPVS(UniMod:21)PLK      0
16 | 5   3           GVEPSPSPIKPGDIK                GVEPSPS(UniMod:21)PIKPGDIK      0
17 | 4   4           IDGPKIPSPSPEVGK                IDGPKIPS(UniMod:21)PSPEVGK      1
18 | 2   5               LPSVPPLAAEK                    LPS(UniMod:21)VPPLAAEK      1
19 | 1   6        LPSSPVELGPTDGTGMSR  LPSS(UniMod:21)PVELGPTDGTGM(UniMod:35)SR      1
20 | 7   7      PTLAPQIPLIGPNPQTQGAR           PT(UniMod:21)LAPQIPLIGPNPQTQGAR      1
21 | 0   8  QPSIEGEGSESMLDLGSTSSLTAR       QPS(UniMod:21)IEGEGSESMLDLGSTSSLTAR      1
22 | 3   9        SMGTGDTPGLEVPSSPLR  SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR      0
23 |    ID                                        TRAML_ID GROUP_LABEL  PRECURSOR_MZ  CHARGE LIBRARY_INTENSITY  LIBRARY_RT  LIBRARY_DRIFT_TIME  DECOY
24 | 3   0                 AGQTQPNPGILPIQPALT(Phospho)PR_2                 1075.0619       2              None     70.6096                -1.0      0
25 | 4   1             ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2                 1266.5622       2              None     67.4251                -1.0      0
26 | 8   2           DECOY_AGQTQPNPGILPIQPALT(Phospho)PR_2                 1075.0619       2              None     70.6096                -1.0      1
27 | 9   3       DECOY_ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2                 1266.5622       2              None     67.4251                -1.0      1
28 | 5   4                    DECOY_EAALPPVS(Phospho)PLK_2                  601.3151       2              None     48.0082                -1.0      1
29 | 6   5                DECOY_GVEPSPS(Phospho)PIKPGDIK_2                  800.9028       2              None     32.9698                -1.0      1
30 | 7   6  DECOY_SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2                  948.9241       2              None     52.9017                -1.0      1
31 | 0   7                          EAALPPVS(Phospho)PLK_2                  601.3151       2              None     48.0082                -1.0      0
32 | 1   8                      GVEPSPS(Phospho)PIKPGDIK_2                  800.9028       2              None     32.9698                -1.0      0
33 | 2   9        SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2                  948.9241       2              None     52.9017                -1.0      0
34 |     ID     TRAML_ID  PRODUCT_MZ  CHARGE TYPE ANNOTATION  ORDINAL  DETECTING  IDENTIFYING  QUANTIFYING  LIBRARY_INTENSITY  DECOY
35 | 0    0         2346    409.2146       2    y       y7^2        7          1            0            1          2965.7283      0
36 | 1    1         2347    465.7566       2    y       y8^2        8          1            0            1           132.8395      0
37 | 2    2         2349    720.3692       1    y       y6^1        6          1            0            1          1580.4800      0
38 | 3    3         2350    817.4219       1    y       y7^1        7          1            0            1         10000.0000      0
39 | 4    4         2352    930.5060       1    y       y8^1        8          1            0            1           417.7673      0
40 | 5    5         2353   1001.5431       1    y       y9^1        9          1            0            1           278.9014      0
41 | 6    6        12006    375.2238       1    y       y3^1        3          1            0            1          1621.3933      0
42 | 7    7        12009    529.2980       1    y       y5^1        5          1            0            1         10000.0000      0
43 | 8    8        12010    566.2941       2    y      y10^2       10          1            0            1          3326.0842      0
44 | 9    9        12012    657.3930       1    y       y6^1        6          1            0            1          1924.5614      0
45 | 10  10        12013    658.3365       2    y      y12^2       12          1            0            1          4464.7860      0
46 | 11  11        12015    867.5298       1    y       y8^1        8          1            0            1          5222.4050      0
47 | 12  12        21489    385.2558       1    y       y3^1        3          1            0            1          5179.5244      0
48 | 13  13        21490    393.1438       1    b       b4^1        4          1            0            1          2606.7244      0
49 | 14  14        21491    565.1923       1    b       b6^1        6          1            0            1          3256.2622      0
50 | 15  15        21493    666.2399       1    b       b7^1        7          1            0            1          3735.2622      0
51 | 16  16        21494    736.3389       1    y       y6^1        6          1            0            1         10000.0000      0
52 | 17  17        21496    835.4073       1    y       y7^1        7          1            0            1          3901.4023      0
53 | 18  18        31640    486.2307       1    b       b5^1        5          1            0            1         10000.0000      0
54 | 19  19        31641    697.3264       1    b       b7^1        7          1            0            1          7081.5693      0
55 | 20  20        31643    734.3597       1    y       y6^1        6          1            0            1          8579.1080      0
56 | 21  21        31644    832.4502       2    y      y15^2       15          1            0            1          2923.7356      0
57 | 22  22        31646    964.4847       1    b      b10^1       10          1            0            1          3234.0083      0
58 | 23  23        31647   1072.5551       1    y       y9^1        9          1            0            1          3853.8560      0
59 | 24  24        42446    400.2303       1    y       y3^1        3          1            0            1          7762.3594      0
60 | 25  25        42450    567.2287       1    y       y4^1        4          1            0            1          8796.0370      0
61 | 26  26        42457    818.4254       1    b       b9^1        9          1            0            1          5399.1875      0
62 | 27  27        42458    866.3768       1    y       y7^1        7          1            0            1          6659.5240      0
63 | 28  28        42459    933.4524       1    b      b10^1       10          1            0            1          6236.0680      0
64 | 29  29        42468   1139.4729       1    y      y10^1       10          1            0            1          3636.2630      0
65 | 30  30  DECOY_31640    560.2480       1    b       b5^1        5          1            0            1         10000.0000      1
66 | 31  31  DECOY_31641    801.3906       1    b       b7^1        7          1            0            1          7081.5693      1
67 | 32  32  DECOY_31643    660.3424       1    y       y6^1        6          1            0            1          8579.1080      1
68 | 33  33  DECOY_31644    795.4415       2    y      y15^2       15          1            0            1          2923.7356      1
69 | 34  34  DECOY_31646   1124.6115       1    b      b10^1       10          1            0            1          3234.0083      1
70 | 35  35  DECOY_31647    968.4908       1    y       y9^1        9          1            0            1          3853.8560      1
71 | 36  36  DECOY_42446    347.2037       1    y       y3^1        3          1            0            1          7762.3594      1
72 | 37  37  DECOY_42450    460.2878       1    y       y4^1        4          1            0            1          8796.0370      1
73 | 38  38  DECOY_42457    965.3612       1    b       b9^1        9          1            0            1          5399.1875      1
74 | 39  39  DECOY_42458    735.3995       1    y       y7^1        7          1            0            1          6659.5240      1
75 | 40  40  DECOY_42459   1094.4038       1    b      b10^1       10          1            0            1          6236.0680      1
76 | 41  41  DECOY_42468    992.5371       1    y      y10^1       10          1            0            1          3636.2630      1
77 | 42  42   DECOY_2346    363.2132       2    y       y7^2        7          1            0            1          2965.7283      1
78 | 43  43   DECOY_2347    412.7475       2    y       y8^2        8          1            0            1           132.8395      1
79 | 44  44   DECOY_2349    628.3665       1    y       y6^1        6          1            0            1          1580.4800      1
80 | 45  45   DECOY_2350    725.4192       1    y       y7^1        7          1            0            1         10000.0000      1
81 | 46  46   DECOY_2352    824.4876       1    y       y8^1        8          1            0            1           417.7673      1
82 | 47  47   DECOY_2353    991.4860       1    y       y9^1        9          1            0            1           278.9014      1
83 | 48  48  DECOY_12006    303.2027       1    y       y3^1        3          1            0            1          1621.3933      1
84 | 49  49  DECOY_12009    529.2980       1    y       y5^1        5          1            0            1         10000.0000      1
85 | 50  50  DECOY_12010    545.7627       2    y      y10^2       10          1            0            1          3326.0842      1
86 | 51  51  DECOY_12012    616.3301       1    y       y6^1        6          1            0            1          1924.5614      1
87 | 52  52  DECOY_12013    658.3365       2    y      y12^2       12          1            0            1          4464.7860      1
88 | 53  53  DECOY_12015    880.3812       1    y       y8^1        8          1            0            1          5222.4050      1
89 | 54  54  DECOY_21489    409.1864       1    y       y3^1        3          1            0            1          5179.5244      1
90 | 55  55  DECOY_21490    465.1745       1    b       b4^1        4          1            0            1          2606.7244      1
91 | 56  56  DECOY_21491    661.2957       1    b       b6^1        6          1            0            1          3256.2622      1
92 | 57  57  DECOY_21493    790.3383       1    b       b7^1        7          1            0            1          3735.2622      1
93 | 58  58  DECOY_21494    624.2770       1    y       y6^1        6          1            0            1         10000.0000      1
94 | 59  59  DECOY_21496    739.3039       1    y       y7^1        7          1            0            1          3901.4023      1
95 | 


--------------------------------------------------------------------------------
/tests/data/test_transition_list.tsv:
--------------------------------------------------------------------------------
 1 | PrecursorMz	ProductMz	PrecursorCharge	ProductCharge	LibraryIntensity	NormalizedRetentionTime	PeptideSequence	ModifiedPeptideSequence	PeptideGroupLabel	LabelType	CompoundName	SumFormula	SMILES	Adducts	ProteinId	UniprotId	GeneName	FragmentType	FragmentSeriesNumber	Annotation	CollisionEnergy	PrecursorIonMobility	TransitionGroupId	TransitionId	Decoy	DetectingTransition	IdentifyingTransition	QuantifyingTransition	Peptidoforms
 2 | 601.31505	260.196869	2	1	81.93454	48.008226163	EAALPPVSPLK	EAALPPVS(UniMod:21)PLK							Q04637		EIF4G1	y	2	y2^1	-1	-1	EAALPPVS(Phospho)PLK_2	2345	0	1	0	1	
 3 | 601.31505	409.214607	2	2	2965.7283	48.008226163	EAALPPVSPLK	EAALPPVS(UniMod:21)PLK							Q04637		EIF4G1	y	7	y7^2	-1	-1	EAALPPVS(Phospho)PLK_2	2346	0	1	0	1	
 4 | 601.31505	465.756639	2	2	132.83946	48.008226163	EAALPPVSPLK	EAALPPVS(UniMod:21)PLK							Q04637		EIF4G1	y	8	y8^2	-1	-1	EAALPPVS(Phospho)PLK_2	2347	0	1	0	1	
 5 | 601.31505	623.316408	2	1	101.36074	48.008226163	EAALPPVSPLK	EAALPPVS(UniMod:21)PLK							Q04637		EIF4G1	y	5	y5^1	-1	-1	EAALPPVS(Phospho)PLK_2	2348	0	1	0	1	
 6 | 601.31505	720.369173	2	1	1580.48	48.008226163	EAALPPVSPLK	EAALPPVS(UniMod:21)PLK							Q04637		EIF4G1	y	6	y6^1	-1	-1	EAALPPVS(Phospho)PLK_2	2349	0	1	0	1	
 7 | 601.31505	817.421937	2	1	10000	48.008226163	EAALPPVSPLK	EAALPPVS(UniMod:21)PLK							Q04637		EIF4G1	y	7	y7^1	-1	-1	EAALPPVS(Phospho)PLK_2	2350	0	1	0	1	
 8 | 601.31505	845.380467	2	1	22.41705	48.008226163	EAALPPVSPLK	EAALPPVS(UniMod:21)PLK							Q04637		EIF4G1	b	8	b8^1	-1	-1	EAALPPVS(Phospho)PLK_2	2351	0	1	0	1	
 9 | 601.31505	930.506001	2	1	417.7673	48.008226163	EAALPPVSPLK	EAALPPVS(UniMod:21)PLK							Q04637		EIF4G1	y	8	y8^1	-1	-1	EAALPPVS(Phospho)PLK_2	2352	0	1	0	1	
10 | 601.31505	1001.543115	2	1	278.9014	48.008226163	EAALPPVSPLK	EAALPPVS(UniMod:21)PLK							Q04637		EIF4G1	y	9	y9^1	-1	-1	EAALPPVS(Phospho)PLK_2	2353	0	1	0	1	
11 | 800.902751	286.139749	2	1	1231.8224	32.969786991	GVEPSPSPIKPGDIK	GVEPSPS(UniMod:21)PIKPGDIK							Q92890		UFD1	b	3	b3^1	-1	-1	GVEPSPS(Phospho)PIKPGDIK_2	12005	0	1	0	1	
12 | 800.902751	375.223813	2	1	1621.3933	32.969786991	GVEPSPSPIKPGDIK	GVEPSPS(UniMod:21)PIKPGDIK							Q92890		UFD1	y	3	y3^1	-1	-1	GVEPSPS(Phospho)PIKPGDIK_2	12006	0	1	0	1	
13 | 800.902751	434.268555	2	2	391.9799	32.969786991	GVEPSPSPIKPGDIK	GVEPSPS(UniMod:21)PIKPGDIK							Q92890		UFD1	y	8	y8^2	-1	-1	GVEPSPS(Phospho)PIKPGDIK_2	12007	0	1	0	1	
14 | 800.902751	470.224542	2	1	612.1298	32.969786991	GVEPSPSPIKPGDIK	GVEPSPS(UniMod:21)PIKPGDIK							Q92890		UFD1	b	5	b5^1	-1	-1	GVEPSPS(Phospho)PIKPGDIK_2	12008	0	1	0	1	
15 | 800.902751	529.298042	2	1	10000	32.969786991	GVEPSPSPIKPGDIK	GVEPSPS(UniMod:21)PIKPGDIK							Q92890		UFD1	y	5	y5^1	-1	-1	GVEPSPS(Phospho)PIKPGDIK_2	12009	0	1	0	1	
16 | 800.902751	566.294118	2	2	3326.0842	32.969786991	GVEPSPSPIKPGDIK	GVEPSPS(UniMod:21)PIKPGDIK							Q92890		UFD1	y	10	y10^2	-1	-1	GVEPSPS(Phospho)PIKPGDIK_2	12010	0	1	0	1	
17 | 800.902751	567.277307	2	1	1098.7223	32.969786991	GVEPSPSPIKPGDIK	GVEPSPS(UniMod:21)PIKPGDIK							Q92890		UFD1	b	6	b6^1	-1	-1	GVEPSPS(Phospho)PIKPGDIK_2	12011	0	1	0	1	
18 | 800.902751	657.393005	2	1	1924.5614	32.969786991	GVEPSPSPIKPGDIK	GVEPSPS(UniMod:21)PIKPGDIK							Q92890		UFD1	y	6	y6^1	-1	-1	GVEPSPS(Phospho)PIKPGDIK_2	12012	0	1	0	1	
19 | 800.902751	658.336514	2	2	4464.786	32.969786991	GVEPSPSPIKPGDIK	GVEPSPS(UniMod:21)PIKPGDIK							Q92890		UFD1	y	12	y12^2	-1	-1	GVEPSPS(Phospho)PIKPGDIK_2	12013	0	1	0	1	
20 | 800.902751	722.857811	2	2	1568.4036	32.969786991	GVEPSPSPIKPGDIK	GVEPSPS(UniMod:21)PIKPGDIK							Q92890		UFD1	y	13	y13^2	-1	-1	GVEPSPS(Phospho)PIKPGDIK_2	12014	0	1	0	1	
21 | 800.902751	867.529834	2	1	5222.405	32.969786991	GVEPSPSPIKPGDIK	GVEPSPS(UniMod:21)PIKPGDIK							Q92890		UFD1	y	8	y8^1	-1	-1	GVEPSPS(Phospho)PIKPGDIK_2	12015	0	1	0	1	
22 | 800.902751	1072.507459	2	1	1544.1755	32.969786991	GVEPSPSPIKPGDIK	GVEPSPS(UniMod:21)PIKPGDIK							Q92890		UFD1	b	10	b10^1	-1	-1	GVEPSPS(Phospho)PIKPGDIK_2	12016	0	1	0	1	
23 | 800.902751	1131.580959	2	1	1269.2039	32.969786991	GVEPSPSPIKPGDIK	GVEPSPS(UniMod:21)PIKPGDIK							Q92890		UFD1	y	10	y10^1	-1	-1	GVEPSPS(Phospho)PIKPGDIK_2	12017	0	1	0	1	
24 | 800.902751	1315.665752	2	1	800.73376	32.969786991	GVEPSPSPIKPGDIK	GVEPSPS(UniMod:21)PIKPGDIK							Q92890		UFD1	y	12	y12^1	-1	-1	GVEPSPS(Phospho)PIKPGDIK_2	12018	0	1	0	1	
25 | 948.924087	385.255781	2	1	5179.5244	52.901659389	SMGTGDTPGLEVPSSPLR	SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR							Q86WB0		ZC3HC1	y	3	y3^1	-1	-1	SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2	21489	0	1	0	1	
26 | 948.924087	393.143849	2	1	2606.7244	52.901659389	SMGTGDTPGLEVPSSPLR	SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR							Q86WB0		ZC3HC1	b	4	b4^1	-1	-1	SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2	21490	0	1	0	1	
27 | 948.924087	565.192257	2	1	3256.2622	52.901659389	SMGTGDTPGLEVPSSPLR	SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR							Q86WB0		ZC3HC1	b	6	b6^1	-1	-1	SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2	21491	0	1	0	1	
28 | 948.924087	639.286171	2	1	2192.347	52.901659389	SMGTGDTPGLEVPSSPLR	SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR							Q86WB0		ZC3HC1	y	5	y5^1	-1	-1	SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2	21492	0	1	0	1	
29 | 948.924087	666.239936	2	1	3735.2622	52.901659389	SMGTGDTPGLEVPSSPLR	SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR							Q86WB0		ZC3HC1	b	7	b7^1	-1	-1	SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2	21493	0	1	0	1	
30 | 948.924087	736.338936	2	1	10000	52.901659389	SMGTGDTPGLEVPSSPLR	SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR							Q86WB0		ZC3HC1	y	6	y6^1	-1	-1	SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2	21494	0	1	0	1	
31 | 948.924087	831.890372	2	2	699.7533	52.901659389	SMGTGDTPGLEVPSSPLR	SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR							Q86WB0		ZC3HC1	y	16	y16^2	-1	-1	SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2	21495	0	1	0	1	
32 | 948.924087	835.40735	2	1	3901.4023	52.901659389	SMGTGDTPGLEVPSSPLR	SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR							Q86WB0		ZC3HC1	y	7	y7^1	-1	-1	SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2	21496	0	1	0	1	
33 | 948.924087	1134.555473	2	1	428.92825	52.901659389	SMGTGDTPGLEVPSSPLR	SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR							Q86WB0		ZC3HC1	y	10	y10^1	-1	-1	SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2	21497	0	1	0	1	
34 | 948.924087	1231.608237	2	1	563.8044	52.901659389	SMGTGDTPGLEVPSSPLR	SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR							Q86WB0		ZC3HC1	y	11	y11^1	-1	-1	SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2	21498	0	1	0	1	
35 | 1075.061909	257.124433	2	1	2472.791	70.609603879	AGQTQPNPGILPIQPALTPR	AGQTQPNPGILPIQPALT(UniMod:21)PR							Q2M2I8		AAK1	b	3	b3^1	-1	-1	AGQTQPNPGILPIQPALT(Phospho)PR_2	31639	0	1	0	1	
36 | 1075.061909	486.23069	2	1	10000	70.609603879	AGQTQPNPGILPIQPALTPR	AGQTQPNPGILPIQPALT(UniMod:21)PR							Q2M2I8		AAK1	b	5	b5^1	-1	-1	AGQTQPNPGILPIQPALT(Phospho)PR_2	31640	0	1	0	1	
37 | 1075.061909	697.326383	2	1	7081.5693	70.609603879	AGQTQPNPGILPIQPALTPR	AGQTQPNPGILPIQPALT(UniMod:21)PR							Q2M2I8		AAK1	b	7	b7^1	-1	-1	AGQTQPNPGILPIQPALT(Phospho)PR_2	31641	0	1	0	1	
38 | 1075.061909	726.902355	2	2	729.74634	70.609603879	AGQTQPNPGILPIQPALTPR	AGQTQPNPGILPIQPALT(UniMod:21)PR							Q2M2I8		AAK1	y	13	y13^2	-1	-1	AGQTQPNPGILPIQPALT(Phospho)PR_2	31642	0	1	0	1	
39 | 1075.061909	734.359671	2	1	8579.108	70.609603879	AGQTQPNPGILPIQPALTPR	AGQTQPNPGILPIQPALT(UniMod:21)PR							Q2M2I8		AAK1	y	6	y6^1	-1	-1	AGQTQPNPGILPIQPALT(Phospho)PR_2	31643	0	1	0	1	
40 | 1075.061909	832.450202	2	2	2923.7356	70.609603879	AGQTQPNPGILPIQPALTPR	AGQTQPNPGILPIQPALT(UniMod:21)PR							Q2M2I8		AAK1	y	15	y15^2	-1	-1	AGQTQPNPGILPIQPALT(Phospho)PR_2	31644	0	1	0	1	
41 | 1075.061909	851.400611	2	1	2737.5051	70.609603879	AGQTQPNPGILPIQPALTPR	AGQTQPNPGILPIQPALT(UniMod:21)PR							Q2M2I8		AAK1	b	9	b9^1	-1	-1	AGQTQPNPGILPIQPALT(Phospho)PR_2	31645	0	1	0	1	
42 | 1075.061909	964.484676	2	1	3234.0083	70.609603879	AGQTQPNPGILPIQPALTPR	AGQTQPNPGILPIQPALT(UniMod:21)PR							Q2M2I8		AAK1	b	10	b10^1	-1	-1	AGQTQPNPGILPIQPALT(Phospho)PR_2	31646	0	1	0	1	
43 | 1075.061909	1072.555077	2	1	3853.856	70.609603879	AGQTQPNPGILPIQPALTPR	AGQTQPNPGILPIQPALT(UniMod:21)PR							Q2M2I8		AAK1	y	9	y9^1	-1	-1	AGQTQPNPGILPIQPALT(Phospho)PR_2	31647	0	1	0	1	
44 | 1075.061909	1077.56874	2	1	2261.7683	70.609603879	AGQTQPNPGILPIQPALTPR	AGQTQPNPGILPIQPALT(UniMod:21)PR							Q2M2I8		AAK1	b	11	b11^1	-1	-1	AGQTQPNPGILPIQPALT(Phospho)PR_2	31648	0	1	0	1	
45 | 1075.061909	1185.639142	2	1	1024.3945	70.609603879	AGQTQPNPGILPIQPALTPR	AGQTQPNPGILPIQPALT(UniMod:21)PR							Q2M2I8		AAK1	y	10	y10^1	-1	-1	AGQTQPNPGILPIQPALT(Phospho)PR_2	31649	0	1	0	1	
46 | 1266.562206	286.176134	2	1	567.14233	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	b	3	b3^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42443	0	1	0	1	
47 | 1266.562206	303.177531	2	1	1103.8254	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	y	2	y2^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42444	0	1	0	1	
48 | 1266.562206	373.208163	2	1	1820.2323	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	b	4	b4^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42445	0	1	0	1	
49 | 1266.562206	400.230295	2	1	7762.3594	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	y	3	y3^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42446	0	1	0	1	
50 | 1266.562206	405.181296	2	2	114.97278	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	y	6	y6^2	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42447	0	1	0	1	
51 | 1266.562206	460.240193	2	1	2922.4272	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	b	5	b5^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42448	0	1	0	1	
52 | 1266.562206	561.287872	2	1	2393.0298	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	b	6	b6^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42449	0	1	0	1	
53 | 1266.562206	567.228656	2	1	8796.037	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	y	4	y4^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42450	0	1	0	1	
54 | 1266.562206	570.240071	2	2	502.2687	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	y	10	y10^2	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42451	0	1	0	1	
55 | 1266.562206	589.2921	2	2	364.18185	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	b	12	b12^2	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42452	0	1	0	1	
56 | 1266.562206	648.319901	2	1	3448.3528	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	b	7	b7^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42453	0	1	0	1	
57 | 1266.562206	680.31272	2	1	3568.9507	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	y	5	y5^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42454	0	1	0	1	
58 | 1266.562206	705.341365	2	1	1443.644	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	b	8	b8^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42455	0	1	0	1	
59 | 1266.562206	769.356158	2	2	681.4857	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	b	16	b16^2	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42456	0	1	0	1	
60 | 1266.562206	818.425429	2	1	5399.1875	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	b	9	b9^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42457	0	1	0	1	
61 | 1266.562206	866.376779	2	1	6659.524	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	y	7	y7^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42458	0	1	0	1	
62 | 1266.562206	933.452374	2	1	6236.068	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	b	10	b10^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42459	0	1	0	1	
63 | 1266.562206	986.421909	2	2	985.9538	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	y	18	y18^2	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42460	0	1	0	1	
64 | 1266.562206	995.419373	2	1	1262.0918	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	y	8	y8^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42461	0	1	0	1	
65 | 1266.562206	1036.945748	2	2	1161.6108	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	y	19	y19^2	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42462	0	1	0	1	
66 | 1266.562206	1046.536438	2	1	3329.5352	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	b	11	b11^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42463	0	1	0	1	
67 | 1266.562206	1052.440837	2	1	2903.2126	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	y	9	y9^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42464	0	1	0	1	
68 | 1266.562206	1080.461763	2	2	1668.9382	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	y	20	y20^2	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42465	0	1	0	1	
69 | 1266.562206	1115.477079	2	2	786.84143	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	b	22	b22^2	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42466	0	1	0	1	
70 | 1266.562206	1123.977777	2	2	2038.5486	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	y	21	y21^2	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42467	0	1	0	1	
71 | 1266.562206	1139.472866	2	1	3636.263	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	y	10	y10^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42468	0	1	0	1	
72 | 1266.562206	1177.576923	2	1	2879.441	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	b	12	b12^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42469	0	1	0	1	
73 | 1266.562206	1180.51981	2	2	829.09406	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	y	22	y22^2	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42470	0	1	0	1	
74 | 1266.562206	1231.043649	2	2	1074.6931	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	y	23	y23^2	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42471	0	1	0	1	
75 | 1266.562206	1264.608952	2	1	890.7413	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	b	13	b13^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42472	0	1	0	1	
76 | 1266.562206	1268.51546	2	1	1830.4344	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	y	11	y11^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42473	0	1	0	1	
77 | 1266.562206	1355.54749	2	1	2691.2388	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	y	12	y12^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42474	0	1	0	1	
78 | 1266.562206	1393.651546	2	1	870.2799	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	b	14	b14^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42475	0	1	0	1	
79 | 1266.562206	1486.587975	2	1	851.35144	67.42507992	ATLSSTSGLDLMSESGEGEISPQR	ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR							Q8WWI1		LMO7	y	13	y13^1	-1	-1	ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2	42476	0	1	0	1	
80 | 


--------------------------------------------------------------------------------
/easypqp/library.py:
--------------------------------------------------------------------------------
  1 | import warnings
  2 | from .util import timestamped_echo
  3 | 
  4 | try:
  5 |     import matplotlib
  6 | 
  7 |     matplotlib.use("Agg")
  8 |     import matplotlib.pyplot as plt
  9 | except ImportError:
 10 |     plt = None
 11 | 
 12 | import click
 13 | import os
 14 | import pathlib
 15 | import posixpath, ntpath
 16 | import numpy as np
 17 | import pandas as pd
 18 | 
 19 | # alignment
 20 | from sklearn import preprocessing
 21 | import sklearn.isotonic
 22 | import sklearn.linear_model
 23 | import statsmodels.api as sm
 24 | from scipy.interpolate import interp1d
 25 | 
 26 | # error rate estimation
 27 | HAS_PYPROPHET = False
 28 | try:
 29 |     from pyprophet.stats import pemp, qvalue, pi0est
 30 |     from pyprophet.ipf import compute_model_fdr
 31 | 
 32 |     HAS_PYPROPHET = True
 33 | except (ModuleNotFoundError, ImportError):
 34 |     # pyprophet (or one of its runtime deps) is not available or incompatible
 35 |     # Export placeholders so importing this module does not raise — commands
 36 |     # that require pyprophet should check HAS_PYPROPHET at runtime.
 37 |     pemp = None
 38 |     qvalue = None
 39 |     pi0est = None
 40 |     compute_model_fdr = None
 41 | 
 42 | # plotting
 43 | from scipy.stats import gaussian_kde
 44 | from numpy import linspace, concatenate
 45 | from seaborn import lmplot
 46 | 
 47 | 
 48 | def plot(path, title, targets, decoys):
 49 |     plt.figure(figsize=(10, 5))
 50 |     plt.subplots_adjust(hspace=0.5)
 51 | 
 52 |     plt.subplot(121)
 53 |     plt.title("group score distributions")
 54 |     plt.xlabel("score")
 55 |     plt.ylabel("# of groups")
 56 |     plt.hist(
 57 |         [targets, decoys],
 58 |         20,
 59 |         color=["g", "r"],
 60 |         label=["target", "decoy"],
 61 |         histtype="bar",
 62 |     )
 63 |     plt.legend(loc=2)
 64 | 
 65 |     plt.subplot(122)
 66 |     tdensity = gaussian_kde(targets)
 67 |     tdensity.covariance_factor = lambda: 0.25
 68 |     tdensity._compute_covariance()
 69 |     ddensity = gaussian_kde(decoys)
 70 |     ddensity.covariance_factor = lambda: 0.25
 71 |     ddensity._compute_covariance()
 72 |     xs = linspace(
 73 |         min(concatenate((targets, decoys))), max(concatenate((targets, decoys))), 200
 74 |     )
 75 |     plt.title("group score densities")
 76 |     plt.xlabel("score")
 77 |     plt.ylabel("density")
 78 |     plt.plot(xs, tdensity(xs), color="g", label="target")
 79 |     plt.plot(xs, ddensity(xs), color="r", label="decoy")
 80 |     plt.legend(loc=2)
 81 | 
 82 |     plt.suptitle(title)
 83 |     plt.savefig(path)
 84 |     plt.close()
 85 | 
 86 | 
 87 | def peptide_fdr(psms, peptide_fdr_threshold, pi0_lambda, plot_path, nofdr):
 88 |     pi0_method = "bootstrap"
 89 |     pi0_smooth_df = 3
 90 |     pi0_smooth_log_pi0 = False
 91 |     pfdr = False
 92 | 
 93 |     if nofdr:
 94 |         peptides = (
 95 |             psms.groupby(["modified_peptide", "decoy", "q_value"])["pp"]
 96 |             .max()
 97 |             .reset_index()
 98 |         )
 99 |         targets = peptides[~peptides["decoy"]].copy()
100 |         decoys = peptides[peptides["decoy"]].copy()
101 | 
102 |     else:
103 |         peptides = psms.groupby(["modified_peptide", "decoy"])["pp"].max().reset_index()
104 |         targets = peptides[~peptides["decoy"]].copy()
105 |         decoys = peptides[peptides["decoy"]].copy()
106 | 
107 |         targets["p_value"] = pemp(targets["pp"], decoys["pp"])
108 |         targets["q_value"] = qvalue(
109 |             targets["p_value"],
110 |             pi0est(
111 |                 targets["p_value"],
112 |                 pi0_lambda,
113 |                 pi0_method,
114 |                 pi0_smooth_df,
115 |                 pi0_smooth_log_pi0,
116 |             )["pi0"],
117 |             pfdr,
118 |         )
119 | 
120 |         plot(plot_path, "global peptide scores", targets["pp"], decoys["pp"])
121 | 
122 |     return targets[targets["q_value"] < peptide_fdr_threshold][
123 |         "modified_peptide"
124 |     ], np.min(targets[targets["q_value"] < peptide_fdr_threshold]["pp"])
125 | 
126 | 
127 | def protein_fdr(psms, protein_fdr_threshold, pi0_lambda, plot_path, nofdr):
128 |     pi0_method = "bootstrap"
129 |     pi0_smooth_df = 3
130 |     pi0_smooth_log_pi0 = False
131 |     pfdr = False
132 | 
133 |     if nofdr:
134 |         proteins = (
135 |             psms.groupby(["protein_id", "decoy", "q_value"])["pp"].max().reset_index()
136 |         )
137 |         targets = proteins[~proteins["decoy"]].copy()
138 |         decoys = proteins[proteins["decoy"]].copy()
139 | 
140 |     else:
141 |         proteins = psms.groupby(["protein_id", "decoy"])["pp"].max().reset_index()
142 |         targets = proteins[~proteins["decoy"]].copy()
143 |         decoys = proteins[proteins["decoy"]].copy()
144 | 
145 |         targets["p_value"] = pemp(targets["pp"], decoys["pp"])
146 |         targets["q_value"] = qvalue(
147 |             targets["p_value"],
148 |             pi0est(
149 |                 targets["p_value"],
150 |                 pi0_lambda,
151 |                 pi0_method,
152 |                 pi0_smooth_df,
153 |                 pi0_smooth_log_pi0,
154 |             )["pi0"],
155 |             pfdr,
156 |         )
157 | 
158 |         plot(plot_path, "global protein scores", targets["pp"], decoys["pp"])
159 | 
160 |     return targets[targets["q_value"] < protein_fdr_threshold]["protein_id"], np.min(
161 |         targets[targets["q_value"] < protein_fdr_threshold]["pp"]
162 |     )
163 | 
164 | 
165 | def process_psms(
166 |     psms,
167 |     psmtsv,
168 |     peptidetsv,
169 |     psm_fdr_threshold,
170 |     peptide_fdr_threshold,
171 |     protein_fdr_threshold,
172 |     pi0_lambda,
173 |     peptide_plot_path,
174 |     protein_plot_path,
175 |     proteotypic,
176 |     nofdr,
177 | ):
178 |     # Append columns
179 |     psms["base_name"] = psms["run_id"].apply(
180 |         lambda x: os.path.splitext(os.path.basename(x))[0]
181 |     )
182 | 
183 |     if None not in (psmtsv, peptidetsv):
184 |         # Read psm.tsv and peptide.tsv
185 |         peptidetsv_df = pd.read_csv(
186 |             peptidetsv,
187 |             index_col=False,
188 |             sep="\t",
189 |             usecols=["Peptide", "Gene", "Protein ID"],
190 |         )
191 |         psmtsv_df = pd.read_csv(
192 |             psmtsv,
193 |             index_col=False,
194 |             sep="\t",
195 |             usecols=["Spectrum", "Spectrum File", "Peptide"],
196 |         )
197 | 
198 |         # Filter out PSMs whose peptides are not in peptide.tsv
199 |         psmtsv_df = psmtsv_df[psmtsv_df["Peptide"].isin(peptidetsv_df["Peptide"])]
200 | 
201 |         # Generate a group_id column
202 |         temp_df = psmtsv_df["Spectrum"].str.split(".", expand=True)
203 |         psmtsv_df["group_id"] = (
204 |             temp_df.iloc[:, 0]
205 |             + "_"
206 |             + pd.to_numeric(temp_df.iloc[:, -2]).astype(str)
207 |             + psmtsv_df["Spectrum File"]
208 |             .apply(lambda x: posixpath.basename(ntpath.basename(x)))
209 |             .str.extract("(_rank[0-9]+)", expand=False)
210 |             .apply(lambda x: "" if pd.isna(x) else x)
211 |         )
212 | 
213 |         # Filter psm dataframe
214 |         psms = psms[psms["group_id"].isin(psmtsv_df["group_id"])]
215 | 
216 |         # Update gene_id and protein_id
217 |         psms = psms.merge(
218 |             peptidetsv_df, how="left", left_on="peptide_sequence", right_on="Peptide"
219 |         )
220 |         psms.drop(["gene_id", "protein_id"], inplace=True, axis=1)
221 |         psms.rename(
222 |             columns={"Gene": "gene_id", "Protein ID": "protein_id"}, inplace=True
223 |         )
224 |         psms["num_tot_proteins"] = 1
225 |         timestamped_echo(
226 |             "Info: %s redundant PSMs identified after filtering with %s and %s"
227 |             % (psms.shape[0], psmtsv, peptidetsv)
228 |         )
229 |     else:
230 |         # Filter proteotypic peptides
231 |         if proteotypic:
232 |             psms = psms[psms["num_tot_proteins"] == 1].copy()
233 |         else:
234 |             raise click.ClickException(
235 |                 "Support for non-proteotypic peptides is not yet implemented."
236 |             )
237 | 
238 |         # Generate canonical set of protein identifiers
239 |         proteinset = psms[["peptide_sequence", "protein_id"]].drop_duplicates()
240 |         proteinset["protein_id"] = proteinset["protein_id"].astype(str)
241 |         proteinset_canonical = (
242 |             proteinset.groupby("peptide_sequence")
243 |             .apply(
244 |                 lambda x: ";".join(
245 |                     sorted(
246 |                         list(
247 |                             set(
248 |                                 [
249 |                                     a
250 |                                     for b in x["protein_id"].str.split(";").tolist()
251 |                                     for a in b
252 |                                 ]
253 |                             )
254 |                         )
255 |                     )
256 |                 )
257 |             )
258 |             .reset_index(name="protein_id")
259 |         )
260 | 
261 |         psms = pd.merge(
262 |             psms.drop(columns="protein_id"), proteinset_canonical, on="peptide_sequence"
263 |         )
264 | 
265 |         # Prepare PeptideProphet / iProphet results
266 |         if "q_value" not in psms.columns:
267 |             psms["q_value"] = compute_model_fdr(psms["pep"].values)
268 | 
269 |         # Confident peptides and protein in global context
270 |         peptides, peptide_pp_threshold = peptide_fdr(
271 |             psms, peptide_fdr_threshold, pi0_lambda, peptide_plot_path, nofdr
272 |         )
273 |         timestamped_echo(
274 |             "Info: %s modified peptides identified (q-value < %s; PP threshold = %s)"
275 |             % (len(peptides), peptide_fdr_threshold, peptide_pp_threshold)
276 |         )
277 |         proteins, protein_pp_threshold = protein_fdr(
278 |             psms, protein_fdr_threshold, pi0_lambda, protein_plot_path, nofdr
279 |         )
280 |         timestamped_echo(
281 |             "Info: %s proteins identified (q-value < %s; PP threshold = %s)"
282 |             % (len(proteins), protein_fdr_threshold, protein_pp_threshold)
283 |         )
284 | 
285 |         # Filter peptides and proteins
286 |         psms = psms[psms["modified_peptide"].isin(peptides)]
287 |         psms = psms[psms["protein_id"].isin(proteins)]
288 | 
289 |         # Filter PSMs
290 |         psms = psms[psms["q_value"] < psm_fdr_threshold]
291 | 
292 |         # Remove decoys
293 |         psms = psms[~psms["decoy"]]
294 | 
295 |         timestamped_echo(
296 |             "Info: %s redundant PSMs identified (q-value < %s; PP threshold = %s)"
297 |             % (psms.shape[0], psm_fdr_threshold, np.min(1 - psms["pep"]))
298 |         )
299 | 
300 |     return psms
301 | 
302 | 
303 | def lowess_iso(x, y, lowess_frac):
304 |     with warnings.catch_warnings():
305 |         warnings.filterwarnings(
306 |             "ignore", message="invalid value encountered in ", category=RuntimeWarning
307 |         )
308 |         lwf = sm.nonparametric.lowess(y, x.ravel(), frac=lowess_frac)
309 |         while pd.isna(lwf[:, 1]).any():
310 |             lowess_frac *= 2
311 |             lwf = sm.nonparametric.lowess(y, x.ravel(), frac=lowess_frac)
312 |     lwf_x = lwf[:, 0]
313 |     ir = (
314 |         sklearn.isotonic.IsotonicRegression()
315 |     )  # make the regression strictly increasing
316 |     lwf_y = ir.fit_transform(lwf_x, lwf[:, 1])
317 |     mask = np.concatenate([[True], np.diff(lwf_y) != 0])  # remove non increasing points
318 |     try:
319 |         return interp1d(
320 |             lwf_x[mask], lwf_y[mask], bounds_error=False, fill_value="extrapolate"
321 |         )
322 |     except ValueError as e:
323 |         timestamped_echo(e)
324 |         return interp1d(lwf_x, lwf_y, bounds_error=False, fill_value="extrapolate")
325 | 
326 | 
327 | class LowessIsoEstimator:
328 |     def __init__(self, lowess_frac):
329 |         self.lowess_frac = lowess_frac
330 | 
331 |     def fit(self, x, y):
332 |         self.lwi = lowess_iso(x, y, self.lowess_frac)
333 |         return self
334 | 
335 |     def get_params(self, deep=False):
336 |         return {"lowess_frac": self.lowess_frac}
337 | 
338 |     def set_params(self, lowess_frac):
339 |         self.lowess_frac = lowess_frac
340 |         return self
341 | 
342 |     def score(self, x, y):
343 |         resid = self.lwi(x.ravel()) - y
344 |         return 1 / resid.dot(resid)
345 | 
346 |     def predict(self, x):
347 |         return self.lwi(x.ravel())
348 | 
349 |     def __repr__(self):
350 |         return str(self.get_params())
351 | 
352 | 
353 | def lowess_iso_predictor(filename, x, y, xpred):
354 |     gsc = sklearn.model_selection.GridSearchCV(
355 |         LowessIsoEstimator(None),
356 |         {"lowess_frac": [0.01, 0.02, 0.04, 0.08]},
357 |         cv=sklearn.model_selection.KFold(4, shuffle=True, random_state=0),
358 |         n_jobs=min(os.cpu_count(), 61),
359 |     )
360 | 
361 |     gsc.fit(x.reshape(-1, 1), y)
362 |     timestamped_echo(
363 |         f"Info: {filename}; Lowess fraction used: {gsc.best_params_['lowess_frac']}."
364 |     )
365 |     return gsc.best_estimator_.predict(xpred)
366 | 
367 | 
368 | def lowess(
369 |     run,
370 |     reference_run,
371 |     xcol,
372 |     ycol,
373 |     lowess_frac,
374 |     psm_fdr_threshold,
375 |     min_peptides,
376 |     filename,
377 |     main_path,
378 | ):
379 |     # Filter alignment data
380 |     run_alignment = run[run["q_value"] < psm_fdr_threshold] if "q_value" in run else run
381 |     if "q_value" in reference_run:
382 |         reference_run_alignment = reference_run[
383 |             reference_run["q_value"] < psm_fdr_threshold
384 |         ]
385 |     else:
386 |         reference_run_alignment = reference_run
387 | 
388 |     dfm = pd.merge(
389 |         run_alignment,
390 |         reference_run_alignment[["modified_peptide", "precursor_charge", ycol]],
391 |         on=["modified_peptide", "precursor_charge"],
392 |     )
393 |     timestamped_echo(
394 |         f"Info: {filename}; Peptide overlap between run and reference: {dfm.shape[0]}."
395 |     )
396 |     if dfm.shape[0] <= min_peptides:
397 |         timestamped_echo(
398 |             f"Info: {filename}; Skipping run because not enough peptides could be found for alignment."
399 |         )
400 |         return pd.DataFrame()
401 | 
402 |     if dfm.shape[0] < 50:  # use linear regression for small reference size
403 |         linreg = sklearn.linear_model.LinearRegression().fit(
404 |             dfm[xcol].to_numpy().reshape(-1, 1), dfm[ycol]
405 |         )
406 |         run[ycol] = linreg.predict(run[xcol].to_numpy().reshape(-1, 1))
407 |     else:
408 |         # Fit and apply the lowess model
409 |         run[ycol] = (
410 |             lowess_iso_predictor(
411 |                 filename,
412 |                 dfm[xcol].to_numpy(),
413 |                 dfm[ycol].to_numpy(),
414 |                 run[xcol].to_numpy(),
415 |             )
416 |             if lowess_frac == 0
417 |             else lowess_iso(dfm[xcol].to_numpy(), dfm[ycol].to_numpy(), lowess_frac)(
418 |                 run[xcol].to_numpy()
419 |             )
420 |         )
421 | 
422 |     # Plot regression
423 |     plt.plot(dfm[xcol].to_numpy(), dfm[ycol].to_numpy(), "o")
424 |     run1 = run[[xcol, ycol]].sort_values(xcol)
425 |     plt.plot(run1[xcol].to_numpy(), run1[ycol].to_numpy())
426 |     plt.xlabel(xcol)
427 |     plt.ylabel(ycol)
428 |     plt.savefig(os.path.join(main_path, filename + ".pdf"))
429 |     plt.close()
430 |     run1.to_pickle(os.path.join(main_path, filename + ".alignment_pkl"))
431 |     return run
432 | 
433 | 
434 | def remove_rank_suffix(x):
435 |     """
436 | 
437 |     :param x:
438 |     :return:
439 | 
440 |     >>> remove_rank_suffix('23aug2017_hela_serum_timecourse_4mz_narrow_6_rank4')
441 |     '23aug2017_hela_serum_timecourse_4mz_narrow_6'
442 |     >>> remove_rank_suffix('23aug2017_hela_serum_timecourse_4mz_narrow_6_rank44')
443 |     '23aug2017_hela_serum_timecourse_4mz_narrow_6'
444 |     >>> remove_rank_suffix('23aug2017_hela_serum_timecourse_4mz_narrow_6')
445 |     '23aug2017_hela_serum_timecourse_4mz_narrow_6'
446 |     """
447 |     import re
448 | 
449 |     return re.compile("(.+?)(?:_rank[0-9]+)?").fullmatch(x).group(1)
450 | 
451 | 
452 | def unify_modified_peptide_masses(mod_pep, transform=None):
453 |     if not hasattr(mod_pep, "str"):
454 |         return mod_pep, transform
455 |     if transform is None:
456 |         import collections
457 | 
458 |         float_list = {ee for e in mod_pep.str.findall("\\[(.+?)\\]") for ee in e}
459 |         d = collections.defaultdict(list)
460 |         current_group = None
461 |         for i, (v, k) in enumerate(sorted((float(e), e) for e in float_list)):
462 |             if current_group is None:
463 |                 current_group = d[i]
464 |             else:
465 |                 if abs(current_group[-1][0] / v - 1) > 0.001:
466 |                     current_group = d[i]
467 |             current_group.append((v, k))
468 |         transform = {s: l[0][1] for _, l in d.items() for f, s in l}
469 | 
470 |     def transform_func(mo):
471 |         ret = mo.group(0)
472 |         for k, v in transform.items():
473 |             ret = ret.replace(k, v)
474 |         return ret
475 | 
476 |     return mod_pep.str.replace(
477 |         "(?<=\\[).+?(?=\\])", transform_func, regex=True
478 |     ), transform
479 | 
480 | 
481 | def generate(
482 |     files,
483 |     outfile,
484 |     psmtsv,
485 |     peptidetsv,
486 |     perform_rt_calibration,
487 |     rt_referencefile,
488 |     rt_reference_run_path,
489 |     rt_filter,
490 |     perform_im_calibration,
491 |     im_referencefile,
492 |     im_reference_run_path,
493 |     im_filter,
494 |     psm_fdr_threshold,
495 |     peptide_fdr_threshold,
496 |     protein_fdr_threshold,
497 |     rt_lowess_frac,
498 |     rt_psm_fdr_threshold,
499 |     im_lowess_frac,
500 |     im_psm_fdr_threshold,
501 |     pi0_lambda,
502 |     peptide_plot_path,
503 |     protein_plot_path,
504 |     min_peptides,
505 |     proteotypic,
506 |     consensus,
507 |     nofdr,
508 |     diannpqp,
509 | ):
510 |     # Parse input arguments
511 |     psm_files = []
512 |     spectra = []
513 | 
514 |     if len(files) == 1 and files[0].endswith(".txt"):
515 |         files = pathlib.Path(files[0]).read_text().splitlines()
516 | 
517 |     for file in files:
518 |         if "psmpkl" in file:
519 |             psm_files.append(file)
520 |         if "peakpkl" in file:
521 |             spectra.append(file)
522 | 
523 |     if len(psm_files) == 0:
524 |         raise click.ClickException(
525 |             "No PSMs files present. Need to have tag 'psmpkl' in filename."
526 |         )
527 | 
528 |     if len(spectra) == 0:
529 |         raise click.ClickException(
530 |             "No spectrum files present. Need to have tag 'peakpkl' in filename."
531 |         )
532 | 
533 |     if peptidetsv is not None and psmtsv is None:
534 |         raise click.ClickException("There is a peptide.tsv but no psm.tsv.")
535 |     elif peptidetsv is None and psmtsv is not None:
536 |         raise click.ClickException("There is a psm.tsv but no peptide.tsv.")
537 | 
538 |     if None not in (psmtsv, peptidetsv):
539 |         timestamped_echo(
540 |             "Info: There are psm.tsv and peptide.tsv. Will ignore --psm_fdr_threshold, --peptide_fdr_threshold, --protein_fdr_threshold, --pi0_lambda, --proteotypic, and --no-proteotypic."
541 |         )
542 | 
543 |     # Read all PSM files
544 |     psms_list = []
545 |     for psm_file in psm_files:
546 |         timestamped_echo("Info: Reading file %s." % psm_file)
547 |         psm_tab = pd.read_pickle(psm_file)
548 |         if psm_tab.shape[0] > 0:
549 |             psms_list.append(psm_tab)
550 |     psms = pd.concat(psms_list).reset_index(drop=True)
551 |     psms["pp"] = 1 - psms["pep"]
552 |     psms["modified_peptide"], transform_mass = unify_modified_peptide_masses(
553 |         psms["modified_peptide"]
554 |     )
555 | 
556 |     timestamped_echo("Info: In total %s PSMs loaded." % psms.shape[0])
557 | 
558 |     pepid = process_psms(
559 |         psms,
560 |         psmtsv,
561 |         peptidetsv,
562 |         psm_fdr_threshold,
563 |         peptide_fdr_threshold,
564 |         protein_fdr_threshold,
565 |         pi0_lambda,
566 |         peptide_plot_path,
567 |         protein_plot_path,
568 |         proteotypic,
569 |         nofdr,
570 |     )
571 | 
572 |     # Get main path for figures
573 |     main_path = os.path.dirname(os.path.abspath(peptide_plot_path))
574 | 
575 |     # Generate set of best replicate identifications per run
576 |     pepidr = pepid.loc[
577 |         pepid.groupby(["base_name", "modified_peptide", "precursor_charge"])[
578 |             "pp"
579 |         ].idxmax()
580 |     ].sort_index()
581 | 
582 |     aligned_runs = pepidr  # this variable will store the aligned runs
583 |     # Prepare reference iRT list (if enabled)
584 |     if perform_rt_calibration:
585 |         rt_reference_run_columns = ["modified_peptide", "precursor_charge", "irt"]
586 | 
587 |         if rt_referencefile is not None:
588 |             # Read reference file if present
589 |             rt_reference_run = pd.read_csv(rt_referencefile, index_col=False, sep="\t")
590 |             if not set(rt_reference_run_columns).issubset(rt_reference_run.columns):
591 |                 raise click.ClickException(
592 |                     "Reference iRT file has wrong format. Requires columns 'modified_peptide', 'precursor_charge' and 'irt'."
593 |                 )
594 |             if rt_reference_run.shape[0] < 10:
595 |                 raise click.ClickException(
596 |                     "Reference iRT file has too few data points. Requires at least 10."
597 |                 )
598 |         else:
599 |             # Select reference run
600 |             pepidr_stats = (
601 |                 pepidr.groupby("base_name")[["modified_peptide"]].count().reset_index()
602 |             )
603 |             timestamped_echo(pepidr_stats)
604 | 
605 |             if rt_filter is not None:
606 |                 timestamped_echo(
607 |                     "Info: Filter candidate RT reference runs by tag '%s'." % rt_filter
608 |                 )
609 |                 pepidr_stats = pepidr_stats[
610 |                     pepidr_stats["base_name"].str.contains(rt_filter)
611 |                 ]
612 |                 timestamped_echo(pepidr_stats)
613 | 
614 |             rt_reference_run_base_name = pepidr_stats.loc[
615 |                 pepidr_stats["modified_peptide"].idxmax()
616 |             ]["base_name"]
617 | 
618 |             rt_reference_run = pepidr[
619 |                 pepidr["base_name"] == rt_reference_run_base_name
620 |             ].copy()
621 | 
622 |             # Normalize RT of reference run
623 |             min_max_scaler = preprocessing.MinMaxScaler()
624 |             rt_reference_run["irt"] = (
625 |                 min_max_scaler.fit_transform(rt_reference_run[["retention_time"]]) * 100
626 |             )
627 |             rt_reference_run[rt_reference_run_columns].to_csv(
628 |                 rt_reference_run_path, sep="\t", index=False
629 |             )
630 | 
631 |         # Normalize RT of all runs against reference
632 |         aligned_runs = aligned_runs.groupby(
633 |             "base_name", as_index=False, group_keys=False
634 |         ).apply(
635 |             lambda x: lowess(
636 |                 x,
637 |                 rt_reference_run,
638 |                 "retention_time",
639 |                 "irt",
640 |                 rt_lowess_frac,
641 |                 rt_psm_fdr_threshold,
642 |                 min_peptides,
643 |                 "easypqp_rt_alignment_" + x.name,
644 |                 main_path,
645 |             )
646 |         )
647 | 
648 |     else:  # in this case no rt_calibration is performed, we just scale the retention time
649 |         aligned_runs = pepidr
650 |         min_max_scaler = preprocessing.MinMaxScaler()
651 |         aligned_runs["irt"] = (
652 |             min_max_scaler.fit_transform(aligned_runs[["retention_time"]]) * 100
653 |         )
654 | 
655 |     # Determine if IM is present in the search data
656 |     if pepidr["ion_mobility"].isnull().all():
657 |         enable_im = False
658 |     else:
659 |         enable_im = True
660 | 
661 |     if perform_im_calibration and enable_im:
662 |         # Prepare reference IM list
663 |         im_reference_run_columns = ["modified_peptide", "precursor_charge", "im"]
664 | 
665 |         if im_referencefile is not None:
666 |             # Read reference file if present
667 |             im_reference_run = pd.read_csv(im_referencefile, index_col=False, sep="\t")
668 |             if not set(im_reference_run_columns).issubset(im_reference_run.columns):
669 |                 raise click.ClickException(
670 |                     "Reference IM file has wrong format. Requires columns 'modified_peptide', 'precursor_charge' and 'im'."
671 |                 )
672 |             if im_reference_run.shape[0] < 10:
673 |                 raise click.ClickException(
674 |                     "Reference IM file has too few data points. Requires at least 10."
675 |                 )
676 | 
677 |         else:
678 |             # Select reference run
679 |             pepidr_stats = (
680 |                 pepidr.groupby("base_name")[["modified_peptide"]].count().reset_index()
681 |             )
682 |             timestamped_echo(pepidr_stats)
683 | 
684 |             if im_filter is not None:
685 |                 timestamped_echo(
686 |                     "Info: Filter candidate IM reference runs by tag '%s'." % im_filter
687 |                 )
688 |                 pepidr_stats = pepidr_stats[
689 |                     pepidr_stats["base_name"].str.contains(im_filter)
690 |                 ]
691 |                 timestamped_echo(pepidr_stats)
692 | 
693 |             im_reference_run_base_name = pepidr_stats.loc[
694 |                 pepidr_stats["modified_peptide"].idxmax()
695 |             ]["base_name"]
696 | 
697 |             im_reference_run = pepidr[
698 |                 pepidr["base_name"] == im_reference_run_base_name
699 |             ].copy()
700 | 
701 |             # Set IM of reference run
702 |             im_reference_run["im"] = im_reference_run["ion_mobility"]
703 |             im_reference_run[im_reference_run_columns].to_csv(
704 |                 im_reference_run_path, sep="\t", index=False
705 |             )
706 | 
707 |         # perform IM calibration
708 |         aligned_runs = aligned_runs.groupby("base_name", as_index=False).apply(
709 |             lambda x: lowess(
710 |                 x,
711 |                 im_reference_run,
712 |                 "ion_mobility",
713 |                 "im",
714 |                 im_lowess_frac,
715 |                 im_psm_fdr_threshold,
716 |                 min_peptides,
717 |                 "easypqp_im_alignment_" + x.name,
718 |                 main_path,
719 |             )
720 |         )
721 | 
722 |     elif enable_im:  # if no calibration just transfer information as is
723 |         aligned_runs["im"] = aligned_runs["ion_mobility"]
724 |     else:
725 |         pass
726 | 
727 |     pepida = aligned_runs
728 | 
729 |     if pepida.empty or "irt" not in pepida.columns:
730 |         timestamped_echo(
731 |             "Info: Not enough peptides could be found for alignment. There will be a blank spectral library."
732 |         )
733 |         return
734 | 
735 |     # Remove peptides without valid iRT
736 |     pepida = pepida.loc[np.isfinite(pepida["irt"])]
737 | 
738 |     # Remove peptides without valid IM
739 |     if enable_im:
740 |         pepida = pepida.loc[np.isfinite(pepida["im"])]
741 |     else:
742 |         pepida.loc[:, "im"] = np.nan
743 | 
744 |     if pepida.empty:
745 |         timestamped_echo(
746 |             "Info: Not enough peptides could be found for alignment. There will be a blank spectral library."
747 |         )
748 |         return
749 | 
750 |     # Generate set of non-redundant global best replicate identifications
751 |     pepidb = pepida.loc[
752 |         pepida.groupby(["modified_peptide", "precursor_charge"])["pp"].idxmax()
753 |     ].sort_index()
754 | 
755 |     # Prepare ID mzML pairing
756 |     peak_files = pd.DataFrame({"path": spectra})
757 |     peak_files["base_name"] = peak_files["path"].apply(
758 |         lambda x: remove_rank_suffix(os.path.splitext(os.path.basename(x))[0])
759 |     )
760 | 
761 |     # Parse mzXML to retrieve peaks and store results in peak files
762 |     replicate_pqp = []
763 |     for idx, peak_file in peak_files.iterrows():
764 |         timestamped_echo("Info: Parsing file %s." % peak_file["path"])
765 |         meta_run = pepida[pepida["base_name"] == peak_file["base_name"]]
766 |         if meta_run.shape[0] > 0:
767 |             meta_global = pepidb[pepidb["base_name"] == peak_file["base_name"]]
768 |             peaks = pd.read_pickle(peak_file["path"])
769 |             peaks["modified_peptide"], _ = unify_modified_peptide_masses(
770 |                 peaks["modified_peptide"], transform_mass
771 |             )
772 |             # Generate run-specific PQP files for OpenSWATH alignment
773 |             if consensus or ("_Q1" in peak_file["base_name"]):
774 |                 run_pqp = pd.merge(
775 |                     meta_run,
776 |                     peaks,
777 |                     on=["modified_peptide", "precursor_charge", "scan_id"],
778 |                 )[
779 |                     [
780 |                         "precursor_mz",
781 |                         "product_mz",
782 |                         "fragment",
783 |                         "intensity",
784 |                         "irt",
785 |                         "im",
786 |                         "protein_id",
787 |                         "gene_id",
788 |                         "peptide_sequence",
789 |                         "modified_peptide",
790 |                         "precursor_charge",
791 |                     ]
792 |                 ]
793 |                 run_pqp.columns = [
794 |                     "PrecursorMz",
795 |                     "ProductMz",
796 |                     "Annotation",
797 |                     "LibraryIntensity",
798 |                     "NormalizedRetentionTime",
799 |                     "PrecursorIonMobility",
800 |                     "ProteinId",
801 |                     "GeneName",
802 |                     "PeptideSequence",
803 |                     "ModifiedPeptideSequence",
804 |                     "PrecursorCharge",
805 |                 ]
806 |                 run_pqp["PrecursorCharge"] = run_pqp["PrecursorCharge"].astype(int)
807 |                 run_pqp_path = os.path.splitext(peak_file["path"])[0] + "_run_peaks.tsv"
808 |                 run_pqp.to_csv(run_pqp_path, sep="\t", index=False)
809 |                 if consensus:
810 |                     replicate_pqp.append(run_pqp)
811 | 
812 |             # Generate global non-redundant PQP files
813 |             if not consensus:
814 |                 global_pqp = pd.merge(
815 |                     meta_global,
816 |                     peaks,
817 |                     on=["modified_peptide", "precursor_charge", "scan_id"],
818 |                 )[
819 |                     [
820 |                         "precursor_mz",
821 |                         "product_mz",
822 |                         "fragment",
823 |                         "intensity",
824 |                         "irt",
825 |                         "im",
826 |                         "protein_id",
827 |                         "gene_id",
828 |                         "peptide_sequence",
829 |                         "modified_peptide",
830 |                         "precursor_charge",
831 |                     ]
832 |                 ]
833 |                 global_pqp.columns = [
834 |                     "PrecursorMz",
835 |                     "ProductMz",
836 |                     "Annotation",
837 |                     "LibraryIntensity",
838 |                     "NormalizedRetentionTime",
839 |                     "PrecursorIonMobility",
840 |                     "ProteinId",
841 |                     "GeneName",
842 |                     "PeptideSequence",
843 |                     "ModifiedPeptideSequence",
844 |                     "PrecursorCharge",
845 |                 ]
846 |                 global_pqp["PrecursorCharge"] = global_pqp["PrecursorCharge"].astype(
847 |                     int
848 |                 )
849 |                 replicate_pqp.append(global_pqp)
850 | 
851 |     # Aggregate consensus spectra
852 |     pqp = pd.concat(replicate_pqp)
853 |     if consensus:
854 |         pqp_irt = (
855 |             pqp[
856 |                 [
857 |                     "ModifiedPeptideSequence",
858 |                     "PrecursorCharge",
859 |                     "NormalizedRetentionTime",
860 |                     "PrecursorIonMobility",
861 |                 ]
862 |             ]
863 |             .drop_duplicates()
864 |             .groupby(["ModifiedPeptideSequence", "PrecursorCharge"])[
865 |                 ["NormalizedRetentionTime", "PrecursorIonMobility"]
866 |             ]
867 |             .median()
868 |             .reset_index()
869 |         )
870 |         # group by modified peptide sequence before product m/z to avoid intermixing fragments of modified peptide positional isomers (e.g., T[80]PEPTIDE and TPEPT[80]IDE)
871 |         pqp_mass = (
872 |             pqp.groupby(
873 |                 [
874 |                     "PrecursorMz",
875 |                     "ModifiedPeptideSequence",
876 |                     "ProductMz",
877 |                     "Annotation",
878 |                     "ProteinId",
879 |                     "GeneName",
880 |                     "PeptideSequence",
881 |                     "PrecursorCharge",
882 |                 ],
883 |                 dropna=False,
884 |             )["LibraryIntensity"]
885 |             .median()
886 |             .reset_index()
887 |         )
888 |         pqp_mass = pqp_mass[
889 |             [
890 |                 "PrecursorMz",
891 |                 "ProductMz",
892 |                 "Annotation",
893 |                 "ProteinId",
894 |                 "GeneName",
895 |                 "PeptideSequence",
896 |                 "ModifiedPeptideSequence",
897 |                 "PrecursorCharge",
898 |                 "LibraryIntensity",
899 |             ]
900 |         ]  # rearrange columns back to the normal output order
901 |         pqp = pd.merge(
902 |             pqp_mass, pqp_irt, on=["ModifiedPeptideSequence", "PrecursorCharge"]
903 |         )
904 | 
905 |     # Generate DIA-NN2 compatible PQP file
906 |     if diannpqp:
907 |         pqp["FragmentLossType"] = np.nan
908 |         pqp["FragmentType"] = pqp["Annotation"].str[0]
909 |         pqp["FragmentSeriesNumber"] = pqp["Annotation"].str[1]
910 |         pqp["FragmentCharge"] = pqp["Annotation"].str.split("^").str[1].astype(int)
911 |         pqp["Proteotypic"] = [
912 |             1 if ";" in prot_id else 0 for prot_id in pqp["ProteinId"]
913 |         ]
914 |         # Remove redundant columns
915 |         pqp = pqp.drop(["Annotation"], axis=1)
916 | 
917 |     # Write output TSV file
918 |     pqp.to_csv(outfile, sep="\t", index=False)
919 | 


--------------------------------------------------------------------------------
/easypqp/sage.py:
--------------------------------------------------------------------------------
   1 | import os
   2 | import json
   3 | import re
   4 | import numpy as np
   5 | import pandas as pd
   6 | from typing import Optional, Tuple, Dict, List
   7 | 
   8 | from .util import timestamped_echo
   9 | from .convert import unimod as UniModHelper
  10 | 
  11 | 
  12 | def _basename_wo_ext(p: str) -> str:
  13 |     """
  14 |     Return the basename without extensions, handling common compression/archive
  15 |     extensions so that e.g. 'file.mzML.gz' -> 'file'.
  16 | 
  17 |     Strategy:
  18 |     - take the basename
  19 |     - if the final extension is a known compression/archive suffix ('.gz', '.bz2',
  20 |       '.zst', '.tgz', etc.) remove it
  21 |     - then remove one remaining extension (the common data file extension)
  22 |     - return the resulting stem
  23 |     """
  24 |     name = os.path.basename(p or "")
  25 |     comp_suffixes = {
  26 |         ".gz",
  27 |         ".zst",
  28 |         ".tar",
  29 |     }
  30 | 
  31 |     root, ext = os.path.splitext(name)
  32 |     if ext and ext.lower() in comp_suffixes:
  33 |         name = root
  34 | 
  35 |     stem, _ = os.path.splitext(name)
  36 |     return stem
  37 | 
  38 | 
  39 | def _get_first_existing(df: pd.DataFrame, cols: List[str], cast=None, default=None):
  40 |     """
  41 |     Return the first existing column from a DataFrame as a pandas Series, with optional numeric casting,
  42 |     or a Series filled with a default value if none of the columns exist.
  43 | 
  44 |     Parameters
  45 |     ----------
  46 |     df : pandas.DataFrame
  47 |         The DataFrame to search for columns.
  48 |     cols : list[str]
  49 |         Ordered list of column names to look for. The function returns the first name in this list
  50 |         that is present in df.columns.
  51 |     cast : Any, optional
  52 |         If None (the default), the matched column is returned unchanged (the Series as stored in df).
  53 |         If not None, the matched column is converted to numeric using pandas.to_numeric(..., errors="coerce")
  54 |         before being returned. Note: the provided value is used only as a flag; it is not called/applied.
  55 |     default : Any, optional
  56 |         If no column from cols is present in df and default is None, the function returns None.
  57 |         If default is not None, the function returns a pandas.Series of length len(df) where every
  58 |         element equals default.
  59 | 
  60 |     Returns
  61 |     -------
  62 |     pandas.Series or None
  63 |         - If a matching column is found: the corresponding Series from df (possibly converted to numeric).
  64 |         - If no matching column is found and default is provided: a Series filled with default values.
  65 |         - If no matching column is found and default is None: None.
  66 | 
  67 |     Notes
  68 |     -----
  69 |     - Column lookup is an exact string membership check against df.columns.
  70 |     - When cast is not None, non-convertible values in the selected column become NaN due to
  71 |       errors="coerce" in pandas.to_numeric.
  72 |     - The function does not modify the input DataFrame.
  73 |     - If df is empty and default is provided, an empty Series (length 0) of the default value is returned.
  74 | 
  75 |     Examples
  76 |     --------
  77 |     - If cols = ["a", "b"] and df has column "b" but not "a", the function returns df["b"] (or its numeric cast).
  78 |     - If none of the cols exist and default=0, the function returns a Series of zeros with length len(df).
  79 |     """
  80 |     for c in cols:
  81 |         if c in df.columns:
  82 |             return df[c] if cast is None else pd.to_numeric(df[c], errors="coerce")
  83 |     if default is None:
  84 |         return None
  85 |     return pd.Series([default] * len(df))
  86 | 
  87 | 
  88 | def _read_table(path: str) -> pd.DataFrame:
  89 |     """Read a TSV or Parquet file into a DataFrame."""
  90 |     p = (path or "").lower()
  91 |     if p.endswith((".parquet", ".pq")):
  92 |         try:
  93 |             return pd.read_parquet(path)
  94 |         except Exception as e:
  95 |             raise RuntimeError(f"Failed to read Parquet file: {path}\n{e}")
  96 |     if p.endswith((".tsv")):
  97 |         try:
  98 |             return pd.read_csv(path, sep="\t", dtype=str)
  99 |         except Exception as e:
 100 |             raise RuntimeError(f"Failed to read TSV file: {path}\n{e}")
 101 | 
 102 | 
 103 | class SagePSMParser:
 104 |     """
 105 |     Parse results.sage.tsv to EasyPQP PSM schema (subset used by library.generate)
 106 | 
 107 |     Output columns:
 108 |       run_id, scan_id, hit_rank, massdiff, precursor_charge, retention_time,
 109 |       ion_mobility, peptide_sequence, protein_id, gene_id, num_tot_proteins,
 110 |       decoy, pep, modified_peptide, group_id, precursor_mz
 111 |     """
 112 | 
 113 |     PROTON = 1.0072764
 114 |     NEUTRON = 1.00335
 115 |     # Sage bracket delta pattern: A[+15.9949], C[-0.9840], etc.
 116 |     BRACKET_RE = re.compile(r"([A-Z])\[(?P<delta>[+-]?\d+(?:\.\d+)?)\]")
 117 |     # Uniprot token pattern: db|ACCESSION|ENTRY_NAME  (e.g., sp|P01903|DRA_HUMAN)
 118 |     _ACC_ENTRY_RE = re.compile(r"^[A-Za-z]{2}\|(?P<acc>[^|]+)\|(?P<entry>[^|]+)$")
 119 |     # Common decoy prefixes occasionally carried into protein tokens (we still rely on label for decoy)
 120 |     _DECOY_PREFIX_RE = re.compile(r"^(?:decoy_|rev_)+", flags=re.IGNORECASE)
 121 | 
 122 |     def __init__(
 123 |         self,
 124 |         results_tsv: str,
 125 |         unimod_xml: Optional[str],
 126 |         max_delta_unimod: float = 0.02,
 127 |         mz_precision_digits: int = 6,
 128 |     ):
 129 |         self.results_tsv = results_tsv
 130 |         self.um = UniModHelper(unimod_xml, max_delta_unimod) if unimod_xml else None
 131 |         self.max_delta_unimod = max_delta_unimod
 132 |         self.mz_precision_digits = mz_precision_digits
 133 | 
 134 |     @staticmethod
 135 |     def _uniq_preserve(seq):
 136 |         """De-duplicate while preserving order."""
 137 |         seen = set()
 138 |         out = []
 139 |         for x in seq:
 140 |             if x not in seen:
 141 |                 seen.add(x)
 142 |                 out.append(x)
 143 |         return out
 144 | 
 145 |     def _clean_token(self, tok: str) -> str:
 146 |         """Strip decoy prefixes and whitespace from an individual protein token."""
 147 |         tok = (tok or "").strip()
 148 |         return self._DECOY_PREFIX_RE.sub("", tok)
 149 | 
 150 |     def _parse_protein_token(self, tok: str) -> Tuple[str, str]:
 151 |         """
 152 |         Extract (accession, entry_name) from a single token.
 153 |         Falls back gracefully if format isn't db|ACC|ENTRY.
 154 |         """
 155 |         t = self._clean_token(tok)
 156 |         m = self._ACC_ENTRY_RE.match(t)
 157 |         if m:
 158 |             return m.group("acc"), m.group("entry")
 159 |         # Fallbacks:
 160 |         if "|" in t:
 161 |             parts = t.split("|")
 162 |             if len(parts) >= 3:
 163 |                 return parts[1] or "", parts[2] or ""
 164 |             # unknown pipe-y format: best-effort
 165 |             return parts[-1] or "", ""
 166 |         # No pipes at all: treat token as accession-only
 167 |         return t, ""
 168 | 
 169 |     def _split_accessions_and_entries(
 170 |         self, proteins: pd.Series
 171 |     ) -> Tuple[pd.Series, pd.Series, pd.Series]:
 172 |         """
 173 |         Vectorized split of Sage protein strings into:
 174 |           - accessions (semicolon-joined)
 175 |           - entry names (semicolon-joined)
 176 |           - count of unique accessions (for num_tot_proteins)
 177 |         """
 178 |         acc_list = []
 179 |         entry_list = []
 180 |         counts = []
 181 |         for s in proteins.astype(str):
 182 |             if not s or s == "nan":
 183 |                 accs, entries = [], []
 184 |             else:
 185 |                 toks = [t for t in s.split(";") if t.strip()]
 186 |                 pairs = [self._parse_protein_token(t) for t in toks]
 187 |                 accs = self._uniq_preserve([a for a, _ in pairs if a])
 188 |                 entries = self._uniq_preserve([e for _, e in pairs if e])
 189 | 
 190 |             acc_list.append(";".join(accs))
 191 |             entry_list.append(";".join(entries))
 192 |             counts.append(len(accs))
 193 | 
 194 |         return pd.Series(acc_list), pd.Series(entry_list), pd.Series(counts)
 195 | 
 196 |     def _annotate_unimod(self, pep: str) -> str:
 197 |         """
 198 |         Convert Sage bracket deltas (e.g., M[+15.9949]) to (UniMod:<ID>).
 199 |         Tries position-specific contexts (N-term / C-term) before 'Anywhere'.
 200 |         Falls back to leaving the numeric delta if nothing matches.
 201 |         """
 202 |         if self.um is None or "[" not in pep:
 203 |             return pep
 204 | 
 205 |         # 1) get clean sequence and site->delta map from Sage string
 206 |         seq = re.sub(r"\[[-+0-9.]+\]", "", pep)
 207 |         site2delta: Dict[int, float] = {}
 208 |         site = 0
 209 |         i = 0
 210 |         while i < len(pep):
 211 |             ch = pep[i]
 212 |             if ch.isalpha():
 213 |                 site += 1
 214 |                 i += 1
 215 |                 if i < len(pep) and pep[i] == "[":
 216 |                     j = pep.find("]", i + 1)
 217 |                     site2delta[site] = float(pep[i + 1 : j])
 218 |                     i = j + 1
 219 |             else:
 220 |                 i += 1
 221 | 
 222 |         # 2) position preference helper
 223 |         def positions_for_site(idx: int, length: int):
 224 |             if idx == 1:
 225 |                 # try N-terminus flavors first, then Anywhere
 226 |                 return ["Any N-term", "Protein N-term", "Anywhere"]
 227 |             if idx == length:
 228 |                 # try C-terminus flavors first, then Anywhere
 229 |                 return ["Any C-term", "Protein C-term", "Anywhere"]
 230 |             return ["Anywhere"]
 231 | 
 232 |         # 3) very small fallback table for the most common N-term losses
 233 |         #    (used only if UniMod lookup fails)
 234 |         def fallback_unimod(aa: str, idx: int, delta: float, tol=0.02) -> int:
 235 |             if idx == 1 and aa == "Q" and abs(delta - (-17.026549)) <= tol:
 236 |                 return 28  # Gln->pyro-Glu (N-term)
 237 |             if idx == 1 and aa == "E" and abs(delta - (-18.010565)) <= tol:
 238 |                 return 27  # Glu->pyro-Glu (N-term)
 239 |             return -1
 240 | 
 241 |         # 4) build output by injecting (UniMod:<id>) after the modified residue
 242 |         out = list(seq)
 243 |         L = len(seq)
 244 |         for idx in sorted(site2delta.keys(), reverse=True):
 245 |             delta = site2delta[idx]
 246 |             aa = seq[idx - 1]
 247 |             rec_id = -1
 248 | 
 249 |             # Try position-specific contexts first
 250 |             for pos in positions_for_site(idx, L):
 251 |                 rid = self.um.get_id(aa, pos, delta)
 252 |                 if isinstance(rid, tuple):
 253 |                     rid = rid[0]
 254 |                 if rid != -1:
 255 |                     rec_id = rid
 256 |                     break
 257 | 
 258 |             # Fallback: known N-term conversions (pyro-Glu/Q,E)
 259 |             if rec_id == -1:
 260 |                 rec_id = fallback_unimod(aa, idx, delta, self.max_delta_unimod)
 261 | 
 262 |             insert = f"(UniMod:{rec_id})" if rec_id != -1 else f"[{delta:+.6f}]"
 263 |             out.insert(idx, insert)
 264 | 
 265 |         return "".join(out)
 266 | 
 267 |     def parse(self) -> pd.DataFrame:
 268 |         df = _read_table(self.results_tsv).fillna("")
 269 | 
 270 |         filename = _get_first_existing(
 271 |             df, ["filename", "file", "rawfile", "raw_file", "source_file"]
 272 |         )
 273 |         if filename is None:
 274 |             raise ValueError("results.sage.tsv is missing a filename/raw file column.")
 275 |         run_id = filename.astype(str).apply(_basename_wo_ext)
 276 | 
 277 |         scan_id = (
 278 |             _get_first_existing(
 279 |                 df,
 280 |                 ["scannr", "scan", "scan_id", "spectrum_index"],
 281 |                 cast=float,
 282 |                 default=np.nan,
 283 |             )
 284 |             .fillna(1)
 285 |             .astype(int)
 286 |         )
 287 |         hit_rank = (
 288 |             _get_first_existing(df, ["rank", "hit_rank"], cast=float, default=1)
 289 |             .fillna(1)
 290 |             .astype(int)
 291 |         )
 292 |         z = (
 293 |             _get_first_existing(
 294 |                 df, ["precursor_charge", "charge", "z"], cast=float, default=2
 295 |             )
 296 |             .fillna(2)
 297 |             .astype(int)
 298 |         )
 299 | 
 300 |         rt = _get_first_existing(
 301 |             df,
 302 |             ["rt", "retention_time", "retention", "retention_time_sec"],
 303 |             cast=float,
 304 |             default=np.nan,
 305 |         )
 306 |         im = _get_first_existing(
 307 |             df, ["ion_mobility", "mobility", "ccs", "k0"], cast=float, default=np.nan
 308 |         )
 309 |         # If im is all 0s, set to NaN
 310 |         if im.eq(0).all():
 311 |             im = pd.Series([np.nan] * len(df))
 312 | 
 313 |         pep_seq = df["peptide"].astype(str)
 314 |         proteins_raw = _get_first_existing(df, ["proteins", "protein", "protein_id"])
 315 |         proteins_raw = (
 316 |             proteins_raw.astype(str)
 317 |             if proteins_raw is not None
 318 |             else pd.Series([""] * len(df))
 319 |         )
 320 |         protein_ids, gene_ids, num_prot = self._split_accessions_and_entries(
 321 |             proteins_raw
 322 |         )
 323 | 
 324 |         if "label" in df.columns:
 325 |             # decoy detection from label
 326 |             # Sage TSV: label == -1 (decoy), +1 (target)
 327 |             label_series = pd.to_numeric(df["label"], errors="coerce")
 328 |             decoy = label_series == -1
 329 |         elif "is_decoy" in df.columns:
 330 |             # The parquet format uses a boolean is_decoy column
 331 |             decoy = df["is_decoy"]
 332 | 
 333 |         # spectrum-level q-value, peptide-level q-value and protein-level q-value
 334 |         pep = (
 335 |             pd.to_numeric(df["posterior_error"], errors="coerce")
 336 |             if "posterior_error" in df.columns
 337 |             else pd.Series([np.nan] * len(df))
 338 |         )
 339 |         spectrum_q = (
 340 |             pd.to_numeric(df["spectrum_q"], errors="coerce")
 341 |             if "spectrum_q" in df.columns
 342 |             else pd.Series([np.nan] * len(df))
 343 |         )
 344 |         peptide_q = (
 345 |             pd.to_numeric(df["peptide_q"], errors="coerce")
 346 |             if "peptide_q" in df.columns
 347 |             else pd.Series([np.nan] * len(df))
 348 |         )
 349 |         protein_q = (
 350 |             pd.to_numeric(df["protein_q"], errors="coerce")
 351 |             if "protein_q" in df.columns
 352 |             else pd.Series([np.nan] * len(df))
 353 |         )
 354 | 
 355 |         # compute precursor m/z from neurtal mass using the theoretical calculated mass of the peptide.
 356 |         calcmass = _get_first_existing(df, ["calcmass"], cast=float, default=np.nan)
 357 |         prec_mz = pd.Series(np.nan, index=df.index, dtype=float)
 358 |         mask_calc = calcmass.notna() & (z > 0)
 359 |         prec_mz.loc[mask_calc] = (calcmass[mask_calc] + z[mask_calc] * self.PROTON) / z[
 360 |             mask_calc
 361 |         ]
 362 | 
 363 |         ## If we wanted to compute from experimental mass instead:
 364 |         # expmass  = _get_first_existing(df, ['expmass'],  cast=float, default=np.nan)
 365 |         # iso_err = _get_first_existing(
 366 |         #     df, ["isotope_error", "isotope"], cast=float, default=0.0
 367 |         # ).fillna(0.0)
 368 |         # mask_exp = prec_mz.isna() & expmass.notna() & (z > 0)
 369 |         # mz_exp = (expmass[mask_exp] + z[mask_exp] * PROTON) / z[mask_exp]
 370 |         # prec_mz.loc[mask_exp] = mz_exp - (iso_err[mask_exp] * NEUTRON) / z[mask_exp]
 371 | 
 372 |         ## set precision
 373 |         prec_mz = prec_mz.round(self.mz_precision_digits)
 374 | 
 375 |         # modified peptide
 376 |         modpep = pep_seq.apply(self._annotate_unimod)
 377 | 
 378 |         # group id (same style as convert paths)
 379 |         group_id = (
 380 |             run_id
 381 |             + "_"
 382 |             + scan_id.astype(str)
 383 |             + np.where(hit_rank > 1, "_rank" + hit_rank.astype(str), "")
 384 |         )
 385 | 
 386 |         out = pd.DataFrame(
 387 |             {
 388 |                 "run_id": run_id,
 389 |                 "scan_id": scan_id,
 390 |                 "hit_rank": hit_rank,
 391 |                 "massdiff": 0.0,
 392 |                 "precursor_charge": z,
 393 |                 "retention_time": rt,
 394 |                 "ion_mobility": im,
 395 |                 "peptide_sequence": pep_seq.str.replace(
 396 |                     r"\[[-+0-9.]+\]", "", regex=True
 397 |                 ),
 398 |                 "protein_id": protein_ids.fillna(""),
 399 |                 "gene_id": gene_ids.fillna(""),
 400 |                 "num_tot_proteins": num_prot.fillna(0).astype(int),
 401 |                 "decoy": decoy.astype(bool),
 402 |                 "modified_peptide": modpep,
 403 |                 "group_id": group_id,
 404 |                 "precursor_mz": prec_mz,
 405 |                 "pep": pep,
 406 |                 "q_value": spectrum_q,
 407 |                 "peptide_q": peptide_q,
 408 |                 "protein_q": protein_q,
 409 |             }
 410 |         )
 411 |         return out
 412 | 
 413 |     def parse_df(
 414 |         self, df: pd.DataFrame, psm_id_series: Optional[pd.Series] = None
 415 |     ) -> pd.DataFrame:
 416 |         """
 417 |         Parse a provided DataFrame slice (same logic as `parse` but works on an
 418 |         already-loaded DataFrame). This is useful for chunked/streaming flows.
 419 | 
 420 |         If `psm_id_series` is provided it will be attached to the returned
 421 |         DataFrame as a `psm_id` column (preserving positional alignment).
 422 |         """
 423 |         df = df.fillna("")
 424 | 
 425 |         filename = _get_first_existing(
 426 |             df, ["filename", "file", "rawfile", "raw_file", "source_file"]
 427 |         )
 428 |         if filename is None:
 429 |             raise ValueError("results.sage.tsv is missing a filename/raw file column.")
 430 |         run_id = filename.astype(str).apply(_basename_wo_ext)
 431 | 
 432 |         scan_id = (
 433 |             _get_first_existing(
 434 |                 df,
 435 |                 ["scannr", "scan", "scan_id", "spectrum_index"],
 436 |                 cast=float,
 437 |                 default=np.nan,
 438 |             )
 439 |             .fillna(1)
 440 |             .astype(int)
 441 |         )
 442 |         hit_rank = (
 443 |             _get_first_existing(df, ["rank", "hit_rank"], cast=float, default=1)
 444 |             .fillna(1)
 445 |             .astype(int)
 446 |         )
 447 |         z = (
 448 |             _get_first_existing(
 449 |                 df, ["precursor_charge", "charge", "z"], cast=float, default=2
 450 |             )
 451 |             .fillna(2)
 452 |             .astype(int)
 453 |         )
 454 | 
 455 |         rt = _get_first_existing(
 456 |             df,
 457 |             ["rt", "retention_time", "retention", "retention_time_sec"],
 458 |             cast=float,
 459 |             default=np.nan,
 460 |         )
 461 |         im = _get_first_existing(
 462 |             df, ["ion_mobility", "mobility", "ccs", "k0"], cast=float, default=np.nan
 463 |         )
 464 |         # If im is all 0s, set to NaN
 465 |         if im.eq(0).all():
 466 |             im = pd.Series([np.nan] * len(df))
 467 | 
 468 |         pep_seq = df["peptide"].astype(str)
 469 |         proteins_raw = _get_first_existing(df, ["proteins", "protein", "protein_id"])
 470 |         proteins_raw = (
 471 |             proteins_raw.astype(str)
 472 |             if proteins_raw is not None
 473 |             else pd.Series([""] * len(df))
 474 |         )
 475 |         protein_ids, gene_ids, num_prot = self._split_accessions_and_entries(
 476 |             proteins_raw
 477 |         )
 478 | 
 479 |         if "label" in df.columns:
 480 |             label_series = pd.to_numeric(df["label"], errors="coerce")
 481 |             decoy = label_series == -1
 482 |         elif "is_decoy" in df.columns:
 483 |             decoy = df["is_decoy"]
 484 |         else:
 485 |             decoy = pd.Series([False] * len(df))
 486 | 
 487 |         pep = (
 488 |             pd.to_numeric(df["posterior_error"], errors="coerce")
 489 |             if "posterior_error" in df.columns
 490 |             else pd.Series([np.nan] * len(df))
 491 |         )
 492 |         spectrum_q = (
 493 |             pd.to_numeric(df["spectrum_q"], errors="coerce")
 494 |             if "spectrum_q" in df.columns
 495 |             else pd.Series([np.nan] * len(df))
 496 |         )
 497 |         peptide_q = (
 498 |             pd.to_numeric(df["peptide_q"], errors="coerce")
 499 |             if "peptide_q" in df.columns
 500 |             else pd.Series([np.nan] * len(df))
 501 |         )
 502 |         protein_q = (
 503 |             pd.to_numeric(df["protein_q"], errors="coerce")
 504 |             if "protein_q" in df.columns
 505 |             else pd.Series([np.nan] * len(df))
 506 |         )
 507 | 
 508 |         calcmass = _get_first_existing(df, ["calcmass"], cast=float, default=np.nan)
 509 |         prec_mz = pd.Series(np.nan, index=df.index, dtype=float)
 510 |         mask_calc = calcmass.notna() & (z > 0)
 511 |         prec_mz.loc[mask_calc] = (calcmass[mask_calc] + z[mask_calc] * self.PROTON) / z[
 512 |             mask_calc
 513 |         ]
 514 |         prec_mz = prec_mz.round(self.mz_precision_digits)
 515 | 
 516 |         modpep = pep_seq.apply(self._annotate_unimod)
 517 | 
 518 |         group_id = (
 519 |             run_id
 520 |             + "_"
 521 |             + scan_id.astype(str)
 522 |             + np.where(hit_rank > 1, "_rank" + hit_rank.astype(str), "")
 523 |         )
 524 | 
 525 |         out = pd.DataFrame(
 526 |             {
 527 |                 "run_id": run_id,
 528 |                 "scan_id": scan_id,
 529 |                 "hit_rank": hit_rank,
 530 |                 "massdiff": 0.0,
 531 |                 "precursor_charge": z,
 532 |                 "retention_time": rt,
 533 |                 "ion_mobility": im,
 534 |                 "peptide_sequence": pep_seq.str.replace(
 535 |                     r"\[[-+0-9.]+\]", "", regex=True
 536 |                 ),
 537 |                 "protein_id": protein_ids.fillna(""),
 538 |                 "gene_id": gene_ids.fillna(""),
 539 |                 "num_tot_proteins": num_prot.fillna(0).astype(int),
 540 |                 "decoy": decoy.astype(bool),
 541 |                 "modified_peptide": modpep,
 542 |                 "group_id": group_id,
 543 |                 "precursor_mz": prec_mz,
 544 |                 "pep": pep,
 545 |                 "q_value": spectrum_q,
 546 |                 "peptide_q": peptide_q,
 547 |                 "protein_q": protein_q,
 548 |             }
 549 |         )
 550 | 
 551 |         if psm_id_series is not None:
 552 |             out = out.reset_index(drop=True)
 553 |             out["psm_id"] = psm_id_series.astype(str).str.strip().reset_index(drop=True)
 554 | 
 555 |         return out
 556 | 
 557 | 
 558 | class SageFragmentParser:
 559 |     """
 560 |     Parse matched_fragments.sage.tsv to EasyPQP 'peaks' table:
 561 |       columns: scan_id, modified_peptide, precursor_charge, precursor_mz, fragment, product_mz, intensity
 562 |     """
 563 | 
 564 |     def __init__(self, frags_tsv: str, mz_precision_digits: int = 6):
 565 |         self.frags_tsv = frags_tsv
 566 |         self.mz_precision_digits = mz_precision_digits
 567 | 
 568 |     @staticmethod
 569 |     def _ann(ftype: str, ord_: int, z: int) -> str:
 570 |         return f"{ftype}{ord_}^{z}"
 571 | 
 572 |     def parse(self, psms_with_psmid: pd.DataFrame) -> pd.DataFrame:
 573 |         fr = _read_table(self.frags_tsv).fillna("")
 574 | 
 575 |         for c in [
 576 |             "psm_id",
 577 |             "fragment_ordinals",
 578 |             "fragment_charge",
 579 |             "fragment_mz_calculated",
 580 |             "fragment_mz_experimental",
 581 |             "fragment_intensity",
 582 |         ]:
 583 |             if c in fr.columns:
 584 |                 fr[c] = pd.to_numeric(fr[c], errors="coerce")
 585 |         if "psm_id" not in fr.columns:
 586 |             raise ValueError(
 587 |                 "matched_fragments.sage.tsv must contain a 'psm_id' column."
 588 |             )
 589 | 
 590 |         fr["psm_id"] = fr["psm_id"].astype(str).str.strip()
 591 | 
 592 |         fr["fragment"] = fr.apply(
 593 |             lambda r: self._ann(
 594 |                 str(r["fragment_type"]),
 595 |                 int(r["fragment_ordinals"]),
 596 |                 int(r["fragment_charge"]),
 597 |             ),
 598 |             axis=1,
 599 |         )
 600 |         fr["product_mz"] = fr["fragment_mz_calculated"]
 601 | 
 602 |         # join to PSMs
 603 |         join_cols = [
 604 |             "psm_id",
 605 |             "scan_id",
 606 |             "modified_peptide",
 607 |             "precursor_mz",
 608 |             "precursor_charge",
 609 |             "run_id",
 610 |         ]
 611 |         j = fr.merge(psms_with_psmid[join_cols], on="psm_id", how="inner")
 612 | 
 613 |         peaks = j[
 614 |             [
 615 |                 "run_id",
 616 |                 "scan_id",
 617 |                 "modified_peptide",
 618 |                 "precursor_charge",
 619 |                 "precursor_mz",
 620 |                 "fragment",
 621 |                 "product_mz",
 622 |                 "fragment_intensity",
 623 |             ]
 624 |         ].copy()
 625 |         peaks.rename(columns={"fragment_intensity": "intensity"}, inplace=True)
 626 | 
 627 |         # per-PSM normalization to 10,000 (matches convert paths)
 628 |         peaks["intensity"] = peaks["intensity"].fillna(0.0)
 629 |         grp = peaks.groupby(
 630 |             ["run_id", "scan_id", "modified_peptide", "precursor_charge"], dropna=False
 631 |         )["intensity"]
 632 |         denom = grp.transform(lambda x: np.nanmax(x.values) if len(x) else np.nan)
 633 |         peaks["intensity"] = (peaks["intensity"] / denom) * 10000.0
 634 |         peaks["intensity"] = peaks["intensity"].fillna(0.0)
 635 | 
 636 |         # round and de-duplicate (keep most intense per exact fragment/product_mz)
 637 |         peaks["product_mz"] = peaks["product_mz"].round(self.mz_precision_digits)
 638 |         peaks["precursor_mz"] = peaks["precursor_mz"].round(self.mz_precision_digits)
 639 |         peaks["intensity"] = peaks["intensity"].round(self.mz_precision_digits)
 640 | 
 641 |         peaks = peaks.groupby(
 642 |             [
 643 |                 "run_id",
 644 |                 "scan_id",
 645 |                 "modified_peptide",
 646 |                 "precursor_charge",
 647 |                 "precursor_mz",
 648 |                 "fragment",
 649 |                 "product_mz",
 650 |             ],
 651 |             as_index=False,
 652 |         )["intensity"].max()
 653 |         return peaks
 654 | 
 655 |     def parse_df(self, fr: pd.DataFrame, psms_with_psmid: pd.DataFrame) -> pd.DataFrame:
 656 |         """
 657 |         Parse a fragments DataFrame (filtered to relevant rows) and join to the
 658 |         provided PSM DataFrame. This mirrors `parse` but operates on in-memory
 659 |         DataFrames to support streaming.
 660 |         """
 661 |         fr = fr.fillna("")
 662 |         for c in [
 663 |             "psm_id",
 664 |             "fragment_ordinals",
 665 |             "fragment_charge",
 666 |             "fragment_mz_calculated",
 667 |             "fragment_mz_experimental",
 668 |             "fragment_intensity",
 669 |         ]:
 670 |             if c in fr.columns:
 671 |                 fr[c] = pd.to_numeric(fr[c], errors="coerce")
 672 |         if "psm_id" not in fr.columns:
 673 |             raise ValueError(
 674 |                 "matched_fragments.sage.tsv must contain a 'psm_id' column."
 675 |             )
 676 | 
 677 |         fr["psm_id"] = fr["psm_id"].astype(str).str.strip()
 678 | 
 679 |         fr["fragment"] = fr.apply(
 680 |             lambda r: self._ann(
 681 |                 str(r["fragment_type"]),
 682 |                 int(r["fragment_ordinals"]),
 683 |                 int(r["fragment_charge"]),
 684 |             ),
 685 |             axis=1,
 686 |         )
 687 |         fr["product_mz"] = fr["fragment_mz_calculated"]
 688 | 
 689 |         join_cols = [
 690 |             "psm_id",
 691 |             "scan_id",
 692 |             "modified_peptide",
 693 |             "precursor_mz",
 694 |             "precursor_charge",
 695 |             "run_id",
 696 |         ]
 697 |         j = fr.merge(psms_with_psmid[join_cols], on="psm_id", how="inner")
 698 | 
 699 |         peaks = j[
 700 |             [
 701 |                 "run_id",
 702 |                 "scan_id",
 703 |                 "modified_peptide",
 704 |                 "precursor_charge",
 705 |                 "precursor_mz",
 706 |                 "fragment",
 707 |                 "product_mz",
 708 |                 "fragment_intensity",
 709 |             ]
 710 |         ].copy()
 711 |         peaks.rename(columns={"fragment_intensity": "intensity"}, inplace=True)
 712 | 
 713 |         peaks["intensity"] = peaks["intensity"].fillna(0.0)
 714 |         grp = peaks.groupby(
 715 |             ["run_id", "scan_id", "modified_peptide", "precursor_charge"], dropna=False
 716 |         )["intensity"]
 717 |         denom = grp.transform(lambda x: np.nanmax(x.values) if len(x) else np.nan)
 718 |         peaks["intensity"] = (peaks["intensity"] / denom) * 10000.0
 719 |         peaks["intensity"] = peaks["intensity"].fillna(0.0)
 720 | 
 721 |         peaks["product_mz"] = peaks["product_mz"].round(self.mz_precision_digits)
 722 |         peaks["precursor_mz"] = peaks["precursor_mz"].round(self.mz_precision_digits)
 723 |         peaks["intensity"] = peaks["intensity"].round(self.mz_precision_digits)
 724 | 
 725 |         peaks = peaks.groupby(
 726 |             [
 727 |                 "run_id",
 728 |                 "scan_id",
 729 |                 "modified_peptide",
 730 |                 "precursor_charge",
 731 |                 "precursor_mz",
 732 |                 "fragment",
 733 |                 "product_mz",
 734 |             ],
 735 |             as_index=False,
 736 |         )["intensity"].max()
 737 |         return peaks
 738 | 
 739 | 
 740 | def convert_sage(
 741 |     results_tsv: str,
 742 |     fragments_tsv: str,
 743 |     unimod_xml: Optional[str],
 744 |     max_delta_unimod: float = 0.02,
 745 |     mz_precision_digits: int = 6,
 746 |     *,
 747 |     force_streaming: Optional[bool] = None,
 748 |     streaming_threshold_bytes: int = 1_000_000_000,
 749 | ) -> Optional[List[str]]:
 750 |     """
 751 |     High-level conversion: Sage TSV/Parquet to EasyPQP PSM and peaks pickles written to disk.
 752 |     """
 753 |     # Auto-switch to streaming mode when inputs are very large, unless caller
 754 |     # explicitly requested non-streaming via force_streaming=False.
 755 |     try:
 756 |         if force_streaming is None:
 757 |             # determine combined size (fall back to streaming if either file is missing)
 758 |             try:
 759 |                 rsize = os.path.getsize(results_tsv)
 760 |             except Exception:
 761 |                 rsize = 0
 762 |             try:
 763 |                 fsize = os.path.getsize(fragments_tsv)
 764 |             except Exception:
 765 |                 fsize = 0
 766 |             use_stream = (rsize + fsize) >= streaming_threshold_bytes
 767 |         else:
 768 |             use_stream = bool(force_streaming)
 769 |     except Exception:
 770 |         use_stream = False
 771 | 
 772 |     if use_stream:
 773 |         timestamped_echo("Info: Using streaming Sage conversion for low memory usage")
 774 |         return convert_sage_streaming(
 775 |             results_tsv,
 776 |             fragments_tsv,
 777 |             unimod_xml,
 778 |             max_delta_unimod=max_delta_unimod,
 779 |             mz_precision_digits=mz_precision_digits,
 780 |         )
 781 | 
 782 |     # Read raw to extract psm_id for joining
 783 |     timestamped_echo("Info: Reading Sage PSMs")
 784 |     raw_res = _read_table(results_tsv)
 785 |     if "psm_id" not in raw_res.columns:
 786 |         raise ValueError(
 787 |             "results.sage.tsv must contain a 'psm_id' for joining with matched fragments."
 788 |         )
 789 | 
 790 |     raw_res["psm_id"] = raw_res["psm_id"].astype(str).str.strip()
 791 | 
 792 |     psms = SagePSMParser(
 793 |         results_tsv, unimod_xml, max_delta_unimod, mz_precision_digits
 794 |     ).parse()
 795 |     psms = raw_res[["psm_id"]].join(psms)
 796 | 
 797 |     if psms.empty:
 798 |         raise ValueError("No PSMs were parsed from the provided results.sage.tsv file.")
 799 | 
 800 |     timestamped_echo("Info: Reading Sage matched fragment peaks")
 801 |     peaks = SageFragmentParser(fragments_tsv, mz_precision_digits).parse(psms)
 802 | 
 803 |     if peaks.empty:
 804 |         raise ValueError(
 805 |             "No fragment peaks were parsed from the provided matched_fragments.sage.tsv file."
 806 |         )
 807 | 
 808 |     # Trim to minimal schema expected by library.generate
 809 |     keep = [
 810 |         "run_id",
 811 |         "scan_id",
 812 |         "hit_rank",
 813 |         "massdiff",
 814 |         "precursor_charge",
 815 |         "retention_time",
 816 |         "ion_mobility",
 817 |         "peptide_sequence",
 818 |         "protein_id",
 819 |         "gene_id",
 820 |         "num_tot_proteins",
 821 |         "decoy",
 822 |         "modified_peptide",
 823 |         "group_id",
 824 |         "pep",
 825 |         "q_value",
 826 |         "peptide_q",
 827 |         "protein_q",
 828 |     ]
 829 |     psms_export = psms[keep].copy()
 830 | 
 831 |     runs = sorted(psms_export["run_id"].dropna().unique().tolist())
 832 |     new_infiles = []
 833 |     for run in runs:
 834 |         psms_r = psms_export.loc[psms_export["run_id"] == run]
 835 |         peaks_r = (
 836 |             peaks.loc[peaks["run_id"] == run] if "run_id" in peaks.columns else peaks
 837 |         )
 838 | 
 839 |         if psms_r.empty or peaks_r.empty:
 840 |             timestamped_echo(
 841 |                 f"Info: Skipping run {run}: psms={len(psms_r)}, peaks={len(peaks_r)}"
 842 |             )
 843 |             continue
 844 | 
 845 |         psmpkl = f"{run}.psmpkl"
 846 |         peakpkl = f"{run}.peakpkl"
 847 |         psms_r.to_pickle(psmpkl)
 848 |         peaks_r.to_pickle(peakpkl)
 849 |         timestamped_echo(f"Info: Wrote {psmpkl} and {peakpkl}")
 850 |         new_infiles.extend([psmpkl, peakpkl])
 851 | 
 852 |     if len(new_infiles) == 0:
 853 |         # click may not be available in all contexts; raise a generic error here
 854 |         raise RuntimeError("No non-empty runs detected after Sage conversion.")
 855 | 
 856 | 
 857 | def convert_sage_streaming(
 858 |     results_tsv: str,
 859 |     fragments_tsv: str,
 860 |     unimod_xml: Optional[str],
 861 |     max_delta_unimod: float = 0.02,
 862 |     mz_precision_digits: int = 6,
 863 |     chunksize: int = 800_000,
 864 |     tmpdir: Optional[str] = None,
 865 | ) -> List[str]:
 866 |     """
 867 |     Memory-efficient streaming conversion that processes one run at a time.
 868 | 
 869 |     PSMs are held in memory (they're relatively small); the *fragments* file is
 870 |     streamed twice per run:
 871 |       - pass 1: compute per-PSM-group max intensity (denom)
 872 |       - pass 2: normalize and aggregate peaks
 873 | 
 874 |     This is logically equivalent to the non-streaming convert_sage().
 875 |     """
 876 |     import tempfile
 877 | 
 878 |     # --- 1) Load all PSMs once (as in non-streaming convert_sage) ---
 879 |     timestamped_echo("Info: [streaming] Reading Sage PSMs (full table in memory)")
 880 | 
 881 |     raw_res = _read_table(results_tsv)
 882 |     if "psm_id" not in raw_res.columns:
 883 |         raise ValueError(
 884 |             "results.sage.tsv must contain a 'psm_id' for joining with matched fragments."
 885 |         )
 886 |     raw_res["psm_id"] = raw_res["psm_id"].astype(str).str.strip()
 887 | 
 888 |     parser = SagePSMParser(
 889 |         results_tsv, unimod_xml, max_delta_unimod, mz_precision_digits
 890 |     )
 891 |     psms_parsed = parser.parse()
 892 |     # align on index, same as non-streaming convert_sage
 893 |     psms_all = raw_res[["psm_id"]].join(psms_parsed)
 894 | 
 895 |     if psms_all.empty:
 896 |         raise ValueError("No PSMs were parsed from the provided results.sage.tsv file.")
 897 | 
 898 |     # Runs defined exactly as in non-streaming convert_sage
 899 |     runs = sorted(psms_all["run_id"].dropna().unique().tolist())
 900 |     timestamped_echo(f"Info: [streaming] Discovered {len(runs)} runs")
 901 | 
 902 |     outfiles: List[str] = []
 903 |     tmpdir = tmpdir or tempfile.mkdtemp(prefix="easypqp_sage_")
 904 | 
 905 |     # --- 2) Process one run at a time ---
 906 |     for run in runs:
 907 |         timestamped_echo(f"Info: [streaming] Processing run {run}")
 908 | 
 909 |         psms_run = psms_all.loc[psms_all["run_id"] == run].copy()
 910 |         if psms_run.empty:
 911 |             timestamped_echo(f"Info: Skipping run {run}: no PSMs")
 912 |             continue
 913 | 
 914 |         # Limit to columns needed for joins / normalization
 915 |         norm_cols = psms_run[
 916 |             [
 917 |                 "psm_id",
 918 |                 "run_id",
 919 |                 "scan_id",
 920 |                 "modified_peptide",
 921 |                 "precursor_charge",
 922 |                 "precursor_mz",
 923 |             ]
 924 |         ].copy()
 925 |         norm_cols["psm_id"] = norm_cols["psm_id"].astype(str).str.strip()
 926 | 
 927 |         run_psm_ids = set(norm_cols["psm_id"])
 928 | 
 929 |         # --- PASS 1: build denom_map from joined fragments ---
 930 |         denom_map: Dict[str, float] = {}
 931 | 
 932 |         first_pass_matches = 0
 933 |         for fr_chunk in pd.read_csv(
 934 |             fragments_tsv, sep="\t", dtype=str, chunksize=chunksize
 935 |         ):
 936 |             if "psm_id" not in fr_chunk.columns:
 937 |                 continue
 938 | 
 939 |             fr_chunk["psm_id"] = fr_chunk["psm_id"].astype(str).str.strip()
 940 |             mask = fr_chunk["psm_id"].isin(run_psm_ids)
 941 |             if not mask.any():
 942 |                 continue
 943 | 
 944 |             sub = fr_chunk.loc[mask].copy()
 945 |             first_pass_matches += int(mask.sum())
 946 | 
 947 |             # convert intensity to numeric
 948 |             if "fragment_intensity" not in sub.columns:
 949 |                 continue
 950 |             sub["fragment_intensity"] = pd.to_numeric(
 951 |                 sub["fragment_intensity"], errors="coerce"
 952 |             ).fillna(0.0)
 953 | 
 954 |             # join to get run_id / scan_id / modified_peptide / charge
 955 |             j = sub.merge(norm_cols, on="psm_id", how="inner")
 956 |             if j.empty:
 957 |                 continue
 958 | 
 959 |             # group key: exactly matches non-streaming normalization groups
 960 |             j["group_key"] = (
 961 |                 j["run_id"].astype(str)
 962 |                 + "||"
 963 |                 + j["scan_id"].astype(str)
 964 |                 + "||"
 965 |                 + j["modified_peptide"].astype(str)
 966 |                 + "||"
 967 |                 + j["precursor_charge"].astype(str)
 968 |             )
 969 | 
 970 |             gb = j.groupby("group_key")["fragment_intensity"].max()
 971 |             for k, mx in gb.items():
 972 |                 prev = denom_map.get(k)
 973 |                 if prev is None or mx > prev:
 974 |                     denom_map[k] = float(mx)
 975 | 
 976 |         timestamped_echo(
 977 |             f"Info: [streaming] Run {run}: PASS1 matched {first_pass_matches} fragment rows; "
 978 |             f"{len(denom_map)} normalization groups"
 979 |         )
 980 | 
 981 |         if not denom_map:
 982 |             timestamped_echo(f"Info: Skipping run {run}: no fragment peaks")
 983 |             continue
 984 | 
 985 |         # --- PASS 2: normalize intensities and build peaks_run ---
 986 |         peaks_parts: List[pd.DataFrame] = []
 987 |         second_pass_matches = 0
 988 | 
 989 |         join_cols = [
 990 |             "psm_id",
 991 |             "scan_id",
 992 |             "modified_peptide",
 993 |             "precursor_mz",
 994 |             "precursor_charge",
 995 |             "run_id",
 996 |         ]
 997 |         join_psms = psms_run[join_cols].copy()
 998 |         join_psms["psm_id"] = join_psms["psm_id"].astype(str).str.strip()
 999 | 
1000 |         for fr_chunk in pd.read_csv(
1001 |             fragments_tsv, sep="\t", dtype=str, chunksize=chunksize
1002 |         ):
1003 |             if "psm_id" not in fr_chunk.columns:
1004 |                 continue
1005 | 
1006 |             fr_chunk["psm_id"] = fr_chunk["psm_id"].astype(str).str.strip()
1007 |             mask = fr_chunk["psm_id"].isin(run_psm_ids)
1008 |             if not mask.any():
1009 |                 continue
1010 | 
1011 |             sub = fr_chunk.loc[mask].copy()
1012 |             second_pass_matches += int(mask.sum())
1013 | 
1014 |             # numeric conversions
1015 |             for c in [
1016 |                 "fragment_ordinals",
1017 |                 "fragment_charge",
1018 |                 "fragment_mz_calculated",
1019 |                 "fragment_mz_experimental",
1020 |                 "fragment_intensity",
1021 |             ]:
1022 |                 if c in sub.columns:
1023 |                     sub[c] = pd.to_numeric(sub[c], errors="coerce")
1024 | 
1025 |             # annotate fragment + product_mz
1026 |             sub["fragment"] = sub.apply(
1027 |                 lambda r: SageFragmentParser._ann(
1028 |                     str(r.get("fragment_type", "")),
1029 |                     int(r.get("fragment_ordinals", 0) or 0),
1030 |                     int(r.get("fragment_charge", 0) or 0),
1031 |                 ),
1032 |                 axis=1,
1033 |             )
1034 |             sub["product_mz"] = sub.get("fragment_mz_calculated")
1035 | 
1036 |             # join to PSMs for this run
1037 |             j = sub.merge(join_psms, on="psm_id", how="inner")
1038 |             if j.empty:
1039 |                 continue
1040 | 
1041 |             peaks = j[
1042 |                 [
1043 |                     "run_id",
1044 |                     "scan_id",
1045 |                     "modified_peptide",
1046 |                     "precursor_charge",
1047 |                     "precursor_mz",
1048 |                     "fragment",
1049 |                     "product_mz",
1050 |                     "fragment_intensity",
1051 |                 ]
1052 |             ].copy()
1053 |             peaks.rename(columns={"fragment_intensity": "intensity"}, inplace=True)
1054 | 
1055 |             # normalize using denom_map
1056 |             peaks["intensity"] = pd.to_numeric(
1057 |                 peaks["intensity"], errors="coerce"
1058 |             ).fillna(0.0)
1059 | 
1060 |             peaks["group_key"] = (
1061 |                 peaks["run_id"].astype(str)
1062 |                 + "||"
1063 |                 + peaks["scan_id"].astype(str)
1064 |                 + "||"
1065 |                 + peaks["modified_peptide"].astype(str)
1066 |                 + "||"
1067 |                 + peaks["precursor_charge"].astype(str)
1068 |             )
1069 |             peaks["denom"] = peaks["group_key"].map(lambda x: denom_map.get(x, np.nan))
1070 |             peaks["intensity"] = (peaks["intensity"] / peaks["denom"]) * 10000.0
1071 |             peaks["intensity"] = peaks["intensity"].fillna(0.0)
1072 |             peaks.drop(columns=["denom", "group_key"], inplace=True)
1073 | 
1074 |             # round like non-streaming
1075 |             peaks["product_mz"] = peaks["product_mz"].round(mz_precision_digits)
1076 |             peaks["precursor_mz"] = peaks["precursor_mz"].round(mz_precision_digits)
1077 |             peaks["intensity"] = peaks["intensity"].round(mz_precision_digits)
1078 | 
1079 |             peaks_parts.append(peaks)
1080 | 
1081 |         timestamped_echo(
1082 |             f"Info: [streaming] Run {run}: PASS2 matched {second_pass_matches} fragment rows"
1083 |         )
1084 | 
1085 |         if not peaks_parts:
1086 |             timestamped_echo(
1087 |                 f"Info: Skipping run {run}: no fragment peaks after processing"
1088 |             )
1089 |             continue
1090 | 
1091 |         peaks_run = pd.concat(peaks_parts, ignore_index=True)
1092 | 
1093 |         # Final aggregation: identical grouping keys to non-streaming parse()
1094 |         peaks_run = peaks_run.groupby(
1095 |             [
1096 |                 "run_id",
1097 |                 "scan_id",
1098 |                 "modified_peptide",
1099 |                 "precursor_charge",
1100 |                 "precursor_mz",
1101 |                 "fragment",
1102 |                 "product_mz",
1103 |             ],
1104 |             as_index=False,
1105 |         )["intensity"].max()
1106 | 
1107 |         # --- PSM export (same schema as non-streaming convert_sage) ---
1108 |         keep = [
1109 |             "run_id",
1110 |             "scan_id",
1111 |             "hit_rank",
1112 |             "massdiff",
1113 |             "precursor_charge",
1114 |             "retention_time",
1115 |             "ion_mobility",
1116 |             "peptide_sequence",
1117 |             "protein_id",
1118 |             "gene_id",
1119 |             "num_tot_proteins",
1120 |             "decoy",
1121 |             "modified_peptide",
1122 |             "group_id",
1123 |             "pep",
1124 |             "q_value",
1125 |             "peptide_q",
1126 |             "protein_q",
1127 |         ]
1128 |         psms_export = psms_run[keep].copy()
1129 | 
1130 |         # Optional: dedup by group_id if you really want it,
1131 |         # but to be 1:1 with old convert_sage, you can *omit* this.
1132 |         # psms_export = psms_export.drop_duplicates(subset=["group_id"]).reset_index(drop=True)
1133 | 
1134 |         if psms_export.empty or peaks_run.empty:
1135 |             timestamped_echo(
1136 |                 f"Info: Skipping run {run}: psms={len(psms_export)}, peaks={len(peaks_run)}"
1137 |             )
1138 |             continue
1139 | 
1140 |         psmpkl = f"{run}.psmpkl"
1141 |         peakpkl = f"{run}.peakpkl"
1142 |         psms_export.to_pickle(psmpkl)
1143 |         peaks_run.to_pickle(peakpkl)
1144 | 
1145 |         timestamped_echo(
1146 |             f"Info: [streaming] Wrote {psmpkl} (n_psms={len(psms_export)}) "
1147 |             f"and {peakpkl} (n_peaks={len(peaks_run)})"
1148 |         )
1149 |         outfiles.extend([psmpkl, peakpkl])
1150 | 
1151 |     if not outfiles:
1152 |         raise RuntimeError(
1153 |             "No non-empty runs detected after Sage streaming conversion."
1154 |         )
1155 | 
1156 |     return outfiles
1157 | 


--------------------------------------------------------------------------------