├── tests ├── data │ ├── library_targets.pqp │ ├── test_transition_list.pqp │ ├── Q99536.fasta │ ├── results.sage.tsv │ ├── matched_fragments.sage.tsv │ ├── config.json │ └── test_transition_list.tsv ├── test_generate_ionseries.py ├── _regtest_outputs │ ├── test_convertsage.test_convertsage.out │ ├── test_insilico_library.test_insilico_library.out │ ├── test_targetedfileconverter.test_targeted_file_converter_tsvtopqp.out │ ├── test_openswathassay_generator.test_openswath_assay_generator.out │ ├── test_targetedfileconverter.test_targeted_file_converter_pqptotsv.out │ └── test_openswathdecoy_generator.test_openswath_decoy_generator.out ├── test_convert.py ├── README.md ├── test_openswathdecoy_generator.py ├── test_openswathassay_generator.py ├── test_targetedfileconverter.py ├── test_convertsage.py └── test_insilico_library.py ├── easypqp ├── __init__.py ├── util.py ├── unimoddb.py ├── openswathdecoygenerator.py ├── targetedfileconverter.py ├── openswathassaygenerator.py ├── library.py └── sage.py ├── CONTRIBUTING.md ├── Dockerfile ├── .github └── workflows │ ├── dockerpublish.yml │ ├── ci.yml │ ├── pythonpublish.yml │ └── changelog.yml ├── pyproject.toml ├── LICENSE ├── .gitignore ├── CHANGELOG.md ├── requirements.txt └── README.md /tests/data/library_targets.pqp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grosenberger/easypqp/HEAD/tests/data/library_targets.pqp -------------------------------------------------------------------------------- /tests/data/test_transition_list.pqp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grosenberger/easypqp/HEAD/tests/data/test_transition_list.pqp -------------------------------------------------------------------------------- /tests/test_generate_ionseries.py: -------------------------------------------------------------------------------- 1 | from easypqp.convert import generate_ionseries 2 | 3 | print(generate_ionseries(".(UniMod:1)ADQLTEEQIAEFK", 2)) 4 | -------------------------------------------------------------------------------- /easypqp/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # Global variable for location of packaged unimod.xml file 4 | location = os.path.dirname(os.path.realpath(__file__)) 5 | pkg_unimod_db = os.path.join(location, 'data', 'unimod.xml') -------------------------------------------------------------------------------- /tests/_regtest_outputs/test_convertsage.test_convertsage.out: -------------------------------------------------------------------------------- 1 | Info: Converting Sage inputs: results.sage.tsv + matched_fragments.sage.tsv 2 | Info: Reading Sage PSMs 3 | Info: Reading Sage matched fragment peaks 4 | Info: Wrote LQSRPAAPPAPGPGQLTLR.psmpkl and LQSRPAAPPAPGPGQLTLR.peakpkl 5 | Info: Total elapsed time 0.00 minutes. 
6 | 
--------------------------------------------------------------------------------
/tests/data/Q99536.fasta:
--------------------------------------------------------------------------------
1 | >sp|Q99536|VAT1_HUMAN Synaptic vesicle membrane protein VAT-1 homolog OS=Homo sapiens OX=9606 GN=VAT1 PE=1 SV=2
2 | MSDEREVAEAATGEDASSPPPKTEAASDPQHPAASEGAAAAAASPPLLRCLVLTGFGGYD
3 | KVKLQSRPAAPPAPGPGQLTLRLRACGLNFADLMARQGLYDRLPPLPVTPGMEGAGVVIA
4 | VGEGVSDRKAGDRVMVLNRSGMWQEEVTVPSVQTFLIPEAMTFEEAAALLVNYITAYMVL
5 | FDFGNLQPGHSVLVHMAAGGVGMAAVQLCRTVENVTVFGTASASKHEALKENGVTHPIDY
6 | HTTDYVDEIKKISPKGVDIVMDPLGGSDTAKGYNLLKPMGKVVTYGMANLLTGPKRNLMA
7 | LARTWWNQFSVTALQLLQANRAVCGFHLGYLDGEVELVSGVVARLLALYNQGHIKPHIDS
8 | VWPFEKVADAMKQMQEKKNVGKVLLVPGPEKEN
9 | 
--------------------------------------------------------------------------------
/tests/data/results.sage.tsv:
--------------------------------------------------------------------------------
1 | psm_id peptide proteins num_proteins filename scannr rank label expmass calcmass charge peptide_len missed_cleavages semi_enzymatic isotope_error precursor_ppm fragment_ppm hyperscore delta_next delta_best rt aligned_rt predicted_rt delta_rt_model ion_mobility predicted_mobility delta_mobility matched_peaks longest_b longest_y longest_y_pct matched_intensity_pct scored_candidates poisson sage_discriminant_score posterior_error spectrum_q peptide_q protein_q ms2_intensity
2 | 1 LQSRPAAPPAPGPGQLTLR sp|Q99536|VAT1_HUMAN 1 LQSRPAAPPAPGPGQLTLR.mzML controllerType=0 controllerNumber=1 scan=30069 1 1 1926.0815 1926.08 3 19 0 0 0.0 0.8239083 0.503857 72.26591573806016 72.26591573806016 0.0 108.2854 0.993444 0.0 0.993444 0.0 0.0 0.0 22 9 12 0.6315789 64.770966 1 -1.9562811911083433 1.2944585 1.0 1.0 1.0 1.0 72609170.0
3 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | ## Commit Message Convention
2 | 
3 | We follow [Conventional Commits](https://www.conventionalcommits.org/):
4 | 
5 | ```
6 | <type>(<scope>): <description>
7 | 
8 | [optional body]
9 | 
10 | [optional footer(s)]
11 | ```
12 | 
13 | ### Types:
14 | - `feat`: New feature
15 | - `fix`: Bug fix
16 | - `docs`: Documentation changes
17 | - `style`: Code style changes (formatting, etc.)
18 | - `refactor`: Code refactoring
19 | - `perf`: Performance improvements
20 | - `test`: Adding or updating tests
21 | - `chore`: Maintenance tasks
22 | 
23 | ### Examples:
24 | ```bash
25 | git commit -m "feat: add support for DuckDB backend"
26 | git commit -m "fix: resolve memory leak in scoring module"
27 | git commit -m "docs: update installation instructions"
28 | git commit -m "chore: update dependencies to latest versions"
29 | ```
30 | 
31 | ### Breaking Changes:
32 | ```bash
33 | git commit -m "feat!: remove deprecated API endpoints
34 | 
35 | BREAKING CHANGE: The /v1/score endpoint has been removed.
36 | Use /v2/score instead.
37 | ``` -------------------------------------------------------------------------------- /tests/data/matched_fragments.sage.tsv: -------------------------------------------------------------------------------- 1 | psm_id fragment_type fragment_ordinals fragment_charge fragment_mz_calculated fragment_mz_experimental fragment_intensity 2 | 1 b 2 1 242.14992 242.14989 578440.75 3 | 1 b 3 1 329.18195 329.18304 33585.195 4 | 1 b 4 1 485.28305 485.28275 703782.06 5 | 1 b 5 1 582.3358 582.33417 362622.56 6 | 1 b 6 1 653.3729 653.3723 678786.2 7 | 1 b 7 1 724.41003 724.4097 11793456.0 8 | 1 b 8 1 821.46277 821.4627 1807024.5 9 | 1 b 9 1 918.5155 918.5154 556689.5 10 | 1 b 10 1 989.5526 989.55237 13877311.0 11 | 1 b 12 1 1143.627 1143.6226 920279.2 12 | 1 y 12 1 1203.6846 1203.6862 1344718.1 13 | 1 y 11 1 1106.6318 1106.6305 3311897.0 14 | 1 y 10 1 1009.57904 1009.57855 1904729.9 15 | 1 y 9 1 938.54193 938.5417 15733808.0 16 | 1 y 8 1 841.4892 841.4886 6905694.0 17 | 1 y 7 1 784.4677 784.46735 5842707.0 18 | 1 y 6 1 687.415 687.4143 2340203.2 19 | 1 y 5 1 630.3935 630.3935 839138.7 20 | 1 y 4 1 502.3349 502.33484 1299269.5 21 | 1 y 3 1 389.25085 389.251 840262.75 22 | 1 y 2 1 288.2032 288.20273 233942.19 23 | 1 y 1 1 175.11914 175.11903 700811.4 24 | -------------------------------------------------------------------------------- /tests/test_convert.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from easypqp.convert import get_scan 3 | 4 | 5 | class TestConvert(unittest.TestCase): 6 | 7 | def test_get_scan(self): 8 | self.assertEqual(11, get_scan("controllerType=0 controllerNumber=1 scan=2 demux=0", 11)) 9 | self.assertEqual(11, get_scan("sample=2 period=3 cycle=4 experiment=5", 11)) 10 | self.assertEqual(11, get_scan("frame=2 scan=3", 11)) 11 | 12 | self.assertEqual(11, get_scan("controllerType=0 controllerNumber=1 scan=11", 22)) 13 | self.assertEqual(11, get_scan("function=0 process=1 scan=11", 22)) 14 | self.assertEqual(11, get_scan("jobRun=0 spotLabel=asw spectrum=11", 22)) 15 | self.assertEqual(11, get_scan("11", 22)) 16 | self.assertEqual(11, get_scan("scan=11", 22)) 17 | self.assertEqual(11, get_scan("spectrum=11", 22)) 18 | self.assertEqual(11, get_scan("scanId=11", 22)) 19 | self.assertEqual(11, get_scan("index=11", 22)) 20 | self.assertEqual(11, get_scan("frame=11", 22)) 21 | 22 | 23 | if __name__ == '__main__': 24 | unittest.main() 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # docker build -t easypqp:latest . 2 | 3 | FROM python:3.11-slim 4 | 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | ENV PYTHONUNBUFFERED=1 7 | 8 | # Minimal build/runtime dependencies. Add or remove system packages as needed 9 | # if package compilation fails (e.g., pyopenms may require extra libs). 10 | RUN apt-get update \ 11 | && apt-get install -y --no-install-recommends \ 12 | build-essential \ 13 | gcc \ 14 | git \ 15 | cmake \ 16 | swig \ 17 | pkg-config \ 18 | libxml2-dev \ 19 | zlib1g-dev \ 20 | libbz2-dev \ 21 | liblzma-dev \ 22 | libcurl4-openssl-dev \ 23 | libssl-dev \ 24 | # Runtime libraries required by pyopenms 25 | libglib2.0-0 \ 26 | libgomp1 \ 27 | && rm -rf /var/lib/apt/lists/* 28 | 29 | # Upgrade packaging tools 30 | RUN pip install --no-cache-dir --upgrade pip setuptools wheel 31 | 32 | # Copy project into the image 33 | WORKDIR /tmp/easypqp 34 | COPY . 
/tmp/easypqp
35 | 
36 | # Install EasyPQP with all optional features by default
37 | RUN pip install --no-cache-dir ".[all]"
38 | 
39 | # Cleanup sources
40 | WORKDIR /
41 | RUN rm -rf /tmp/easypqp
42 | 
43 | # Default command prints help
44 | CMD ["easypqp","--help"]
45 | 
--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
1 | README
2 | ======
3 | 
4 | The scripts should be run with `py.test` (>=3.4.1) with the plugin `pytest-regtest`
5 | (>=1.0.14, see https://pypi.python.org/pypi/pytest-regtest) installed.
6 | 
7 | The plugin allows recording of approved output so that later test runs will check if
8 | the output is still the same. It is simple to use.
9 | 
10 | In order to record output, use the `regtest` fixture as in the following example.
11 | This fixture behaves like a file handle, so you can write to it as usual:
12 | 
13 | ````
14 | def test_0(regtest):
15 |     print("this is the recorded output", file=regtest)
16 | ````
17 | 
18 | If you now create a new test function `test_0` in a file `test_xyz.py`, first run
19 | 
20 | ````
21 | $ py.test tests/test_xyz.py::test_0
22 | ````
23 | 
24 | which will show you the not-yet-approved output. You can approve this output using
25 | 
26 | ````
27 | $ py.test --regtest-reset tests/test_xyz.py::test_0
28 | ````
29 | 
30 | which will create a file `tests/_regtest_outputs/test_xyz.test_0.out` that you should not forget to
31 | commit with `git`.
32 | 
33 | 
34 | Later runs like
35 | ````
36 | $ py.test tests/test_xyz.py
37 | ````
38 | 
39 | will then check if the recorded output is still the same.
40 | 
41 | 
--------------------------------------------------------------------------------
/.github/workflows/dockerpublish.yml:
--------------------------------------------------------------------------------
1 | name: Upload Docker image
2 | 
3 | on:
4 |   release:
5 |     types: [published]
6 | 
7 | jobs:
8 |   push_to_registries:
9 |     name: Push Docker image to multiple registries
10 |     runs-on: ubuntu-latest
11 |     permissions:
12 |       packages: write
13 |       contents: read
14 |     steps:
15 |       - name: Check out the repo
16 |         uses: actions/checkout@v3
17 | 
18 |       - name: Log in to Docker Hub
19 |         uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
20 |         with:
21 |           username: ${{ secrets.DOCKER_USERNAME }}
22 |           password: ${{ secrets.DOCKER_PASSWORD }}
23 | 
24 |       - name: Log in to the Container registry
25 |         uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
26 |         with:
27 |           registry: ghcr.io
28 |           username: ${{ github.actor }}
29 |           password: ${{ secrets.GITHUB_TOKEN }}
30 | 
31 |       - name: Extract metadata (tags, labels) for Docker
32 |         id: meta
33 |         uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38
34 |         with:
35 |           images: |
36 |             grosenberger/easypqp
37 |             ghcr.io/${{ github.repository }}
38 | 
39 |       - name: Build and push Docker images
40 |         uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
41 |         with:
42 |           context: .
43 | push: true 44 | tags: ${{ steps.meta.outputs.tags }} 45 | labels: ${{ steps.meta.outputs.labels }} 46 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64.0.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "easypqp" 7 | version = "0.1.54" 8 | description = "EasyPQP: Simple library generation for OpenSWATH" 9 | readme = "README.md" 10 | requires-python = ">=3.9" 11 | authors = [ 12 | { name = "George Rosenberger", email = "gr2578@cumc.columbia.edu" }, 13 | ] 14 | license = "BSD-3-Clause" 15 | # Core dependencies required for basic EasyPQP functionality 16 | dependencies = [ 17 | "numba", 18 | "Click>=8.0.0", 19 | "numpy==1.26.4", 20 | "scipy", 21 | "scikit-learn", 22 | "statsmodels", 23 | "pandas>=1.5.0", 24 | "biopython", 25 | "pyopenms>=3.3.0", 26 | "matplotlib>=3.5.1", 27 | "seaborn", 28 | "tqdm", 29 | ] 30 | 31 | [project.urls] 32 | Homepage = "https://github.com/grosenberger/easypqp" 33 | 34 | [project.scripts] 35 | easypqp = "easypqp.main:cli" 36 | 37 | [project.optional-dependencies] 38 | 39 | # PyProphet integration 40 | # Install with: pip install easypqp[pyprophet] 41 | pyprophet = ["pyprophet"] 42 | 43 | # Rust backend for in-silico library generation 44 | # Install with: pip install easypqp[rust] 45 | rust = ["easypqp_rs>=0.1.5"] 46 | 47 | # All optional features 48 | # Install with: pip install easypqp[all] 49 | all = [ 50 | "pyprophet", 51 | "easypqp_rs>=0.1.5" 52 | ] 53 | 54 | [tool.setuptools] 55 | include-package-data = true 56 | 57 | [tool.setuptools.packages.find] 58 | where = ["."] 59 | include = ["easypqp*"] 60 | 61 | [tool.setuptools.package-data] 62 | easypqp = ["data/unimod.xml"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, George Rosenberger 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /tests/data/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "database": { 3 | "enzyme": { 4 | "missed_cleavages": 1, 5 | "min_len": 7, 6 | "max_len": 50, 7 | "cleave_at": "KR", 8 | "restrict": "P", 9 | "c_terminal": true, 10 | "semi_enzymatic": false 11 | }, 12 | "peptide_min_mass": 500.0, 13 | "peptide_max_mass": 5000.0, 14 | "static_mods": { 15 | "C": 57.0215 16 | }, 17 | "variable_mods": {}, 18 | "max_variable_mods": 2, 19 | "decoy_tag": "rev_", 20 | "generate_decoys": true, 21 | "fasta": "tests/data/Q99536.fasta" 22 | }, 23 | "insilico_settings": { 24 | "precursor_charge": [ 25 | 2, 26 | 4 27 | ], 28 | "max_fragment_charge": 1, 29 | "min_transitions": 6, 30 | "max_transitions": 6, 31 | "fragmentation_model": "cid", 32 | "allowed_fragment_types": [ 33 | "b", 34 | "y" 35 | ], 36 | "rt_scale": 100.0 37 | }, 38 | "dl_feature_generators": { 39 | "device": "cpu", 40 | "fine_tune_config": { 41 | "fine_tune": false, 42 | "train_data_path": "", 43 | "batch_size": 256, 44 | "epochs": 3, 45 | "learning_rate": 0.001, 46 | "save_model": true 47 | }, 48 | "instrument": "QE", 49 | "nce": 20.0, 50 | "batch_size": 64 51 | }, 52 | "peptide_chunking": 0, 53 | "output_file": "tests/data/easypqp_insilico_library.tsv", 54 | "write_report": true, 55 | "parquet_output": false 56 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .idea -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ master, main ] 6 | pull_request: 7 | branches: [ master, main ] 8 | 9 | # Ensure only one run per branch/PR is active at a time; cancel previous runs when new commits arrive 10 | concurrency: 11 | group: ${{ github.workflow }}-${{ github.ref }} 12 | cancel-in-progress: true 13 | 14 | jobs: 15 | test: 16 | name: Run tests 17 | runs-on: ubuntu-latest 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | python-version: [ '3.10', '3.11', '3.12', '3.13' ] 22 | 23 | steps: 24 | - name: Checkout repository 25 | uses: actions/checkout@v4 26 | 27 | - name: Set up Python 28 | uses: actions/setup-python@v4 29 | with: 30 | python-version: ${{ matrix.python-version }} 31 | 32 | - name: Install system dependencies 33 | run: | 34 | sudo apt-get update 35 | sudo apt-get install -y --no-install-recommends \ 36 | build-essential gcc git cmake swig pkg-config libxml2-dev zlib1g-dev libbz2-dev liblzma-dev libcurl4-openssl-dev libssl-dev libglib2.0-0 libgomp1 37 | 38 | - name: Upgrade pip and setuptools 39 | run: pip install --upgrade pip setuptools wheel 40 | 41 | - name: Cache pip 42 | uses: actions/cache@v4 43 | with: 44 | path: ~/.cache/pip 45 | key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('**/pyproject.toml') }} 46 | restore-keys: | 47 | ${{ runner.os }}-pip-${{ matrix.python-version }}- 48 | 49 | - name: Install package with all extras 50 | run: pip install --no-cache-dir ".[all]" 51 | 52 | - name: Install test dependencies 53 | run: python -m pip install pytest pytest-regtest 54 | 55 | - name: Run pytest 56 | run: pytest -q 57 | -------------------------------------------------------------------------------- /.github/workflows/pythonpublish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | workflow_dispatch: 7 | inputs: 8 | tag: 9 | description: 'Tag to publish (e.g. 0.1.53). If omitted on manual dispatch the workflow will try to use the latest release tag.' 
10 | required: false 11 | type: string 12 | 13 | jobs: 14 | deploy: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout repository (full history) 18 | uses: actions/checkout@v4 19 | with: 20 | fetch-depth: 0 21 | 22 | - name: Checkout requested tag (workflow_dispatch) 23 | if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' }} 24 | run: | 25 | git fetch --tags --force 26 | git checkout tags/${{ github.event.inputs.tag }} -b publish-${{ github.event.inputs.tag }} 27 | 28 | - name: Checkout release tag (release event) 29 | if: ${{ github.event_name == 'release' }} 30 | run: | 31 | git fetch --tags --force 32 | git checkout tags/${{ github.event.release.tag_name }} -b publish-${{ github.event.release.tag_name }} 33 | - name: Set up Python 34 | uses: actions/setup-python@v4 35 | with: 36 | python-version: '3.x' 37 | - name: Install build tools 38 | run: | 39 | python -m pip install --upgrade pip 40 | python -m pip install --upgrade build twine 41 | - name: Build and publish 42 | env: 43 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 44 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 45 | run: | 46 | # Build sdist and wheel using PEP 517 build backend (reads pyproject.toml) 47 | python -m build --sdist --wheel 48 | # Upload artifacts with twine 49 | python -m twine upload dist/* 50 | 51 | -------------------------------------------------------------------------------- /tests/test_openswathdecoy_generator.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import subprocess 5 | import shutil 6 | import sys 7 | 8 | import pandas as pd 9 | import sqlite3 10 | 11 | import pytest 12 | 13 | pd.options.display.expand_frame_repr = False 14 | pd.options.display.precision = 4 15 | pd.options.display.max_columns = None 16 | 17 | DATA_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") 18 | 19 | def _run_cmdline(cmdline): 20 | stdout = cmdline + "\n" 21 | try: 22 | stdout += str(subprocess.check_output(cmdline, shell=True, 23 | stderr=subprocess.STDOUT)) 24 | except subprocess.CalledProcessError as error: 25 | print(error, end="", file=sys.stderr) 26 | raise 27 | return stdout 28 | 29 | def _run_openswath_decoy_generator(regtest, temp_folder): 30 | os.chdir(temp_folder) 31 | data_path = os.path.join(DATA_FOLDER, "library_targets.pqp") 32 | shutil.copy(data_path, temp_folder) 33 | cmdline = "easypqp openswath-decoy-generator --in library_targets.pqp --out library.pqp --method pseudo-reverse" 34 | 35 | stdout = _run_cmdline(cmdline) 36 | 37 | conn = sqlite3.connect("library.pqp") 38 | protein_table = pd.read_sql_query("SELECT * FROM PROTEIN", conn) 39 | peptide_table = pd.read_sql_query("SELECT * FROM PEPTIDE", conn) 40 | precursor_table = pd.read_sql_query("SELECT * FROM PRECURSOR", conn) 41 | transition_table = pd.read_sql_query("SELECT * FROM TRANSITION", conn) 42 | conn.close() 43 | 44 | print(protein_table.sort_values("ID"),file=regtest) 45 | print(peptide_table.sort_values("ID"),file=regtest) 46 | print(precursor_table.sort_values("ID"),file=regtest) 47 | print(transition_table.sort_values("ID"),file=regtest) 48 | 49 | def test_openswath_decoy_generator(tmpdir, regtest): 50 | _run_openswath_decoy_generator(regtest, tmpdir.strpath) -------------------------------------------------------------------------------- /tests/test_openswathassay_generator.py: -------------------------------------------------------------------------------- 1 | from 
__future__ import print_function 2 | 3 | import os 4 | import subprocess 5 | import shutil 6 | import sys 7 | 8 | import pandas as pd 9 | import sqlite3 10 | 11 | import pytest 12 | 13 | pd.options.display.expand_frame_repr = False 14 | pd.options.display.precision = 4 15 | pd.options.display.max_columns = None 16 | 17 | DATA_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") 18 | 19 | def _run_cmdline(cmdline): 20 | stdout = cmdline + "\n" 21 | try: 22 | stdout += str(subprocess.check_output(cmdline, shell=True, 23 | stderr=subprocess.STDOUT)) 24 | except subprocess.CalledProcessError as error: 25 | print(error, end="", file=sys.stderr) 26 | raise 27 | return stdout 28 | 29 | def _run_openswath_assay_generator(regtest, temp_folder): 30 | os.chdir(temp_folder) 31 | data_path = os.path.join(DATA_FOLDER, "test_transition_list.tsv") 32 | shutil.copy(data_path, temp_folder) 33 | cmdline = "easypqp openswath-assay-generator --in test_transition_list.tsv --out library_targets.pqp" 34 | 35 | stdout = _run_cmdline(cmdline) 36 | 37 | conn = sqlite3.connect("library_targets.pqp") 38 | protein_table = pd.read_sql_query("SELECT * FROM PROTEIN", conn) 39 | peptide_table = pd.read_sql_query("SELECT * FROM PEPTIDE", conn) 40 | precursor_table = pd.read_sql_query("SELECT * FROM PRECURSOR", conn) 41 | transition_table = pd.read_sql_query("SELECT * FROM TRANSITION", conn) 42 | conn.close() 43 | 44 | print(protein_table.sort_values("ID"),file=regtest) 45 | print(peptide_table.sort_values("ID"),file=regtest) 46 | print(precursor_table.sort_values("ID"),file=regtest) 47 | print(transition_table.sort_values("ID"),file=regtest) 48 | 49 | def test_openswath_assay_generator(tmpdir, regtest): 50 | _run_openswath_assay_generator(regtest, tmpdir.strpath) -------------------------------------------------------------------------------- /tests/_regtest_outputs/test_insilico_library.test_insilico_library.out: -------------------------------------------------------------------------------- 1 | Generated library contains 492 transitions 2 | Number of unique precursors: 41 3 | Number of unique peptides: 41 4 | 5 | Columns: ['PrecursorMz', 'ProductMz', 'PrecursorCharge', 'ProductCharge', 'LibraryIntensity', 'NormalizedRetentionTime', 'PeptideSequence', 'ModifiedPeptideSequence', 'PeptideGroupLabel', 'LabelType', 'CompoundName', 'SumFormula', 'SMILES', 'Adducts', 'ProteinId', 'UniprotId', 'GeneName', 'FragmentType', 'FragmentSeriesNumber', 'Annotation', 'CollisionEnergy', 'PrecursorIonMobility', 'TransitionGroupId', 'TransitionId', 'Decoy', 'DetectingTransition', 'IdentifyingTransition', 'QuantifyingTransition', 'Peptidoforms'] 6 | 7 | First 5 transitions (deterministic columns only): 8 | PrecursorMz ProductMz PrecursorCharge ProductCharge PeptideSequence ModifiedPeptideSequence ProteinId UniprotId GeneName FragmentType FragmentSeriesNumber Annotation TransitionGroupId TransitionId Decoy 9 | 0 393.7187 228.1343 2 1 NLMALAR NLMALAR Q99536 Q99536 VAT1_HUMAN b 2 b2^1 0 2 0 10 | 1 393.7187 246.1561 2 1 NLMALAR NLMALAR Q99536 Q99536 VAT1_HUMAN y 2 y2^1 0 20 0 11 | 2 393.7187 359.1748 2 1 NLMALAR NLMALAR Q99536 Q99536 VAT1_HUMAN b 3 b3^1 0 6 0 12 | 3 393.7187 359.2401 2 1 NLMALAR NLMALAR Q99536 Q99536 VAT1_HUMAN y 3 y3^1 0 16 0 13 | 4 393.7187 430.2772 2 1 NLMALAR NLMALAR Q99536 Q99536 VAT1_HUMAN y 4 y4^1 0 12 0 14 | 15 | Statistics: 16 | Precursor charge range: 2-3 17 | Fragment types: ['b', 'y'] 18 | Contains decoys: False 19 | Number of targets: 492 20 | Number of decoys: 0 21 | 
-------------------------------------------------------------------------------- /tests/test_targetedfileconverter.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import subprocess 5 | import shutil 6 | import sys 7 | 8 | import pandas as pd 9 | import sqlite3 10 | 11 | import pytest 12 | 13 | pd.options.display.expand_frame_repr = False 14 | pd.options.display.precision = 4 15 | pd.options.display.max_columns = None 16 | 17 | DATA_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") 18 | 19 | def _run_cmdline(cmdline): 20 | stdout = cmdline + "\n" 21 | try: 22 | stdout += str(subprocess.check_output(cmdline, shell=True, 23 | stderr=subprocess.STDOUT)) 24 | except subprocess.CalledProcessError as error: 25 | print(error, end="", file=sys.stderr) 26 | raise 27 | return stdout 28 | 29 | def _run_targetedfileconverter(regtest, temp_folder, infile, outfile): 30 | os.chdir(temp_folder) 31 | data_path = os.path.join(DATA_FOLDER, infile) 32 | shutil.copy(data_path, temp_folder) 33 | cmdline = f"easypqp targeted-file-converter --in {infile} --out {outfile}" 34 | 35 | stdout = _run_cmdline(cmdline) 36 | 37 | if outfile.split(".")[1] == "pqp": 38 | conn = sqlite3.connect(outfile) 39 | protein_table = pd.read_sql_query("SELECT * FROM PROTEIN", conn) 40 | peptide_table = pd.read_sql_query("SELECT * FROM PEPTIDE", conn) 41 | precursor_table = pd.read_sql_query("SELECT * FROM PRECURSOR", conn) 42 | transition_table = pd.read_sql_query("SELECT * FROM TRANSITION", conn) 43 | conn.close() 44 | 45 | print(protein_table.sort_values("ID"),file=regtest) 46 | print(peptide_table.sort_values("ID"),file=regtest) 47 | print(precursor_table.sort_values("ID"),file=regtest) 48 | print(transition_table.sort_values("ID"),file=regtest) 49 | elif outfile.split(".")[1] == "tsv": 50 | print(pd.read_csv(outfile, sep="\t", nrows=100).sort_index(axis=1),file=regtest) 51 | 52 | 53 | def test_targeted_file_converter_tsvtopqp(tmpdir, regtest): 54 | _run_targetedfileconverter(regtest, tmpdir.strpath, "test_transition_list.tsv", "test_transition_list.pqp") 55 | 56 | def test_targeted_file_converter_pqptotsv(tmpdir, regtest): 57 | _run_targetedfileconverter(regtest, tmpdir.strpath, "test_transition_list.pqp", "test_transition_list.tsv") -------------------------------------------------------------------------------- /easypqp/util.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import json 3 | import tempfile 4 | from typing import Union 5 | import click 6 | 7 | 8 | def timestamped_echo(message): 9 | timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 10 | click.echo(f"{timestamp} - {message}") 11 | 12 | 13 | 14 | 15 | def create_json_config(as_bytes: bool = False) -> Union[str, bytes]: 16 | """ 17 | Create a JSON configuration file for EasyPQP In-silico library generation. 
18 | """ 19 | config = { 20 | "version": "0.1.0", 21 | "database": { 22 | "enzyme": { 23 | "missed_cleavages": 1, 24 | "min_len": None, 25 | "max_len": None, 26 | "cleave_at": "KR", 27 | "restrict": "P", 28 | "c_terminal": None, 29 | "semi_enzymatic": None 30 | }, 31 | "peptide_min_mass": 500.0, 32 | "peptide_max_mass": 5000.0, 33 | "static_mods": { 34 | "C": 57.0215 35 | }, 36 | "variable_mods": {}, 37 | "max_variable_mods": 2, 38 | "decoy_tag": "rev_", 39 | "generate_decoys": True, 40 | "fasta": "" 41 | }, 42 | "insilico_settings": { 43 | "precursor_charge": [2, 4], 44 | "max_fragment_charge": 1, 45 | "min_transitions": 6, 46 | "max_transitions": 6, 47 | "fragmentation_model": "cid", 48 | "allowed_fragment_types": ["b", "y"], 49 | "rt_scale": 100.0 50 | }, 51 | "dl_feature_generators": { 52 | "device": "cpu", 53 | "fine_tune_config": { 54 | "fine_tune": False, 55 | "train_data_path": "", 56 | "batch_size": 256, 57 | "epochs": 3, 58 | "learning_rate": 0.001, 59 | "save_model": True 60 | }, 61 | "instrument": "QE", 62 | "nce": 20.0, 63 | "batch_size": 64 64 | }, 65 | "peptide_chunking": 0, 66 | "output_file": "./easypqp_insilico_library.tsv", 67 | "write_report": True, 68 | "parquet_output": False 69 | } 70 | 71 | json_str = json.dumps(config, indent=2) 72 | 73 | if as_bytes: 74 | return json_str.encode('utf-8') 75 | else: 76 | with tempfile.NamedTemporaryFile('w+', suffix=".json", delete=False) as tmp: 77 | tmp.write(json_str) 78 | tmp.flush() 79 | return tmp.name 80 | -------------------------------------------------------------------------------- /tests/test_convertsage.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import shutil 4 | import sys 5 | 6 | import pandas as pd 7 | import re 8 | 9 | DATA_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") 10 | 11 | 12 | def _run_cmdline(cmdline): 13 | try: 14 | out = subprocess.check_output(cmdline, shell=True, stderr=subprocess.STDOUT) 15 | return out.decode(errors="replace") 16 | except subprocess.CalledProcessError as error: 17 | out = error.output.decode() if error.output else "" 18 | print(out, file=sys.stderr) 19 | raise 20 | 21 | 22 | def _run_convertsage(temp_folder, regtest): 23 | os.chdir(temp_folder) 24 | 25 | # Copy test files to temp directory 26 | shutil.copy(os.path.join(DATA_FOLDER, "results.sage.tsv"), temp_folder) 27 | shutil.copy(os.path.join(DATA_FOLDER, "matched_fragments.sage.tsv"), temp_folder) 28 | 29 | cmdline = ( 30 | "easypqp convertsage --sage_psm results.sage.tsv " 31 | "--sage_fragments matched_fragments.sage.tsv" 32 | ) 33 | 34 | out = _run_cmdline(cmdline) 35 | # Strip leading timestamps of the form 'YYYY-MM-DD HH:MM:SS - ' and 36 | # filter out pyopenms environment warnings which are non-deterministic 37 | cleaned_lines = [] 38 | for line in out.splitlines(): 39 | line = re.sub(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - ", "", line) 40 | # Remove pyopenms/openms data-path warnings which leak local site-packages paths 41 | if re.search(r"pyopenms", line, flags=re.IGNORECASE) or re.search( 42 | r"OPENMS_DATA_PATH", line 43 | ): 44 | continue 45 | cleaned_lines.append(line) 46 | cleaned = "\n".join(cleaned_lines) 47 | print(cleaned, file=regtest) 48 | 49 | # Expect output files for run 'LQSRPAAPPAPGPGQLTLR' 50 | run_stem = "LQSRPAAPPAPGPGQLTLR" 51 | psmpkl = f"{run_stem}.psmpkl" 52 | peakpkl = f"{run_stem}.peakpkl" 53 | 54 | assert os.path.exists(psmpkl), f"Missing expected output {psmpkl}" 55 | assert 
os.path.exists(peakpkl), f"Missing expected output {peakpkl}" 56 | 57 | # Verify pickles load and contain expected columns 58 | psms = pd.read_pickle(psmpkl) 59 | peaks = pd.read_pickle(peakpkl) 60 | 61 | assert not psms.empty, "psmpkl is empty" 62 | assert not peaks.empty, "peakpkl is empty" 63 | 64 | # Basic schema checks 65 | assert "run_id" in psms.columns 66 | assert "scan_id" in psms.columns 67 | assert "run_id" in peaks.columns 68 | assert "product_mz" in peaks.columns or "fragment" in peaks.columns 69 | 70 | 71 | def test_convertsage(tmpdir, regtest): 72 | _run_convertsage(tmpdir.strpath, regtest) 73 | -------------------------------------------------------------------------------- /.github/workflows/changelog.yml: -------------------------------------------------------------------------------- 1 | # filepath: .github/workflows/changelog.yml 2 | name: Generate Changelog 3 | 4 | on: 5 | workflow_dispatch: # Manual trigger 6 | inputs: 7 | version: 8 | description: 'Version tag (e.g., 3.0.4 or v3.0.4). If omitted the latest tag will be used.' 9 | required: false 10 | type: string 11 | push: 12 | # Trigger on any tag so releases without a 'v' prefix (e.g. '0.1.53') also run 13 | tags: 14 | - '*' 15 | 16 | concurrency: 17 | group: ${{ github.workflow }}-${{ github.ref }} 18 | cancel-in-progress: true 19 | 20 | jobs: 21 | changelog: 22 | name: Generate Changelog 23 | runs-on: ubuntu-latest 24 | permissions: 25 | contents: write 26 | pull-requests: write 27 | 28 | steps: 29 | - name: Checkout code 30 | uses: actions/checkout@v4 31 | with: 32 | fetch-depth: 0 # Fetch all history for changelog generation 33 | 34 | - name: Install git-cliff 35 | run: | 36 | wget https://github.com/orhun/git-cliff/releases/download/v2.7.0/git-cliff-2.7.0-x86_64-unknown-linux-gnu.tar.gz 37 | tar -xzf git-cliff-2.7.0-x86_64-unknown-linux-gnu.tar.gz 38 | sudo mv git-cliff-2.7.0/git-cliff /usr/local/bin/ 39 | chmod +x /usr/local/bin/git-cliff 40 | 41 | - name: Generate full CHANGELOG 42 | run: | 43 | git-cliff --output CHANGELOG.md 44 | 45 | - name: Generate release notes for latest tag 46 | if: startsWith(github.ref, 'refs/tags/') 47 | run: | 48 | # Get the latest tag 49 | LATEST_TAG=$(git describe --tags --abbrev=0) 50 | 51 | # Generate changelog for this release only 52 | git-cliff --latest --strip header > RELEASE_NOTES.md 53 | 54 | echo "Release notes for ${LATEST_TAG}:" 55 | cat RELEASE_NOTES.md 56 | 57 | - name: Commit and push CHANGELOG 58 | if: github.event_name == 'workflow_dispatch' || startsWith(github.ref, 'refs/tags/') 59 | run: | 60 | git config user.name "github-actions[bot]" 61 | git config user.email "github-actions[bot]@users.noreply.github.com" 62 | git add CHANGELOG.md 63 | 64 | if git diff --staged --quiet; then 65 | echo "No changes to CHANGELOG.md" 66 | else 67 | git commit -m "chore: update CHANGELOG.md" 68 | git push origin HEAD:master || git push origin HEAD:main 69 | fi 70 | 71 | - name: Create/Update Release with Changelog 72 | if: startsWith(github.ref, 'refs/tags/') 73 | uses: softprops/action-gh-release@v1 74 | with: 75 | body_path: RELEASE_NOTES.md 76 | draft: false 77 | prerelease: false 78 | env: 79 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} -------------------------------------------------------------------------------- /tests/_regtest_outputs/test_targetedfileconverter.test_targeted_file_converter_tsvtopqp.out: -------------------------------------------------------------------------------- 1 | ID PROTEIN_ACCESSION DECOY 2 | 4 0 Q04637 0 3 | 3 1 Q2M2I8 0 4 | 2 2 Q86WB0 0 5 | 
1 3 Q8WWI1 0 6 | 0 4 Q92890 0 7 | ID UNMODIFIED_SEQUENCE MODIFIED_SEQUENCE DECOY 8 | 4 0 AGQTQPNPGILPIQPALTPR AGQTQPNPGILPIQPALT(UniMod:21)PR 0 9 | 3 1 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR 0 10 | 2 2 EAALPPVSPLK EAALPPVS(UniMod:21)PLK 0 11 | 1 3 GVEPSPSPIKPGDIK GVEPSPS(UniMod:21)PIKPGDIK 0 12 | 0 4 SMGTGDTPGLEVPSSPLR SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR 0 13 | ID TRAML_ID GROUP_LABEL PRECURSOR_MZ CHARGE LIBRARY_INTENSITY LIBRARY_RT LIBRARY_DRIFT_TIME DECOY 14 | 3 0 AGQTQPNPGILPIQPALT(Phospho)PR_2 1075.0619 2 None 70.6096 -1.0 0 15 | 4 1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 1266.5622 2 None 67.4251 -1.0 0 16 | 0 2 EAALPPVS(Phospho)PLK_2 601.3151 2 None 48.0082 -1.0 0 17 | 1 3 GVEPSPS(Phospho)PIKPGDIK_2 800.9028 2 None 32.9698 -1.0 0 18 | 2 4 SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2 948.9241 2 None 52.9017 -1.0 0 19 | ID TRAML_ID PRODUCT_MZ CHARGE TYPE ANNOTATION ORDINAL DETECTING IDENTIFYING QUANTIFYING LIBRARY_INTENSITY DECOY 20 | 0 0 2345 260.1969 1 y y2^1 2 1 0 1 81.9345 0 21 | 1 1 2346 409.2146 2 y y7^2 7 1 0 1 2965.7283 0 22 | 2 2 2347 465.7566 2 y y8^2 8 1 0 1 132.8395 0 23 | 3 3 2348 623.3164 1 y y5^1 5 1 0 1 101.3607 0 24 | 4 4 2349 720.3692 1 y y6^1 6 1 0 1 1580.4800 0 25 | .. .. ... ... ... ... ... ... ... ... ... ... ... 26 | 73 73 42472 1264.6090 1 b b13^1 13 1 0 1 890.7413 0 27 | 74 74 42473 1268.5155 1 y y11^1 11 1 0 1 1830.4344 0 28 | 75 75 42474 1355.5475 1 y y12^1 12 1 0 1 2691.2388 0 29 | 76 76 42475 1393.6515 1 b b14^1 14 1 0 1 870.2799 0 30 | 77 77 42476 1486.5880 1 y y13^1 13 1 0 1 851.3514 0 31 | 32 | [78 rows x 12 columns] 33 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 
4 | 5 | ## [0.1.54] - 2025-12-12 6 | 7 | ### 🚀 Features 8 | 9 | - Add Sage PSM and fragment parsers for EasyPQP conversion 10 | - Add Sage input options for library generation 11 | - Enhance SagePSMParser with protein token parsing and unique accessions handling 12 | - Add max delta mass option for UniMod annotation in library generation 13 | - Implement table reading function for TSV and Parquet files in SagePSMParser 14 | - Add mz precision option to SagePSMParser and conversion function 15 | - Add convertsage cli command 16 | - Add DataFrame parsing methods for Sage PSM and fragment data to support streaming 17 | - Add streaming options to convertsage for improved performance with large inputs 18 | - Enhance convert_sage_streaming for improved memory efficiency and processing speed 19 | - Update streaming threshold to 2GB 20 | - Add EasyPQP In-Silico Library Generation command 21 | - Add EasyPQP In-Silico Library Generation command 22 | - Enhance insilico_library options with RT scaling, report generation, Parquet output, and threading support 23 | - Add insilico library generation test and configuration files 24 | - Add matched_fragments and results data files for testing 25 | - Implement optional in-silico library generation with rust backend support 26 | - Add CI workflow for testing across multiple Python versions 27 | - Add concurrency configuration to CI workflow for improved efficiency 28 | 29 | ### 🐛 Bug Fixes 30 | 31 | - Correct documentation for output columns in SagePSMParser 32 | - Handle import for transform_pi0_lambda across PyProphet versions 33 | - Precursor mz calculcation 34 | - Enhance _basename_wo_ext to handle common compression extensions 35 | - Improve logging messages for streaming conversion in convert_sage 36 | - Add missing easypqp_rs dependency in pyproject.toml 37 | - Remove timestamps from convertsage output for deterministic testing 38 | - Enhance output cleaning in _run_convertsage for deterministic testing 39 | - Pyprophet import error 40 | - Update version to 0.1.54 in pyproject.toml 41 | - Update changelog workflow to allow any tag and improve version input description 42 | 43 | ### 💼 Other 44 | 45 | - Parameter transformation function for Click options 46 | 47 | ### 📚 Documentation 48 | 49 | - Update README to reflect support for Sage 50 | - Update README with CLI commands for easypqp 51 | - Add doc for _get_first_existing function to retrieve existing DataFrame columns with optional casting 52 | - Update README to include in-silico library generation command and details 53 | - Add information about standalone portable rust binary in README 54 | - Update README to clarify installation of optional features and in-silico library generation 55 | 56 | ### 🧪 Testing 57 | 58 | - Add test for convertsage functionality with output validation 59 | 60 | ### ⚙️ Miscellaneous Tasks 61 | 62 | - Add requirements.txt for dependency management 63 | - Add changelog generation workflow and update CONTRIBUTING guidelines 64 | - Update pyproject.toml to clarify optional dependencies and remove redundant entries 65 | - Update Dockerfile to use Python 3.11-slim 66 | 67 | 68 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.10 3 | # by the following command: 4 | # 5 | # pip-compile --all-extras --output-file=requirements.txt 6 | # 7 | biopython==1.85 8 | # via easypqp 
(pyproject.toml) 9 | click==8.3.0 10 | # via 11 | # easypqp (pyproject.toml) 12 | # pyprophet 13 | contourpy==1.3.2 14 | # via matplotlib 15 | cycler==0.12.1 16 | # via matplotlib 17 | cython==3.1.4 18 | # via pyprophet 19 | duckdb==1.3.2 20 | # via 21 | # duckdb-extension-sqlite-scanner 22 | # duckdb-extensions 23 | # pyprophet 24 | duckdb-extension-sqlite-scanner==1.3.2 25 | # via pyprophet 26 | duckdb-extensions==1.3.2 27 | # via pyprophet 28 | fonttools==4.60.2 29 | # via matplotlib 30 | joblib==1.5.2 31 | # via scikit-learn 32 | kiwisolver==1.4.9 33 | # via matplotlib 34 | llvmlite==0.45.1 35 | # via numba 36 | loguru==0.7.3 37 | # via pyprophet 38 | lxml==6.0.2 39 | # via pyprophet 40 | matplotlib==3.10.7 41 | # via 42 | # easypqp (pyproject.toml) 43 | # pyopenms 44 | # pyprophet 45 | # seaborn 46 | numba==0.62.1 47 | # via easypqp (pyproject.toml) 48 | numexpr==2.14.1 49 | # via pyprophet 50 | numpy==1.26.4 51 | # via 52 | # biopython 53 | # contourpy 54 | # easypqp (pyproject.toml) 55 | # matplotlib 56 | # numba 57 | # numexpr 58 | # pandas 59 | # patsy 60 | # pyopenms 61 | # pyprophet 62 | # scikit-learn 63 | # scipy 64 | # seaborn 65 | # statsmodels 66 | # xgboost 67 | nvidia-nccl-cu12==2.28.3 68 | # via xgboost 69 | packaging==25.0 70 | # via 71 | # matplotlib 72 | # statsmodels 73 | pandas==2.3.3 74 | # via 75 | # easypqp (pyproject.toml) 76 | # pyopenms 77 | # pyprophet 78 | # seaborn 79 | # statsmodels 80 | patsy==1.0.1 81 | # via statsmodels 82 | pillow==12.0.0 83 | # via matplotlib 84 | polars==1.34.0 85 | # via pyprophet 86 | polars-runtime-32==1.34.0 87 | # via polars 88 | psutil==7.1.0 89 | # via pyprophet 90 | pyarrow==21.0.0 91 | # via pyprophet 92 | pyopenms==3.4.0 93 | # via 94 | # easypqp (pyproject.toml) 95 | # pyprophet 96 | pyparsing==3.2.5 97 | # via matplotlib 98 | pypdf==6.4.0 99 | # via pyprophet 100 | pyprophet==3.0.2 101 | # via easypqp (pyproject.toml) 102 | python-dateutil==2.9.0.post0 103 | # via 104 | # matplotlib 105 | # pandas 106 | pytz==2025.2 107 | # via pandas 108 | scikit-learn==1.7.2 109 | # via 110 | # easypqp (pyproject.toml) 111 | # pyprophet 112 | scipy==1.15.3 113 | # via 114 | # easypqp (pyproject.toml) 115 | # pyprophet 116 | # scikit-learn 117 | # statsmodels 118 | # xgboost 119 | seaborn==0.13.2 120 | # via 121 | # easypqp (pyproject.toml) 122 | # pyprophet 123 | six==1.17.0 124 | # via python-dateutil 125 | statsmodels==0.14.5 126 | # via 127 | # easypqp (pyproject.toml) 128 | # pyprophet 129 | tabulate==0.9.0 130 | # via pyprophet 131 | threadpoolctl==3.6.0 132 | # via scikit-learn 133 | tqdm==4.67.1 134 | # via easypqp (pyproject.toml) 135 | typing-extensions==4.15.0 136 | # via pypdf 137 | tzdata==2025.2 138 | # via pandas 139 | xgboost==3.1.0 140 | # via pyprophet 141 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | EasyPQP: Simple library generation for OpenSWATH 2 | ================================================ 3 | 4 | [![CI](https://github.com/grosenberger/easypqp/actions/workflows/ci.yml/badge.svg)](https://github.com/grosenberger/easypqp/actions/workflows/ci.yml) 5 | 6 | EasyPQP is a Python package that provides simplified and fast peptide query parameter generation for OpenSWATH. It can process input from MSFragger, Sage or other database search engines in pepXML/idXML/tsv format. Statistical validation can be conducted either using PyProphet or PeptideProphet/iProphet. 
Retention times and ion mobilities are calibrated using an internal or external standard. In addition to a cumulative library, run-specific libraries are generated for non-linear RT alignment in OpenSWATH. To generate PTM-specific libraries based on a unimod.xml database, you can further restrict the unimod.xml file to the modifications and site specificities of interest. It also supports in-silico library generation.
7 | 
8 | Installation
9 | ============
10 | 
11 | We strongly advise installing EasyPQP in a Python [*virtualenv*](https://virtualenv.pypa.io/en/stable/). EasyPQP is compatible with Python 3.
12 | 
13 | Install the development version of *easypqp* from GitHub:
14 | 
15 | ````
16 | $ pip install git+https://github.com/grosenberger/easypqp.git@master
17 | ````
18 | 
19 | ### Full Installation
20 | 
21 | To install all optional features:
22 | 
23 | ````
24 | $ pip install easypqp[all]
25 | ````
26 | 
27 | This will install the `easypqp_rs` package, which provides in-silico library generation, as well as PyProphet for statistical validation.
28 | 
29 | Running EasyPQP
30 | ===============
31 | 
32 | *EasyPQP* is not only a Python package, but also a command line tool:
33 | 
34 | ````
35 | $ easypqp --help
36 | ````
37 | 
38 | or:
39 | 
40 | ````
41 | $ easypqp convert --help
42 | $ easypqp convertpsm --help
43 | $ easypqp convertsage --help
44 | $ easypqp library --help
45 | $ easypqp insilico-library --help
46 | $ easypqp reduce --help
47 | $ easypqp filter-unimod --help
48 | $ easypqp openswath-assay-generator --help
49 | $ easypqp openswath-decoy-generator --help
50 | $ easypqp targeted-file-converter --help
51 | ````
52 | 
53 | Generating an *In-Silico* Library
54 | =================================
55 | 
56 | The in-silico library generation feature is included if you installed EasyPQP with the `[all]` or `[rust]` extras (which install the `easypqp_rs` package).
57 | 
58 | To generate an in-silico library, use the `insilico-library` command. For example:
59 | 
60 | ````
61 | $ easypqp insilico-library --fasta your_proteome.fasta --output_file insilico_library.tsv
62 | ````
63 | 
64 | For more information on the parameters and the JSON configuration file, see the [Configuration Reference](https://github.com/singjc/easypqp-rs?tab=readme-ov-file#configuration-reference); a minimal example configuration is sketched below.
65 | 
66 | > [!NOTE]
67 | > If no `retention_time`, `ion_mobility`, or `ms2_intensity` fields are provided under `dl_feature_generators` in the config, pretrained models will be automatically downloaded and used. The current default pretrained models are:
68 | > - RT: `rt_cnn_tf` - A CNN-Transformer model trained on the [ProteomicsML repository RT dataset](https://proteomicsml.org/datasets/retentiontime/ProteomeTools_RT.html). This model is based on AlphaPeptDeep's CNN-LSTM implementation, with the biLSTM replaced by a Transformer encoder.
69 | > - CCS: `ccs_cnn_tf` - A CNN-Transformer model trained on the [ProteomicsML repository CCS dataset](https://proteomicsml.org/datasets/ionmobility/Meier_TIMS.html). This model is also based on AlphaPeptDeep's CNN-LSTM implementation, with the biLSTM replaced by a Transformer encoder.
70 | > - MS2: `ms2_bert` - A BERT-based model retrieved from AlphaPeptDeep's pretrained models.
71 | 
72 | If you just want a standalone, portable Rust binary, you can download one from the [easypqp-rs releases page](https://github.com/singjc/easypqp-rs/releases).
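The JSON configuration mirrors the structure of `tests/data/config.json` in this repository. The following is a minimal sketch: the FASTA path and output file are placeholders, the remaining values are taken from that test config, and the full schema is documented in the Configuration Reference above.

````
{
  "database": {
    "enzyme": { "missed_cleavages": 1, "cleave_at": "KR", "restrict": "P" },
    "static_mods": { "C": 57.0215 },
    "max_variable_mods": 2,
    "decoy_tag": "rev_",
    "generate_decoys": true,
    "fasta": "your_proteome.fasta"
  },
  "insilico_settings": {
    "precursor_charge": [2, 4],
    "max_fragment_charge": 1,
    "min_transitions": 6,
    "max_transitions": 6,
    "fragmentation_model": "cid",
    "allowed_fragment_types": ["b", "y"]
  },
  "output_file": "insilico_library.tsv"
}
````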
73 | 74 | Docker 75 | ====== 76 | 77 | EasyPQP is also available from Docker (automated builds): 78 | 79 | Pull the development version of *easypqp* from DockerHub (synced with GitHub): 80 | 81 | ```` 82 | $ docker pull grosenberger/easypqp:latest 83 | ```` 84 | -------------------------------------------------------------------------------- /tests/_regtest_outputs/test_openswathassay_generator.test_openswath_assay_generator.out: -------------------------------------------------------------------------------- 1 | ID PROTEIN_ACCESSION DECOY 2 | 4 0 Q04637 0 3 | 3 1 Q2M2I8 0 4 | 2 2 Q86WB0 0 5 | 1 3 Q8WWI1 0 6 | 0 4 Q92890 0 7 | ID UNMODIFIED_SEQUENCE MODIFIED_SEQUENCE DECOY 8 | 4 0 AGQTQPNPGILPIQPALTPR AGQTQPNPGILPIQPALT(UniMod:21)PR 0 9 | 3 1 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR 0 10 | 2 2 EAALPPVSPLK EAALPPVS(UniMod:21)PLK 0 11 | 1 3 GVEPSPSPIKPGDIK GVEPSPS(UniMod:21)PIKPGDIK 0 12 | 0 4 SMGTGDTPGLEVPSSPLR SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR 0 13 | ID TRAML_ID GROUP_LABEL PRECURSOR_MZ CHARGE LIBRARY_INTENSITY LIBRARY_RT LIBRARY_DRIFT_TIME DECOY 14 | 3 0 AGQTQPNPGILPIQPALT(Phospho)PR_2 1075.0619 2 None 70.6096 -1.0 0 15 | 4 1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 1266.5622 2 None 67.4251 -1.0 0 16 | 0 2 EAALPPVS(Phospho)PLK_2 601.3151 2 None 48.0082 -1.0 0 17 | 1 3 GVEPSPS(Phospho)PIKPGDIK_2 800.9028 2 None 32.9698 -1.0 0 18 | 2 4 SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2 948.9241 2 None 52.9017 -1.0 0 19 | ID TRAML_ID PRODUCT_MZ CHARGE TYPE ANNOTATION ORDINAL DETECTING IDENTIFYING QUANTIFYING LIBRARY_INTENSITY DECOY 20 | 0 0 31640 486.2307 1 b b5^1 5 1 0 1 10000.0000 0 21 | 1 1 31641 697.3264 1 b b7^1 7 1 0 1 7081.5693 0 22 | 2 2 31643 734.3597 1 y y6^1 6 1 0 1 8579.1080 0 23 | 3 3 31644 832.4502 2 y y15^2 15 1 0 1 2923.7356 0 24 | 4 4 31646 964.4847 1 b b10^1 10 1 0 1 3234.0083 0 25 | 5 5 31647 1072.5551 1 y y9^1 9 1 0 1 3853.8560 0 26 | 6 6 42446 400.2303 1 y y3^1 3 1 0 1 7762.3594 0 27 | 7 7 42450 567.2287 1 y y4^1 4 1 0 1 8796.0370 0 28 | 8 8 42457 818.4254 1 b b9^1 9 1 0 1 5399.1875 0 29 | 9 9 42458 866.3768 1 y y7^1 7 1 0 1 6659.5240 0 30 | 10 10 42459 933.4524 1 b b10^1 10 1 0 1 6236.0680 0 31 | 11 11 42468 1139.4729 1 y y10^1 10 1 0 1 3636.2630 0 32 | 12 12 2346 409.2146 2 y y7^2 7 1 0 1 2965.7283 0 33 | 13 13 2347 465.7566 2 y y8^2 8 1 0 1 132.8395 0 34 | 14 14 2349 720.3692 1 y y6^1 6 1 0 1 1580.4800 0 35 | 15 15 2350 817.4219 1 y y7^1 7 1 0 1 10000.0000 0 36 | 16 16 2352 930.5060 1 y y8^1 8 1 0 1 417.7673 0 37 | 17 17 2353 1001.5431 1 y y9^1 9 1 0 1 278.9014 0 38 | 18 18 12006 375.2238 1 y y3^1 3 1 0 1 1621.3933 0 39 | 19 19 12009 529.2980 1 y y5^1 5 1 0 1 10000.0000 0 40 | 20 20 12010 566.2941 2 y y10^2 10 1 0 1 3326.0842 0 41 | 21 21 12012 657.3930 1 y y6^1 6 1 0 1 1924.5614 0 42 | 22 22 12013 658.3365 2 y y12^2 12 1 0 1 4464.7860 0 43 | 23 23 12015 867.5298 1 y y8^1 8 1 0 1 5222.4050 0 44 | 24 24 21489 385.2558 1 y y3^1 3 1 0 1 5179.5244 0 45 | 25 25 21490 393.1438 1 b b4^1 4 1 0 1 2606.7244 0 46 | 26 26 21491 565.1923 1 b b6^1 6 1 0 1 3256.2622 0 47 | 27 27 21493 666.2399 1 b b7^1 7 1 0 1 3735.2622 0 48 | 28 28 21494 736.3389 1 y y6^1 6 1 0 1 10000.0000 0 49 | 29 29 21496 835.4073 1 y y7^1 7 1 0 1 3901.4023 0 50 | -------------------------------------------------------------------------------- /tests/_regtest_outputs/test_targetedfileconverter.test_targeted_file_converter_pqptotsv.out: -------------------------------------------------------------------------------- 1 | Adducts Annotation CollisionEnergy CompoundName Decoy DetectingTransition 
FragmentSeriesNumber FragmentType GeneName IdentifyingTransition LabelType LibraryIntensity ModifiedPeptideSequence NormalizedRetentionTime PeptideGroupLabel PeptideSequence Peptidoforms PrecursorCharge PrecursorIonMobility PrecursorMz ProductCharge ProductMz ProteinId QuantifyingTransition SMILES SumFormula TransitionGroupId TransitionId UniprotId 2 | 0 NaN y2^1 -1.0 NaN 0 1 2 y EIF4G1 0 NaN 81.9345 EAALPPVS(UniMod:21)PLK 48.0082 NaN EAALPPVSPLK NaN 2 -1.0 601.3151 1 260.1969 Q04637 1 NaN NaN EAALPPVS(Phospho)PLK_2 2345 NaN 3 | 1 NaN y7^2 -1.0 NaN 0 1 7 y EIF4G1 0 NaN 2965.7283 EAALPPVS(UniMod:21)PLK 48.0082 NaN EAALPPVSPLK NaN 2 -1.0 601.3151 2 409.2146 Q04637 1 NaN NaN EAALPPVS(Phospho)PLK_2 2346 NaN 4 | 2 NaN y8^2 -1.0 NaN 0 1 8 y EIF4G1 0 NaN 132.8395 EAALPPVS(UniMod:21)PLK 48.0082 NaN EAALPPVSPLK NaN 2 -1.0 601.3151 2 465.7566 Q04637 1 NaN NaN EAALPPVS(Phospho)PLK_2 2347 NaN 5 | 3 NaN y5^1 -1.0 NaN 0 1 5 y EIF4G1 0 NaN 101.3607 EAALPPVS(UniMod:21)PLK 48.0082 NaN EAALPPVSPLK NaN 2 -1.0 601.3151 1 623.3164 Q04637 1 NaN NaN EAALPPVS(Phospho)PLK_2 2348 NaN 6 | 4 NaN y6^1 -1.0 NaN 0 1 6 y EIF4G1 0 NaN 1580.4800 EAALPPVS(UniMod:21)PLK 48.0082 NaN EAALPPVSPLK NaN 2 -1.0 601.3151 1 720.3692 Q04637 1 NaN NaN EAALPPVS(Phospho)PLK_2 2349 NaN 7 | .. ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 8 | 73 NaN b13^1 -1.0 NaN 0 1 13 b LMO7 0 NaN 890.7413 ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR 67.4251 NaN ATLSSTSGLDLMSESGEGEISPQR NaN 2 -1.0 1266.5622 1 1264.6090 Q8WWI1 1 NaN NaN ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42472 NaN 9 | 74 NaN y11^1 -1.0 NaN 0 1 11 y LMO7 0 NaN 1830.4344 ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR 67.4251 NaN ATLSSTSGLDLMSESGEGEISPQR NaN 2 -1.0 1266.5622 1 1268.5155 Q8WWI1 1 NaN NaN ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42473 NaN 10 | 75 NaN y12^1 -1.0 NaN 0 1 12 y LMO7 0 NaN 2691.2388 ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR 67.4251 NaN ATLSSTSGLDLMSESGEGEISPQR NaN 2 -1.0 1266.5622 1 1355.5475 Q8WWI1 1 NaN NaN ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42474 NaN 11 | 76 NaN b14^1 -1.0 NaN 0 1 14 b LMO7 0 NaN 870.2799 ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR 67.4251 NaN ATLSSTSGLDLMSESGEGEISPQR NaN 2 -1.0 1266.5622 1 1393.6515 Q8WWI1 1 NaN NaN ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42475 NaN 12 | 77 NaN y13^1 -1.0 NaN 0 1 13 y LMO7 0 NaN 851.3514 ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR 67.4251 NaN ATLSSTSGLDLMSESGEGEISPQR NaN 2 -1.0 1266.5622 1 1486.5880 Q8WWI1 1 NaN NaN ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42476 NaN 13 | 14 | [78 rows x 29 columns] 15 | -------------------------------------------------------------------------------- /easypqp/unimoddb.py: -------------------------------------------------------------------------------- 1 | import click 2 | from tqdm import tqdm 3 | 4 | # Unimod parsing 5 | import xml.etree.cElementTree as ET 6 | 7 | 8 | def site_validation(site_input): 9 | """ 10 | Perform a check to ensure inputs are valid 11 | Arguments: 12 | site_input: (list) list of amino acid residues, or terminal notation, or wild card notation (*). 13 | Returns: 14 | Nothing is returned. An error is raised if the input contains a non-valid site. 
15 | """ 16 | acceptable_sites = ["A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V", 'U', 'O', '[', ']', 'n', 'c', '*'] 17 | site_check = [site not in acceptable_sites for site in site_input] 18 | if any(site_check): 19 | raise click.ClickException( f"Incorrect site specificity input, site(s) {', '.join([i for (i, v) in zip(site_input, site_check) if v])} is not valid. Acceptable sites: {', '.join(acceptable_sites)}") 20 | 21 | def site_specificity_transform(site_input): 22 | """ 23 | Transform input site to return the site and position. Transforms terminal notation to site notation in unimod.xml and whether its any terminal site or a protein terminal site. 24 | Arguments: 25 | site_input: (list) list of amino acid residues, or terminal notation, or wild card notation (*). 26 | Returns: 27 | Returns a tuple of list of sites and list of positions 28 | """ 29 | # Site and Position Mapping 30 | terminal_map = {'[':'N-term', ']':'C-term', 'n':'N-term', 'c':'C-term'} 31 | site_position_map = {'[':'Protein N-term', ']':'Protein C-term', 'n':'Any N-term', 'c':'Any C-term'} 32 | # Split sites 33 | site_input = [site for site in site_input] 34 | site_validation(site_input) 35 | sites=[]; positions=[] 36 | for site in site_input: 37 | if site in terminal_map.keys(): 38 | sites.append(terminal_map[site]) 39 | positions.append(site_position_map[site]) 40 | elif site=="*": 41 | sites.append("*") 42 | positions.append("*") 43 | else: 44 | sites.append(site) 45 | positions.append("Anywhere") 46 | return sites, positions 47 | 48 | def unimod_filter(unimod_file, out_file, accession_ids, site_specificity): 49 | """ 50 | Filter an input unimod to restrict for specific modifications and site specificities 51 | Arguments: 52 | unimod_file: (str) path/filename of input unimod.xml file. 53 | out_file: (str) path/filename to write out new filtered unimod.xml file 54 | accession_ids: (list) list of unimod accession ids to restrict for. i.e. ['1','21','35] 55 | site_specificity: (list) list of site specificties to further restrict corresponding unimod for. i.e. ['n','STY','M], will restrict acetylation for any N-Term, phosphorylation for serine, threonine, and tyrosine, and oxidation for methionine. 56 | Returns: 57 | Nothing is returned. The restricted unimod database is written to the out_file. 
58 | """ 59 | # Register Namespace 60 | ET.register_namespace('umod', 'http://www.unimod.org/xmlns/schema/unimod_2') 61 | 62 | # Read in unimod XML database 63 | click.echo(f"INFO: Loading XML data from {unimod_file}") 64 | tree = ET.parse(unimod_file) 65 | root = tree.getroot() 66 | 67 | # Namespace 68 | ns = {'umod':'http://www.unimod.org/xmlns/schema/unimod_2'} 69 | 70 | # Generate root for new filtered unimod XML 71 | root_out = ET.Element(root.tag, root.attrib) 72 | 73 | # Append elements subelements 74 | umod_elements = root.findall("umod:elements", ns) 75 | root_out.append(umod_elements[0]) 76 | 77 | # Append desired modifications 78 | mod_entries = root.findall('umod:modifications', ns)[0] 79 | mod_out = ET.Element(mod_entries.tag, mod_entries.attrib) 80 | i=0 81 | pbar = tqdm(accession_ids) 82 | pbar_desc = "INFO: Restricting" 83 | for record_id in pbar: 84 | add_unimod_entry = mod_entries.findall(f"./umod:mod/[@record_id='{record_id}']", ns)[0] 85 | if site_specificity is not None: 86 | site, position = site_specificity_transform(site_specificity[i]) 87 | # Update progess bar description 88 | pbar_desc = f"INFO: Restricting..{add_unimod_entry.attrib.get('title')}({','.join(site)})" 89 | pbar.set_description(pbar_desc) 90 | if site != "*": 91 | for unimod_site in add_unimod_entry.findall(f"./umod:specificity", ns): 92 | if unimod_site.attrib['site'] in site and unimod_site.attrib['position'] in position: 93 | # If current specificity element is a requested one, continue on 94 | continue 95 | else: 96 | # Remove specificities that do not match requested specificities 97 | add_unimod_entry.remove(unimod_site) 98 | else: 99 | # Update progess bar description 100 | pbar_desc = f"INFO: Restricting..{add_unimod_entry.attrib.get('title')}" 101 | pbar.set_description(pbar_desc) 102 | # click.echo(f"INFO: Appending to filtered unimod XML - title={add_unimod_entry.attrib.get('title')} with record_id={add_unimod_entry.attrib.get('record_id')}") 103 | mod_out.append( add_unimod_entry ) 104 | i+=1 105 | root_out.append(mod_out) 106 | 107 | # Append amino acids 108 | umod_amino_acids = root.findall("umod:amino_acids", ns) 109 | root_out.append(umod_amino_acids[0]) 110 | 111 | # Append mod bricks 112 | umod_mod_bricks = root.findall("umod:mod_bricks", ns) 113 | root_out.append(umod_mod_bricks[0]) 114 | 115 | # Generate element hierarchy to write out to xml 116 | tree_out = ET.ElementTree(root_out) 117 | # For Pretty-Printing 118 | ET.indent(tree_out, ' ') 119 | # Write out filtered unimod xml database 120 | click.echo(f"INFO: Writing out filtered unimod XML file to {out_file}") 121 | tree_out.write(out_file, encoding="UTF-8", xml_declaration=True, method="xml") 122 | 123 | # Insert Top Comment 124 | # TODO: This may not be the best way to add the top level comment in standard unimod.xml database files. 
Might be able to use lxml instead, requiring an additional dependency 125 | with open(out_file, 'r+', encoding="utf-8") as file_handle: 126 | lines = file_handle.readlines() 127 | lines.insert(1, "\n\n\n") # you can use any index if you know the line index 128 | file_handle.seek(0) 129 | file_handle.writelines(lines) -------------------------------------------------------------------------------- /tests/test_insilico_library.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import subprocess 5 | import shutil 6 | import sys 7 | 8 | import pandas as pd 9 | import pytest 10 | 11 | pd.options.display.expand_frame_repr = False 12 | pd.options.display.precision = 4 13 | pd.options.display.max_columns = None 14 | 15 | DATA_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") 16 | 17 | # Check if insilico feature is available 18 | try: 19 | from easypqp_rs import generate_insilico_library # noqa: F401 20 | 21 | HAS_RUST_BACKEND = True 22 | except ImportError: 23 | HAS_RUST_BACKEND = False 24 | 25 | 26 | def _run_cmdline(cmdline): 27 | stdout = cmdline + "\n" 28 | try: 29 | stdout += str( 30 | subprocess.check_output(cmdline, shell=True, stderr=subprocess.STDOUT) 31 | ) 32 | except subprocess.CalledProcessError as error: 33 | print(error, end="", file=sys.stderr) 34 | print( 35 | "Command output:", 36 | error.output.decode() if error.output else "No output", 37 | file=sys.stderr, 38 | ) 39 | raise 40 | return stdout 41 | 42 | 43 | def _run_insilico_library(regtest, temp_folder): 44 | os.chdir(temp_folder) 45 | config_path = os.path.join(DATA_FOLDER, "config.json") 46 | fasta_path = os.path.join(DATA_FOLDER, "Q99536.fasta") 47 | 48 | # Copy test files to temp directory 49 | shutil.copy(config_path, temp_folder) 50 | shutil.copy(fasta_path, temp_folder) 51 | 52 | # Update config to use local paths in temp folder 53 | import json 54 | 55 | with open("config.json", "r") as f: 56 | config = json.load(f) 57 | 58 | # Update paths to be relative to temp folder 59 | config["database"]["fasta"] = "Q99536.fasta" 60 | config["output_file"] = "easypqp_insilico_library.tsv" 61 | 62 | with open("config.json", "w") as f: 63 | json.dump(config, f, indent=2) 64 | 65 | cmdline = "easypqp insilico-library --config config.json" 66 | 67 | _run_cmdline(cmdline) 68 | 69 | # Read and verify the output TSV file 70 | output_file = "easypqp_insilico_library.tsv" 71 | assert os.path.exists(output_file), f"Output file {output_file} was not created" 72 | 73 | library_df = pd.read_csv(output_file, sep="\t") 74 | 75 | # Print basic statistics about the generated library 76 | print(f"Generated library contains {len(library_df)} transitions", file=regtest) 77 | 78 | # Use TransitionGroupId for unique precursors if available, otherwise compute from other columns 79 | if "TransitionGroupId" in library_df.columns: 80 | num_precursors = library_df["TransitionGroupId"].nunique() 81 | else: 82 | # Compute unique precursors from PrecursorMz, PrecursorCharge, and PeptideSequence 83 | num_precursors = library_df.groupby( 84 | ["PrecursorMz", "PrecursorCharge", "PeptideSequence"] 85 | ).ngroups 86 | 87 | print(f"Number of unique precursors: {num_precursors}", file=regtest) 88 | 89 | # Use appropriate column for peptide count 90 | peptide_col = ( 91 | "ModifiedPeptideSequence" 92 | if "ModifiedPeptideSequence" in library_df.columns 93 | else "PeptideSequence" 94 | ) 95 | print( 96 | f"Number of unique peptides: 
{library_df[peptide_col].nunique()}", file=regtest 97 | ) 98 | 99 | # Print column names 100 | print(f"\nColumns: {list(library_df.columns)}", file=regtest) 101 | 102 | # Round LibraryIntensity to make test more stable (DL predictions can vary slightly) 103 | # Keep only deterministic columns for display 104 | display_df = library_df.head().copy() 105 | if "LibraryIntensity" in display_df.columns: 106 | display_df["LibraryIntensity"] = display_df["LibraryIntensity"].round(0) 107 | 108 | # Sort by ProductMz to ensure consistent ordering across runs 109 | display_df = display_df.sort_values("ProductMz").reset_index(drop=True) 110 | 111 | # Print a sample of the data (first few rows) - excluding non-deterministic columns 112 | print("\nFirst 5 transitions (deterministic columns only):", file=regtest) 113 | deterministic_cols = [ 114 | "PrecursorMz", 115 | "ProductMz", 116 | "PrecursorCharge", 117 | "ProductCharge", 118 | "PeptideSequence", 119 | "ModifiedPeptideSequence", 120 | "ProteinId", 121 | "UniprotId", 122 | "GeneName", 123 | "FragmentType", 124 | "FragmentSeriesNumber", 125 | "Annotation", 126 | "TransitionGroupId", 127 | "TransitionId", 128 | "Decoy", 129 | ] 130 | available_cols = [col for col in deterministic_cols if col in display_df.columns] 131 | print(display_df[available_cols].to_string(), file=regtest) 132 | 133 | # Verify core columns exist (using actual column names from the output) 134 | core_columns = [ 135 | "PrecursorMz", 136 | "ProductMz", 137 | "PrecursorCharge", 138 | "ProductCharge", 139 | "LibraryIntensity", 140 | "PeptideSequence", 141 | "ProteinId", 142 | "FragmentType", 143 | "FragmentSeriesNumber", 144 | "Annotation", 145 | ] 146 | 147 | missing_columns = [col for col in core_columns if col not in library_df.columns] 148 | if missing_columns: 149 | print(f"\nWarning: Missing core columns: {missing_columns}", file=regtest) 150 | 151 | # Print some statistics 152 | print("\nStatistics:", file=regtest) 153 | print( 154 | f" Precursor charge range: {library_df['PrecursorCharge'].min()}-{library_df['PrecursorCharge'].max()}", 155 | file=regtest, 156 | ) 157 | print( 158 | f" Fragment types: {sorted(library_df['FragmentType'].unique())}", file=regtest 159 | ) 160 | 161 | # Check for decoys using the Decoy column 162 | if "Decoy" in library_df.columns: 163 | print(f" Contains decoys: {library_df['Decoy'].sum() > 0}", file=regtest) 164 | print(f" Number of targets: {(library_df['Decoy'] == 0).sum()}", file=regtest) 165 | print(f" Number of decoys: {(library_df['Decoy'] == 1).sum()}", file=regtest) 166 | else: 167 | print(" Decoy column not found in output", file=regtest) 168 | 169 | # Verify LibraryIntensity values are reasonable (not in regtest output due to variance) 170 | if "LibraryIntensity" in library_df.columns: 171 | intensity_stats = library_df["LibraryIntensity"].describe() 172 | # Only assert, don't print to regtest to avoid flakiness 173 | assert intensity_stats["min"] >= 0, "LibraryIntensity should be non-negative" 174 | assert intensity_stats["max"] <= 10001, ( 175 | "LibraryIntensity should be normalized to ~10000" 176 | ) 177 | 178 | 179 | @pytest.mark.skipif( 180 | not HAS_RUST_BACKEND, 181 | reason="In-silico feature not installed (easypqp_rs package missing - reinstall easypqp)", 182 | ) 183 | def test_insilico_library(tmpdir, regtest): 184 | _run_insilico_library(regtest, tmpdir.strpath) 185 | -------------------------------------------------------------------------------- /easypqp/openswathdecoygenerator.py: 
--------------------------------------------------------------------------------
 1 | import pyopenms as po
 2 | import click
 3 | from typing import Any, Union, Tuple
 4 | 
 5 | from .targetedfileconverter import TargetedExperiment
 6 | 
 7 | def check_argument_values(arg_name: str, arg_value: Any, expected_type: Tuple[Union[type, None], Union[Tuple, None]]) -> None:
 8 |     """
 9 |     Check if the given argument value is of the expected type and value range (if applicable).
10 |     Raise a TypeError or ValueError if the value is invalid.
11 |     """
12 |     expected_type, expected_range = expected_type
13 |     if isinstance(expected_type, list) and None in expected_type:
14 |         pass
15 |     elif not isinstance(arg_value, expected_type):
16 |         raise TypeError(f"{arg_name} should be of type {expected_type.__name__}, not type {arg_value.__class__}.")
17 |     if expected_range is not None:
18 |         # Handle numeric range
19 |         if isinstance(expected_range, tuple) and len(expected_range) == 2:
20 |             if not (expected_range[0] <= arg_value <= expected_range[1]):
21 |                 raise ValueError(f"{arg_name} should be within the range {expected_range}, cannot accept {arg_value}.")
22 |         elif isinstance(expected_range, list) and arg_value not in expected_range:
23 |             raise ValueError(f"{arg_name} should be one of {expected_range}, cannot accept '{arg_value}'.")
24 | 
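# Illustration of the validator's behavior (hypothetical calls, not part of the original module):
#   check_argument_values("min_decoy_fraction", 0.8, (float, (0, 1)))       # passes silently
#   check_argument_values("min_decoy_fraction", 1.5, (float, (0, 1)))       # raises ValueError (outside range)
#   check_argument_values("method", "swap", (str, ['shuffle', 'reverse']))  # raises ValueError (not in list)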
25 | class OpenSwathDecoyGenerator(TargetedExperiment):
26 |     def __init__(self,
27 |                  infile: str,
28 |                  outfile: str="library.pqp",
29 |                  in_type: Union[str, None]=None,
30 |                  out_type: Union[str, None]=None,
31 |                  method: str="shuffle",
32 |                  decoy_tag: str="DECOY_",
33 |                  min_decoy_fraction: float=0.8,
34 |                  aim_decoy_fraction: float=1.0,
35 |                  shuffle_max_attempts: int=30,
36 |                  shuffle_sequence_identity_threshold: float=0.5,
37 |                  shift_precursor_mz_shift: float=0.0,
38 |                  shift_product_mz_shift: float=20.0,
39 |                  product_mz_threshold: float=0.025,
40 |                  allowed_fragment_types: str="b,y",
41 |                  allowed_fragment_charges: str="1,2,3,4",
42 |                  enable_detection_specific_losses: bool=False,
43 |                  enable_detection_unspecific_losses: bool=False,
44 |                  switchKR: bool=True,
45 |                  separate: bool=False) -> None:
46 |         super().__init__(True)
47 | 
48 |         # Validate arguments
49 |         check_argument_values("infile", infile, (str, None))
50 |         check_argument_values("outfile", outfile, (str, None))
51 |         # Handle types
52 |         if in_type is None:
53 |             in_type = self._get_file_type(infile)
54 |         if out_type is None:
55 |             out_type = self._get_file_type(outfile)
56 |         check_argument_values("in_type", in_type, ([str, None], ['tsv', 'mrm', 'pqp', 'TraML']))
57 |         check_argument_values("out_type", out_type, ([str, None], ['tsv', 'pqp', 'TraML']))
58 |         check_argument_values("method", method, (str, ['shuffle', 'pseudo-reverse', 'reverse', 'shift']))
59 |         check_argument_values("decoy_tag", decoy_tag, (str, None))
60 |         check_argument_values("min_decoy_fraction", min_decoy_fraction, (float, (0, 1)))
61 |         check_argument_values("aim_decoy_fraction", aim_decoy_fraction, (float, (0, 1)))
62 |         check_argument_values("shuffle_max_attempts", shuffle_max_attempts, (int, None))
63 |         check_argument_values("shuffle_sequence_identity_threshold", shuffle_sequence_identity_threshold, (float, (0, 1)))
64 |         check_argument_values("shift_precursor_mz_shift", shift_precursor_mz_shift, (float, None))
65 |         check_argument_values("shift_product_mz_shift", shift_product_mz_shift, (float, None))
66 |         check_argument_values("product_mz_threshold", product_mz_threshold, (float, None))
67 |         check_argument_values("allowed_fragment_types", allowed_fragment_types, (str, None)) # TODO: Add value check to ensure valid fragment types
68 |         check_argument_values("allowed_fragment_charges", allowed_fragment_charges, (str, None)) # TODO: Add value check to ensure ints are in string of charges
69 |         check_argument_values("enable_detection_specific_losses", enable_detection_specific_losses, (bool, None))
70 |         check_argument_values("enable_detection_unspecific_losses", enable_detection_unspecific_losses, (bool, None))
71 |         check_argument_values("switchKR", switchKR, (bool, None))
72 |         check_argument_values("separate", separate, (bool, None))
73 | 
74 |         # TODO: Move this up before argument validation for specific arg?
75 |         # Transform string
76 |         allowed_fragment_types = allowed_fragment_types.split(",")
77 |         allowed_fragment_types = [s.encode('utf-8') for s in allowed_fragment_types]
78 |         allowed_fragment_charges = allowed_fragment_charges.split(",")
79 |         allowed_fragment_charges = [int(charge) for charge in allowed_fragment_charges]
80 | 
81 |         # Assign values to self
82 |         for name, value in locals().items():
83 |             if name != 'self':
84 |                 # print(f"Info: Setting {name} = {value}")
85 |                 setattr(self, name, value)
86 | 
87 |         # Load target experiment
88 |         self.load_library(self.infile, self.in_type)
89 | 
90 |     def generate_decoys(self) -> None:
91 |         # Initiate decoy experiment
92 |         self.tr_decoy = po.TargetedExperiment()
93 | 
94 |         # Generate decoys
95 |         decoys = po.MRMDecoy()
96 |         decoys.generateDecoys(self.tr_exp, self.tr_decoy, self.method, self.aim_decoy_fraction, self.switchKR, self.decoy_tag, self.shuffle_max_attempts, self.shuffle_sequence_identity_threshold, self.shift_precursor_mz_shift, self.shift_product_mz_shift, self.product_mz_threshold, self.allowed_fragment_types, self.allowed_fragment_charges, self.enable_detection_specific_losses, self.enable_detection_unspecific_losses, -4)
97 | 
98 |         click.echo(f"Info: Number of target peptides: {len(self.tr_exp.getPeptides())}")
99 |         click.echo(f"Info: Number of decoy peptides: {len(self.tr_decoy.getPeptides())}")
100 |         click.echo(f"Info: Number of target proteins: {len(self.tr_exp.getProteins())}")
101 |         click.echo(f"Info: Number of decoy proteins: {len(self.tr_decoy.getProteins())}")
102 | 
103 |         if len(self.tr_decoy.getPeptides()) / len(self.tr_exp.getPeptides()) < self.min_decoy_fraction or len(self.tr_decoy.getProteins()) / len(self.tr_exp.getProteins()) < self.min_decoy_fraction:
104 |             raise click.ClickException(f"The number of decoys for peptides or proteins is below the threshold of {(self.min_decoy_fraction * 100)}% of the number of targets.")
105 | 
106 |         if self.separate:
107 |             click.echo(f"Info: Writing only decoys to file: {self.outfile}")
108 |             self.tr_exp = self.tr_decoy
109 |         else:
110 |             click.echo(f"Info: Writing targets and decoys to file: {self.outfile}")
111 |             self.tr_exp += self.tr_decoy
112 | 
113 |         self.write_library(self.outfile, self.out_type)
--------------------------------------------------------------------------------
/easypqp/targetedfileconverter.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import os
 3 | import pandas as pd
 4 | import pyopenms as po
 5 | import ctypes
 6 | import click
 7 | from typing import Union
 8 | 
 9 | class TargetedExperiment:
10 |     """
11 |     Class to load and write an OpenMS TargetedExperiment
12 |     """
13 |     def __init__(self, legacy_traml_id: bool=True) -> None:
14 |         self.legacy_traml_id = legacy_traml_id
15 |         self.tr_exp = po.TargetedExperiment()
16 |         self.file_types = po.FileTypes()
17 | 
18 |     def _validate_type(self, file: str, file_type: str) -> None:
19 |         """Method to ensure filetype is a known OpenMS compatible transition list file type."""
20 |         if self.file_types.nameToType(file_type) == po.FileType.UNKNOWN and file_type != 'parquet':
21 |             raise click.FileError(filename=file, hint=f"Error: Could not determine file type! {file}")
22 | 
23 |     def _get_file_type(self, infile) -> str:
24 |         """Method to get file type extension from file."""
25 |         return os.path.splitext(infile)[-1].split('.')[-1]
26 | 
27 |     def _get_file_type_id(self, file_type: str) -> int:
28 |         """Method to get file type id as annotated in OpenMS filetype database."""
29 |         return self.file_types.nameToType(file_type)-1
30 | 
31 |     def load_library(self, infile: str, in_type: Union[str, None]=None) -> None:
32 |         """
33 |         Method to load data from input transition list into an OpenMS TargetedExperiment Object
34 | 
35 |         Parameters:
36 |           infile: (str) input transition list file to load
37 |           in_type: (str|None) input file type. Default: None. Will be inferred from infile
38 |         """
39 |         if in_type is None:
40 |             in_type = self._get_file_type(infile)
41 |         self._validate_type(infile, in_type)
42 |         # Convert infile str to ctype c char
43 |         c_in_file = ctypes.create_string_buffer(infile.encode())
44 |         if self._get_file_type_id(in_type) == po.FileType.TSV or self._get_file_type_id(in_type) == po.FileType.MRM:
45 |             click.echo("Info: Reading TSV transition list data...")
46 |             tsv_reader = po.TransitionTSVFile()
47 |             tsv_reader.convertTSVToTargetedExperiment(c_in_file.value, self._get_file_type_id(in_type), self.tr_exp)
48 |             tsv_reader.validateTargetedExperiment(self.tr_exp)
49 | 
50 |         elif self._get_file_type_id(in_type) == po.FileType.PQP:
51 |             click.echo("Info: Reading PQP transition list data...")
52 |             pqp_reader = po.TransitionPQPFile()
53 |             pqp_reader.convertPQPToTargetedExperiment(c_in_file.value, self.tr_exp, self.legacy_traml_id)
54 |             pqp_reader.validateTargetedExperiment(self.tr_exp)
55 | 
56 |         elif self._get_file_type_id(in_type) == po.FileType.TRAML:
57 |             click.echo("Info: Reading TraML transition list data...")
58 |             traml_reader = po.TraMLFile()
59 |             traml_reader.load(c_in_file.value, self.tr_exp)
60 | 
61 |         click.echo(f"Info: Loaded {len(self.tr_exp.getCompounds())} Compounds, {len(self.tr_exp.getProteins())} Proteins, {len(self.tr_exp.getPeptides())} Peptides, and {len(self.tr_exp.getTransitions())} Transitions")
62 | 
63 |     def write_library(self, outfile: str, out_type: Union[str, None]=None) -> None:
64 |         """
65 |         Method to write data from an OpenMS TargetedExperiment Object to disk
66 | 
67 |         Parameters:
68 |           outfile: (str) output transition list file to write
69 |           out_type: (str|None) output file type. Default: None. Will be inferred from outfile
70 |         """
71 |         if out_type is None:
72 |             out_type = self._get_file_type(outfile)
73 |         self._validate_type(outfile, out_type)
74 |         # Convert outfile str to ctype c char
75 |         c_out_file = ctypes.create_string_buffer(outfile.encode())
76 |         if self._get_file_type_id(out_type) == po.FileType.TSV:
77 |             click.echo("Info: Writing TSV transition list data to disk...")
78 |             tsv_writer = po.TransitionTSVFile()
79 |             self.tr_exp.getPeptides()
80 |             tsv_writer.convertTargetedExperimentToTSV(c_out_file.value, self.tr_exp)
81 | 
82 |         elif self._get_file_type_id(out_type) == po.FileType.PQP:
83 |             click.echo("Info: Writing PQP transition list data to disk...")
84 |             pqp_writer = po.TransitionPQPFile()
85 |             pqp_writer.convertTargetedExperimentToPQP(c_out_file.value, self.tr_exp)
86 | 
87 |         elif self._get_file_type_id(out_type) == po.FileType.TRAML:
88 |             click.echo("Info: Writing TraML transition list data to disk...")
89 |             traml_writer = po.TraMLFile()
90 |             traml_writer.store(c_out_file.value, self.tr_exp)
91 | 
92 | 
93 | 
94 | class TargetedFileConverter(TargetedExperiment):
95 |     '''
96 |     TargetedFileConverter
97 | 
98 |     Converts different spectral libraries / transition files for targeted proteomics and metabolomics analysis.
99 | 
100 |     Can convert multiple formats to and from TraML (standardized transition format). The following formats are supported:
101 | 
102 |     - @ref OpenMS::TraMLFile "TraML"
103 |     - @ref OpenMS::TransitionTSVFile "OpenSWATH TSV transition lists"
104 |     - @ref OpenMS::TransitionPQPFile "OpenSWATH PQP SQLite files"
105 |     - SpectraST MRM transition lists
106 |     - Skyline transition lists
107 |     - Spectronaut transition lists
108 |     - Parquet transition lists
109 |     '''
110 | 
111 |     def __init__(self, infile: str, outfile: str="library.pqp", in_type: Union[str, None]=None, out_type: Union[str, None]=None, legacy_traml_id: bool=True) -> None:
112 |         super().__init__(legacy_traml_id)
113 |         self.infile = infile
114 |         self.outfile = outfile
115 | 
116 |         # Handle types
117 |         if in_type is None:
118 |             in_type = self._get_file_type(self.infile)
119 |         self.in_type = in_type
120 |         if out_type is None:
121 |             out_type = self._get_file_type(self.outfile)
122 |         self.out_type = out_type
123 | 
124 |     def convert(self) -> None:
125 |         """Method for converting between spectral library formats"""
126 |         # If input is parquet, need to write out a temporary tsv to consume for conversion
127 |         if self.in_type == 'parquet':
128 |             tr_list = pd.read_parquet(self.infile)
129 |             # Write out a temp tsv file for loading into a TargetedExperiment Object
130 |             temp_in_tsv = f"{os.path.splitext(self.infile)[0]}.tsv"
131 |             tr_list.to_csv(temp_in_tsv, sep="\t")
132 |             # Save original infile information
133 |             self.infile_parquet = self.infile
134 |             self.in_type_parquet = self.in_type
135 |             # Overwrite original infile information with TSV information
136 |             self.infile = temp_in_tsv
137 |             self.in_type = "tsv"
138 | 
139 |         # Read Input into TargetedExperiment
140 |         self.load_library(self.infile, self.in_type)
141 | 
142 |         # Write TargetedExperiment to Output
143 |         self.write_library(self.outfile, self.out_type)
144 | 
145 |         # Clean Up
146 |         if hasattr(self, 'in_type_parquet') and self.out_type != 'tsv':
147 |             os.remove(self.infile)
148 |             self.infile = self.infile_parquet
149 | 
150 |         click.echo(f"Info: Finished converting {self.infile} to {self.outfile}")
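
# A minimal usage sketch of the converter (the file names below are illustrative,
# not part of the original module; in- and output types are inferred from the extensions):
if __name__ == "__main__":
    converter = TargetedFileConverter("transitions.tsv", "library.pqp")
    converter.convert()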
--------------------------------------------------------------------------------
/easypqp/openswathassaygenerator.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pandas as pd
 3 | import pyopenms as po
 4 | import ctypes
 5 | import click
 6 | from typing import Any, Union, Tuple
 7 | 
 8 | from .targetedfileconverter import TargetedExperiment
 9 | 
10 | def check_argument_values(arg_name: str, arg_value: Any, expected_type: Tuple[Union[type, None], Union[Tuple, None]]) -> None:
11 |     """
12 |     Check if the given argument value is of the expected type and value range (if applicable).
13 |     Raise a TypeError or ValueError if the value is invalid.
14 |     """
15 |     expected_type, expected_range = expected_type
16 |     if isinstance(expected_type, list) and None in expected_type:
17 |         pass
18 |     elif not isinstance(arg_value, expected_type):
19 |         raise TypeError(f"{arg_name} should be of type {expected_type.__name__}, not type {arg_value.__class__}.")
20 |     if expected_range is not None:
21 |         # Handle numeric range
22 |         if isinstance(expected_range, tuple) and len(expected_range) == 2:
23 |             if not (expected_range[0] <= arg_value <= expected_range[1]):
24 |                 raise ValueError(f"{arg_name} should be within the range {expected_range}, cannot accept {arg_value}.")
25 |         elif isinstance(expected_range, list) and arg_value not in expected_range:
26 |             raise ValueError(f"{arg_name} should be one of {expected_range}, cannot accept '{arg_value}'.")
27 | 
28 | def check_fragment_type(input_str: str):
29 |     possible_fragment_types = ['b','y','a','x','c','z']
30 |     if input_str not in possible_fragment_types:
31 |         raise ValueError(f"{input_str} is not one of the possible fragment types {possible_fragment_types}")
32 | 
33 | def string_to_list(input_str: str, output_type: type):
34 |     str_list = input_str.split(",")
35 |     ret_list = []
36 |     for s in str_list:
37 |         if output_type == bytes:
38 |             check_fragment_type(s)
39 |             convert = bytes(s, encoding='utf-8')
40 |         else:
41 |             convert = int(s)
42 |         ret_list.append(convert)
43 | 
44 |     return ret_list
45 | 
46 | def read_swath_file(file: str):
47 |     click.echo("Validating provided Swath windows file:")
48 |     swath_window_loader = po.SwathWindowLoader()
49 |     swath_prec_lower = []
50 |     swath_prec_upper = []
51 |     ret_val = []
52 |     swath_window_loader.readSwathWindows(file, swath_prec_lower, swath_prec_upper)
53 |     click.echo("Read Swath maps file with %s windows" % str(len(swath_prec_lower)))
54 |     for idx, s in enumerate(swath_prec_lower):
55 |         current_win = []
56 |         current_win.append(s)
57 |         current_win.append(swath_prec_upper[idx])
58 |         click.echo("Read lower swath window %s and upper window %s" % (s, swath_prec_upper[idx]))
59 |         ret_val.append(current_win)
60 |     return ret_val
61 | 
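# Illustration of the string-splitting helper above (hypothetical calls, not part of the original module):
#   string_to_list("b,y", bytes)   -> [b'b', b'y']      (fragment types are validated, then encoded)
#   string_to_list("1,2,3,4", int) -> [1, 2, 3, 4]      (charges are parsed as integers)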
62 | def read_unimod_file(unimod_file):
63 |     ### TODO
64 |     return None
65 |     # mods_database = po.ModificationsDB(unimod_file)
66 | 
67 |     # click.echo("Unimod XML: %s modification types and residue specificities imported from file: %s" % (mods_database.getNumberOfModifications(), unimod_file))
68 | 
69 | class OpenSwathAssayGenerator(TargetedExperiment):
70 |     def __init__(self, infile, in_type, outfile, out_type, min_transitions, max_transitions, allowed_fragment_type, allowed_fragment_charges, enable_detection_specific_losses, enable_detection_unspecific_losses, precursor_mz_threshold, precursor_lower_mz_limit,
71 |                  precursor_upper_mz_limit, product_mz_threshold, product_lower_mz_limit, product_upper_mz_limit, swath_windows_file, unimod_file, enable_ipf, max_num_alternative_localizations, disable_identification_ms2_precursors, disable_identification_specific_losses, enable_identification_unspecific_losses, enable_swath_specifity) -> None:
72 |         super().__init__(True)
73 | 
74 |         self.infile = infile
75 |         self.in_type = in_type
76 | 
77 |         self.outfile = outfile
78 |         self.out_type = out_type
79 | 
80 |         self.min_transitions = min_transitions
81 |         self.max_transitions = max_transitions
82 | 
83 |         self.allowed_fragment_type = string_to_list(allowed_fragment_type, bytes)
84 | 
85 |         self.allowed_fragment_charges = string_to_list(allowed_fragment_charges, int) ### TODO: check valid fragment charges
86 | 
87 |         self.enable_detection_specific_losses = enable_detection_specific_losses
88 |         self.enable_detection_unspecific_losses = enable_detection_unspecific_losses
89 |         self.precursor_mz_threshold = precursor_mz_threshold
90 |         self.precursor_lower_mz_limit = precursor_lower_mz_limit
91 |         self.precursor_upper_mz_limit = precursor_upper_mz_limit
92 |         self.product_mz_threshold = product_mz_threshold
93 |         self.product_lower_mz_limit = product_lower_mz_limit
94 |         self.product_upper_mz_limit = product_upper_mz_limit
95 | 
96 |         self.swathes = [] if swath_windows_file is None else read_swath_file(swath_windows_file)
97 | 
98 | 
99 |         ### TODO: read unimod file
100 |         self.unimod_file = None if unimod_file is None else read_unimod_file(unimod_file)
101 |         print(self.unimod_file)
102 |         ### TODO: implement enable ipf
103 |         self.enable_ipf = enable_ipf
104 |         self.max_num_alternative_localizations = max_num_alternative_localizations
105 |         self.disable_identification_ms2_precursors = disable_identification_ms2_precursors
106 |         self.disable_identification_specific_losses = disable_identification_specific_losses
107 |         self.enable_identification_unspecific_losses = enable_identification_unspecific_losses
108 |         self.enable_swath_specifity = enable_swath_specifity
109 | 
110 | 
111 | 
112 |         ### check argument
113 |         # # Validate arguments
114 |         # check_argument_values("infile", infile, (str, None))
115 |         # check_argument_values("outfile", outfile, (str, None))
116 |         # # Handle types
117 |         # if in_type is None:
118 |         #     in_type = self._get_file_type(infile)
119 |         # if out_type is None:
120 |         #     out_type = self._get_file_type(outfile)
121 |         # check_argument_values("in_type", in_type, ([str, None], ['tsv', 'mrm', 'pqp', 'TraML']))
122 |         # check_argument_values("out_type", out_type, ([str, None], ['tsv', 'pqp', 'TraML']))
123 |         # check_argument_values("product_mz_threshold", product_mz_threshold, (float, None))
124 |         # check_argument_values("allowed_fragment_types", allowed_fragment_types, (str, None)) # TODO: Add value check to ensure valid fragment types
125 |         # check_argument_values("allowed_fragment_charges", allowed_fragment_charges, (str, None)) # TODO: Add value check to ensure ints are in string of charges
126 |         # check_argument_values("enable_detection_specific_losses", enable_detection_specific_losses, (bool, None))
127 |         # check_argument_values("enable_detection_unspecific_losses", enable_detection_unspecific_losses, (bool, None))
128 | 
129 |     def read_input_file(self) -> None:
130 |         self.load_library(self.infile, self.in_type)
131 |         ### convert to tsv (pandas df)
132 | 
133 |         ### get all transitions for specific precursors
134 | 
135 | 
136 | 
137 |     def annotate_transitions(self) -> None:
138 |         click.echo("Info: Annotating transitions")
139 |         assays = po.MRMAssay()
140 |         assays.reannotateTransitions(self.tr_exp, self.precursor_mz_threshold, self.product_mz_threshold, self.allowed_fragment_type, self.allowed_fragment_charges, self.enable_detection_specific_losses, 
self.enable_detection_unspecific_losses, -4) ### todo convert fragment type to bytes 141 | 142 | click.echo("Info: Annotating detecting transitions") 143 | assays.restrictTransitions(self.tr_exp, self.product_lower_mz_limit, self.product_upper_mz_limit, self.swathes) 144 | assays.detectingTransitions(self.tr_exp, self.min_transitions, self.max_transitions) 145 | 146 | def write_output_file(self) -> None: 147 | self.write_library(self.outfile, self.out_type) -------------------------------------------------------------------------------- /tests/_regtest_outputs/test_openswathdecoy_generator.test_openswath_decoy_generator.out: -------------------------------------------------------------------------------- 1 | ID PROTEIN_ACCESSION DECOY 2 | 9 0 DECOY_Q04637 1 3 | 8 1 DECOY_Q2M2I8 1 4 | 6 2 DECOY_Q86WB0 1 5 | 7 3 DECOY_Q8WWI1 1 6 | 4 4 DECOY_Q92890 1 7 | 3 5 Q04637 0 8 | 2 6 Q2M2I8 0 9 | 1 7 Q86WB0 0 10 | 0 8 Q8WWI1 0 11 | 5 9 Q92890 0 12 | ID UNMODIFIED_SEQUENCE MODIFIED_SEQUENCE DECOY 13 | 9 0 AGQTQPNPGILPIQPALTPR AGQTQPNPGILPIQPALT(UniMod:21)PR 0 14 | 8 1 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR 0 15 | 6 2 EAALPPVSPLK EAALPPVS(UniMod:21)PLK 0 16 | 5 3 GVEPSPSPIKPGDIK GVEPSPS(UniMod:21)PIKPGDIK 0 17 | 4 4 IDGPKIPSPSPEVGK IDGPKIPS(UniMod:21)PSPEVGK 1 18 | 2 5 LPSVPPLAAEK LPS(UniMod:21)VPPLAAEK 1 19 | 1 6 LPSSPVELGPTDGTGMSR LPSS(UniMod:21)PVELGPTDGTGM(UniMod:35)SR 1 20 | 7 7 PTLAPQIPLIGPNPQTQGAR PT(UniMod:21)LAPQIPLIGPNPQTQGAR 1 21 | 0 8 QPSIEGEGSESMLDLGSTSSLTAR QPS(UniMod:21)IEGEGSESMLDLGSTSSLTAR 1 22 | 3 9 SMGTGDTPGLEVPSSPLR SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR 0 23 | ID TRAML_ID GROUP_LABEL PRECURSOR_MZ CHARGE LIBRARY_INTENSITY LIBRARY_RT LIBRARY_DRIFT_TIME DECOY 24 | 3 0 AGQTQPNPGILPIQPALT(Phospho)PR_2 1075.0619 2 None 70.6096 -1.0 0 25 | 4 1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 1266.5622 2 None 67.4251 -1.0 0 26 | 8 2 DECOY_AGQTQPNPGILPIQPALT(Phospho)PR_2 1075.0619 2 None 70.6096 -1.0 1 27 | 9 3 DECOY_ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 1266.5622 2 None 67.4251 -1.0 1 28 | 5 4 DECOY_EAALPPVS(Phospho)PLK_2 601.3151 2 None 48.0082 -1.0 1 29 | 6 5 DECOY_GVEPSPS(Phospho)PIKPGDIK_2 800.9028 2 None 32.9698 -1.0 1 30 | 7 6 DECOY_SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2 948.9241 2 None 52.9017 -1.0 1 31 | 0 7 EAALPPVS(Phospho)PLK_2 601.3151 2 None 48.0082 -1.0 0 32 | 1 8 GVEPSPS(Phospho)PIKPGDIK_2 800.9028 2 None 32.9698 -1.0 0 33 | 2 9 SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2 948.9241 2 None 52.9017 -1.0 0 34 | ID TRAML_ID PRODUCT_MZ CHARGE TYPE ANNOTATION ORDINAL DETECTING IDENTIFYING QUANTIFYING LIBRARY_INTENSITY DECOY 35 | 0 0 2346 409.2146 2 y y7^2 7 1 0 1 2965.7283 0 36 | 1 1 2347 465.7566 2 y y8^2 8 1 0 1 132.8395 0 37 | 2 2 2349 720.3692 1 y y6^1 6 1 0 1 1580.4800 0 38 | 3 3 2350 817.4219 1 y y7^1 7 1 0 1 10000.0000 0 39 | 4 4 2352 930.5060 1 y y8^1 8 1 0 1 417.7673 0 40 | 5 5 2353 1001.5431 1 y y9^1 9 1 0 1 278.9014 0 41 | 6 6 12006 375.2238 1 y y3^1 3 1 0 1 1621.3933 0 42 | 7 7 12009 529.2980 1 y y5^1 5 1 0 1 10000.0000 0 43 | 8 8 12010 566.2941 2 y y10^2 10 1 0 1 3326.0842 0 44 | 9 9 12012 657.3930 1 y y6^1 6 1 0 1 1924.5614 0 45 | 10 10 12013 658.3365 2 y y12^2 12 1 0 1 4464.7860 0 46 | 11 11 12015 867.5298 1 y y8^1 8 1 0 1 5222.4050 0 47 | 12 12 21489 385.2558 1 y y3^1 3 1 0 1 5179.5244 0 48 | 13 13 21490 393.1438 1 b b4^1 4 1 0 1 2606.7244 0 49 | 14 14 21491 565.1923 1 b b6^1 6 1 0 1 3256.2622 0 50 | 15 15 21493 666.2399 1 b b7^1 7 1 0 1 3735.2622 0 51 | 16 16 21494 736.3389 1 y y6^1 6 1 0 1 10000.0000 0 52 | 17 17 21496 835.4073 1 y y7^1 7 1 0 1 3901.4023 0 
53 | 18 18 31640 486.2307 1 b b5^1 5 1 0 1 10000.0000 0 54 | 19 19 31641 697.3264 1 b b7^1 7 1 0 1 7081.5693 0 55 | 20 20 31643 734.3597 1 y y6^1 6 1 0 1 8579.1080 0 56 | 21 21 31644 832.4502 2 y y15^2 15 1 0 1 2923.7356 0 57 | 22 22 31646 964.4847 1 b b10^1 10 1 0 1 3234.0083 0 58 | 23 23 31647 1072.5551 1 y y9^1 9 1 0 1 3853.8560 0 59 | 24 24 42446 400.2303 1 y y3^1 3 1 0 1 7762.3594 0 60 | 25 25 42450 567.2287 1 y y4^1 4 1 0 1 8796.0370 0 61 | 26 26 42457 818.4254 1 b b9^1 9 1 0 1 5399.1875 0 62 | 27 27 42458 866.3768 1 y y7^1 7 1 0 1 6659.5240 0 63 | 28 28 42459 933.4524 1 b b10^1 10 1 0 1 6236.0680 0 64 | 29 29 42468 1139.4729 1 y y10^1 10 1 0 1 3636.2630 0 65 | 30 30 DECOY_31640 560.2480 1 b b5^1 5 1 0 1 10000.0000 1 66 | 31 31 DECOY_31641 801.3906 1 b b7^1 7 1 0 1 7081.5693 1 67 | 32 32 DECOY_31643 660.3424 1 y y6^1 6 1 0 1 8579.1080 1 68 | 33 33 DECOY_31644 795.4415 2 y y15^2 15 1 0 1 2923.7356 1 69 | 34 34 DECOY_31646 1124.6115 1 b b10^1 10 1 0 1 3234.0083 1 70 | 35 35 DECOY_31647 968.4908 1 y y9^1 9 1 0 1 3853.8560 1 71 | 36 36 DECOY_42446 347.2037 1 y y3^1 3 1 0 1 7762.3594 1 72 | 37 37 DECOY_42450 460.2878 1 y y4^1 4 1 0 1 8796.0370 1 73 | 38 38 DECOY_42457 965.3612 1 b b9^1 9 1 0 1 5399.1875 1 74 | 39 39 DECOY_42458 735.3995 1 y y7^1 7 1 0 1 6659.5240 1 75 | 40 40 DECOY_42459 1094.4038 1 b b10^1 10 1 0 1 6236.0680 1 76 | 41 41 DECOY_42468 992.5371 1 y y10^1 10 1 0 1 3636.2630 1 77 | 42 42 DECOY_2346 363.2132 2 y y7^2 7 1 0 1 2965.7283 1 78 | 43 43 DECOY_2347 412.7475 2 y y8^2 8 1 0 1 132.8395 1 79 | 44 44 DECOY_2349 628.3665 1 y y6^1 6 1 0 1 1580.4800 1 80 | 45 45 DECOY_2350 725.4192 1 y y7^1 7 1 0 1 10000.0000 1 81 | 46 46 DECOY_2352 824.4876 1 y y8^1 8 1 0 1 417.7673 1 82 | 47 47 DECOY_2353 991.4860 1 y y9^1 9 1 0 1 278.9014 1 83 | 48 48 DECOY_12006 303.2027 1 y y3^1 3 1 0 1 1621.3933 1 84 | 49 49 DECOY_12009 529.2980 1 y y5^1 5 1 0 1 10000.0000 1 85 | 50 50 DECOY_12010 545.7627 2 y y10^2 10 1 0 1 3326.0842 1 86 | 51 51 DECOY_12012 616.3301 1 y y6^1 6 1 0 1 1924.5614 1 87 | 52 52 DECOY_12013 658.3365 2 y y12^2 12 1 0 1 4464.7860 1 88 | 53 53 DECOY_12015 880.3812 1 y y8^1 8 1 0 1 5222.4050 1 89 | 54 54 DECOY_21489 409.1864 1 y y3^1 3 1 0 1 5179.5244 1 90 | 55 55 DECOY_21490 465.1745 1 b b4^1 4 1 0 1 2606.7244 1 91 | 56 56 DECOY_21491 661.2957 1 b b6^1 6 1 0 1 3256.2622 1 92 | 57 57 DECOY_21493 790.3383 1 b b7^1 7 1 0 1 3735.2622 1 93 | 58 58 DECOY_21494 624.2770 1 y y6^1 6 1 0 1 10000.0000 1 94 | 59 59 DECOY_21496 739.3039 1 y y7^1 7 1 0 1 3901.4023 1 95 | -------------------------------------------------------------------------------- /tests/data/test_transition_list.tsv: -------------------------------------------------------------------------------- 1 | PrecursorMz ProductMz PrecursorCharge ProductCharge LibraryIntensity NormalizedRetentionTime PeptideSequence ModifiedPeptideSequence PeptideGroupLabel LabelType CompoundName SumFormula SMILES Adducts ProteinId UniprotId GeneName FragmentType FragmentSeriesNumber Annotation CollisionEnergy PrecursorIonMobility TransitionGroupId TransitionId Decoy DetectingTransition IdentifyingTransition QuantifyingTransition Peptidoforms 2 | 601.31505 260.196869 2 1 81.93454 48.008226163 EAALPPVSPLK EAALPPVS(UniMod:21)PLK Q04637 EIF4G1 y 2 y2^1 -1 -1 EAALPPVS(Phospho)PLK_2 2345 0 1 0 1 3 | 601.31505 409.214607 2 2 2965.7283 48.008226163 EAALPPVSPLK EAALPPVS(UniMod:21)PLK Q04637 EIF4G1 y 7 y7^2 -1 -1 EAALPPVS(Phospho)PLK_2 2346 0 1 0 1 4 | 601.31505 465.756639 2 2 132.83946 48.008226163 EAALPPVSPLK EAALPPVS(UniMod:21)PLK Q04637 EIF4G1 y 8 
y8^2 -1 -1 EAALPPVS(Phospho)PLK_2 2347 0 1 0 1 5 | 601.31505 623.316408 2 1 101.36074 48.008226163 EAALPPVSPLK EAALPPVS(UniMod:21)PLK Q04637 EIF4G1 y 5 y5^1 -1 -1 EAALPPVS(Phospho)PLK_2 2348 0 1 0 1 6 | 601.31505 720.369173 2 1 1580.48 48.008226163 EAALPPVSPLK EAALPPVS(UniMod:21)PLK Q04637 EIF4G1 y 6 y6^1 -1 -1 EAALPPVS(Phospho)PLK_2 2349 0 1 0 1 7 | 601.31505 817.421937 2 1 10000 48.008226163 EAALPPVSPLK EAALPPVS(UniMod:21)PLK Q04637 EIF4G1 y 7 y7^1 -1 -1 EAALPPVS(Phospho)PLK_2 2350 0 1 0 1 8 | 601.31505 845.380467 2 1 22.41705 48.008226163 EAALPPVSPLK EAALPPVS(UniMod:21)PLK Q04637 EIF4G1 b 8 b8^1 -1 -1 EAALPPVS(Phospho)PLK_2 2351 0 1 0 1 9 | 601.31505 930.506001 2 1 417.7673 48.008226163 EAALPPVSPLK EAALPPVS(UniMod:21)PLK Q04637 EIF4G1 y 8 y8^1 -1 -1 EAALPPVS(Phospho)PLK_2 2352 0 1 0 1 10 | 601.31505 1001.543115 2 1 278.9014 48.008226163 EAALPPVSPLK EAALPPVS(UniMod:21)PLK Q04637 EIF4G1 y 9 y9^1 -1 -1 EAALPPVS(Phospho)PLK_2 2353 0 1 0 1 11 | 800.902751 286.139749 2 1 1231.8224 32.969786991 GVEPSPSPIKPGDIK GVEPSPS(UniMod:21)PIKPGDIK Q92890 UFD1 b 3 b3^1 -1 -1 GVEPSPS(Phospho)PIKPGDIK_2 12005 0 1 0 1 12 | 800.902751 375.223813 2 1 1621.3933 32.969786991 GVEPSPSPIKPGDIK GVEPSPS(UniMod:21)PIKPGDIK Q92890 UFD1 y 3 y3^1 -1 -1 GVEPSPS(Phospho)PIKPGDIK_2 12006 0 1 0 1 13 | 800.902751 434.268555 2 2 391.9799 32.969786991 GVEPSPSPIKPGDIK GVEPSPS(UniMod:21)PIKPGDIK Q92890 UFD1 y 8 y8^2 -1 -1 GVEPSPS(Phospho)PIKPGDIK_2 12007 0 1 0 1 14 | 800.902751 470.224542 2 1 612.1298 32.969786991 GVEPSPSPIKPGDIK GVEPSPS(UniMod:21)PIKPGDIK Q92890 UFD1 b 5 b5^1 -1 -1 GVEPSPS(Phospho)PIKPGDIK_2 12008 0 1 0 1 15 | 800.902751 529.298042 2 1 10000 32.969786991 GVEPSPSPIKPGDIK GVEPSPS(UniMod:21)PIKPGDIK Q92890 UFD1 y 5 y5^1 -1 -1 GVEPSPS(Phospho)PIKPGDIK_2 12009 0 1 0 1 16 | 800.902751 566.294118 2 2 3326.0842 32.969786991 GVEPSPSPIKPGDIK GVEPSPS(UniMod:21)PIKPGDIK Q92890 UFD1 y 10 y10^2 -1 -1 GVEPSPS(Phospho)PIKPGDIK_2 12010 0 1 0 1 17 | 800.902751 567.277307 2 1 1098.7223 32.969786991 GVEPSPSPIKPGDIK GVEPSPS(UniMod:21)PIKPGDIK Q92890 UFD1 b 6 b6^1 -1 -1 GVEPSPS(Phospho)PIKPGDIK_2 12011 0 1 0 1 18 | 800.902751 657.393005 2 1 1924.5614 32.969786991 GVEPSPSPIKPGDIK GVEPSPS(UniMod:21)PIKPGDIK Q92890 UFD1 y 6 y6^1 -1 -1 GVEPSPS(Phospho)PIKPGDIK_2 12012 0 1 0 1 19 | 800.902751 658.336514 2 2 4464.786 32.969786991 GVEPSPSPIKPGDIK GVEPSPS(UniMod:21)PIKPGDIK Q92890 UFD1 y 12 y12^2 -1 -1 GVEPSPS(Phospho)PIKPGDIK_2 12013 0 1 0 1 20 | 800.902751 722.857811 2 2 1568.4036 32.969786991 GVEPSPSPIKPGDIK GVEPSPS(UniMod:21)PIKPGDIK Q92890 UFD1 y 13 y13^2 -1 -1 GVEPSPS(Phospho)PIKPGDIK_2 12014 0 1 0 1 21 | 800.902751 867.529834 2 1 5222.405 32.969786991 GVEPSPSPIKPGDIK GVEPSPS(UniMod:21)PIKPGDIK Q92890 UFD1 y 8 y8^1 -1 -1 GVEPSPS(Phospho)PIKPGDIK_2 12015 0 1 0 1 22 | 800.902751 1072.507459 2 1 1544.1755 32.969786991 GVEPSPSPIKPGDIK GVEPSPS(UniMod:21)PIKPGDIK Q92890 UFD1 b 10 b10^1 -1 -1 GVEPSPS(Phospho)PIKPGDIK_2 12016 0 1 0 1 23 | 800.902751 1131.580959 2 1 1269.2039 32.969786991 GVEPSPSPIKPGDIK GVEPSPS(UniMod:21)PIKPGDIK Q92890 UFD1 y 10 y10^1 -1 -1 GVEPSPS(Phospho)PIKPGDIK_2 12017 0 1 0 1 24 | 800.902751 1315.665752 2 1 800.73376 32.969786991 GVEPSPSPIKPGDIK GVEPSPS(UniMod:21)PIKPGDIK Q92890 UFD1 y 12 y12^1 -1 -1 GVEPSPS(Phospho)PIKPGDIK_2 12018 0 1 0 1 25 | 948.924087 385.255781 2 1 5179.5244 52.901659389 SMGTGDTPGLEVPSSPLR SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR Q86WB0 ZC3HC1 y 3 y3^1 -1 -1 SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2 21489 0 1 0 1 26 | 948.924087 393.143849 2 1 2606.7244 52.901659389 SMGTGDTPGLEVPSSPLR 
SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR Q86WB0 ZC3HC1 b 4 b4^1 -1 -1 SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2 21490 0 1 0 1 27 | 948.924087 565.192257 2 1 3256.2622 52.901659389 SMGTGDTPGLEVPSSPLR SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR Q86WB0 ZC3HC1 b 6 b6^1 -1 -1 SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2 21491 0 1 0 1 28 | 948.924087 639.286171 2 1 2192.347 52.901659389 SMGTGDTPGLEVPSSPLR SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR Q86WB0 ZC3HC1 y 5 y5^1 -1 -1 SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2 21492 0 1 0 1 29 | 948.924087 666.239936 2 1 3735.2622 52.901659389 SMGTGDTPGLEVPSSPLR SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR Q86WB0 ZC3HC1 b 7 b7^1 -1 -1 SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2 21493 0 1 0 1 30 | 948.924087 736.338936 2 1 10000 52.901659389 SMGTGDTPGLEVPSSPLR SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR Q86WB0 ZC3HC1 y 6 y6^1 -1 -1 SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2 21494 0 1 0 1 31 | 948.924087 831.890372 2 2 699.7533 52.901659389 SMGTGDTPGLEVPSSPLR SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR Q86WB0 ZC3HC1 y 16 y16^2 -1 -1 SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2 21495 0 1 0 1 32 | 948.924087 835.40735 2 1 3901.4023 52.901659389 SMGTGDTPGLEVPSSPLR SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR Q86WB0 ZC3HC1 y 7 y7^1 -1 -1 SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2 21496 0 1 0 1 33 | 948.924087 1134.555473 2 1 428.92825 52.901659389 SMGTGDTPGLEVPSSPLR SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR Q86WB0 ZC3HC1 y 10 y10^1 -1 -1 SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2 21497 0 1 0 1 34 | 948.924087 1231.608237 2 1 563.8044 52.901659389 SMGTGDTPGLEVPSSPLR SM(UniMod:35)GTGDTPGLEVPS(UniMod:21)SPLR Q86WB0 ZC3HC1 y 11 y11^1 -1 -1 SM(Oxidation)GTGDTPGLEVPS(Phospho)SPLR_2 21498 0 1 0 1 35 | 1075.061909 257.124433 2 1 2472.791 70.609603879 AGQTQPNPGILPIQPALTPR AGQTQPNPGILPIQPALT(UniMod:21)PR Q2M2I8 AAK1 b 3 b3^1 -1 -1 AGQTQPNPGILPIQPALT(Phospho)PR_2 31639 0 1 0 1 36 | 1075.061909 486.23069 2 1 10000 70.609603879 AGQTQPNPGILPIQPALTPR AGQTQPNPGILPIQPALT(UniMod:21)PR Q2M2I8 AAK1 b 5 b5^1 -1 -1 AGQTQPNPGILPIQPALT(Phospho)PR_2 31640 0 1 0 1 37 | 1075.061909 697.326383 2 1 7081.5693 70.609603879 AGQTQPNPGILPIQPALTPR AGQTQPNPGILPIQPALT(UniMod:21)PR Q2M2I8 AAK1 b 7 b7^1 -1 -1 AGQTQPNPGILPIQPALT(Phospho)PR_2 31641 0 1 0 1 38 | 1075.061909 726.902355 2 2 729.74634 70.609603879 AGQTQPNPGILPIQPALTPR AGQTQPNPGILPIQPALT(UniMod:21)PR Q2M2I8 AAK1 y 13 y13^2 -1 -1 AGQTQPNPGILPIQPALT(Phospho)PR_2 31642 0 1 0 1 39 | 1075.061909 734.359671 2 1 8579.108 70.609603879 AGQTQPNPGILPIQPALTPR AGQTQPNPGILPIQPALT(UniMod:21)PR Q2M2I8 AAK1 y 6 y6^1 -1 -1 AGQTQPNPGILPIQPALT(Phospho)PR_2 31643 0 1 0 1 40 | 1075.061909 832.450202 2 2 2923.7356 70.609603879 AGQTQPNPGILPIQPALTPR AGQTQPNPGILPIQPALT(UniMod:21)PR Q2M2I8 AAK1 y 15 y15^2 -1 -1 AGQTQPNPGILPIQPALT(Phospho)PR_2 31644 0 1 0 1 41 | 1075.061909 851.400611 2 1 2737.5051 70.609603879 AGQTQPNPGILPIQPALTPR AGQTQPNPGILPIQPALT(UniMod:21)PR Q2M2I8 AAK1 b 9 b9^1 -1 -1 AGQTQPNPGILPIQPALT(Phospho)PR_2 31645 0 1 0 1 42 | 1075.061909 964.484676 2 1 3234.0083 70.609603879 AGQTQPNPGILPIQPALTPR AGQTQPNPGILPIQPALT(UniMod:21)PR Q2M2I8 AAK1 b 10 b10^1 -1 -1 AGQTQPNPGILPIQPALT(Phospho)PR_2 31646 0 1 0 1 43 | 1075.061909 1072.555077 2 1 3853.856 70.609603879 AGQTQPNPGILPIQPALTPR AGQTQPNPGILPIQPALT(UniMod:21)PR Q2M2I8 AAK1 y 9 y9^1 -1 -1 AGQTQPNPGILPIQPALT(Phospho)PR_2 31647 0 1 0 1 44 | 1075.061909 1077.56874 2 1 2261.7683 70.609603879 AGQTQPNPGILPIQPALTPR AGQTQPNPGILPIQPALT(UniMod:21)PR Q2M2I8 AAK1 b 11 b11^1 -1 -1 AGQTQPNPGILPIQPALT(Phospho)PR_2 31648 0 1 0 1 45 | 1075.061909 1185.639142 2 1 
1024.3945 70.609603879 AGQTQPNPGILPIQPALTPR AGQTQPNPGILPIQPALT(UniMod:21)PR Q2M2I8 AAK1 y 10 y10^1 -1 -1 AGQTQPNPGILPIQPALT(Phospho)PR_2 31649 0 1 0 1 46 | 1266.562206 286.176134 2 1 567.14233 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 b 3 b3^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42443 0 1 0 1 47 | 1266.562206 303.177531 2 1 1103.8254 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 y 2 y2^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42444 0 1 0 1 48 | 1266.562206 373.208163 2 1 1820.2323 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 b 4 b4^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42445 0 1 0 1 49 | 1266.562206 400.230295 2 1 7762.3594 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 y 3 y3^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42446 0 1 0 1 50 | 1266.562206 405.181296 2 2 114.97278 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 y 6 y6^2 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42447 0 1 0 1 51 | 1266.562206 460.240193 2 1 2922.4272 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 b 5 b5^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42448 0 1 0 1 52 | 1266.562206 561.287872 2 1 2393.0298 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 b 6 b6^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42449 0 1 0 1 53 | 1266.562206 567.228656 2 1 8796.037 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 y 4 y4^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42450 0 1 0 1 54 | 1266.562206 570.240071 2 2 502.2687 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 y 10 y10^2 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42451 0 1 0 1 55 | 1266.562206 589.2921 2 2 364.18185 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 b 12 b12^2 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42452 0 1 0 1 56 | 1266.562206 648.319901 2 1 3448.3528 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 b 7 b7^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42453 0 1 0 1 57 | 1266.562206 680.31272 2 1 3568.9507 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 y 5 y5^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42454 0 1 0 1 58 | 1266.562206 705.341365 2 1 1443.644 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 b 8 b8^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42455 0 1 0 1 59 | 1266.562206 769.356158 2 2 681.4857 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 b 16 b16^2 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42456 0 1 0 1 60 | 1266.562206 818.425429 2 1 5399.1875 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 b 9 b9^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42457 0 1 0 1 61 | 1266.562206 866.376779 2 1 6659.524 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 y 7 y7^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42458 0 1 0 1 62 | 1266.562206 933.452374 2 1 6236.068 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 b 10 b10^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42459 0 1 0 1 63 | 1266.562206 986.421909 2 2 985.9538 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 y 18 y18^2 -1 
-1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42460 0 1 0 1 64 | 1266.562206 995.419373 2 1 1262.0918 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 y 8 y8^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42461 0 1 0 1 65 | 1266.562206 1036.945748 2 2 1161.6108 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 y 19 y19^2 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42462 0 1 0 1 66 | 1266.562206 1046.536438 2 1 3329.5352 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 b 11 b11^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42463 0 1 0 1 67 | 1266.562206 1052.440837 2 1 2903.2126 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 y 9 y9^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42464 0 1 0 1 68 | 1266.562206 1080.461763 2 2 1668.9382 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 y 20 y20^2 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42465 0 1 0 1 69 | 1266.562206 1115.477079 2 2 786.84143 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 b 22 b22^2 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42466 0 1 0 1 70 | 1266.562206 1123.977777 2 2 2038.5486 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 y 21 y21^2 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42467 0 1 0 1 71 | 1266.562206 1139.472866 2 1 3636.263 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 y 10 y10^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42468 0 1 0 1 72 | 1266.562206 1177.576923 2 1 2879.441 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 b 12 b12^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42469 0 1 0 1 73 | 1266.562206 1180.51981 2 2 829.09406 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 y 22 y22^2 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42470 0 1 0 1 74 | 1266.562206 1231.043649 2 2 1074.6931 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 y 23 y23^2 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42471 0 1 0 1 75 | 1266.562206 1264.608952 2 1 890.7413 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 b 13 b13^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42472 0 1 0 1 76 | 1266.562206 1268.51546 2 1 1830.4344 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 y 11 y11^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42473 0 1 0 1 77 | 1266.562206 1355.54749 2 1 2691.2388 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 y 12 y12^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42474 0 1 0 1 78 | 1266.562206 1393.651546 2 1 870.2799 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 b 14 b14^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42475 0 1 0 1 79 | 1266.562206 1486.587975 2 1 851.35144 67.42507992 ATLSSTSGLDLMSESGEGEISPQR ATLSSTSGLDLMSESGEGEIS(UniMod:21)PQR Q8WWI1 LMO7 y 13 y13^1 -1 -1 ATLSSTSGLDLMSESGEGEIS(Phospho)PQR_2 42476 0 1 0 1 80 | -------------------------------------------------------------------------------- /easypqp/library.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from .util import timestamped_echo 3 | 4 | try: 5 | import matplotlib 6 | 7 | matplotlib.use("Agg") 8 | import matplotlib.pyplot as plt 9 | except ImportError: 10 | plt = None 11 | 
12 | import click 13 | import os 14 | import pathlib 15 | import posixpath, ntpath 16 | import numpy as np 17 | import pandas as pd 18 | 19 | # alignment 20 | from sklearn import preprocessing 21 | import sklearn.isotonic 22 | import sklearn.linear_model 23 | import statsmodels.api as sm 24 | from scipy.interpolate import interp1d 25 | 26 | # error rate estimation 27 | HAS_PYPROPHET = False 28 | try: 29 | from pyprophet.stats import pemp, qvalue, pi0est 30 | from pyprophet.ipf import compute_model_fdr 31 | 32 | HAS_PYPROPHET = True 33 | except (ModuleNotFoundError, ImportError): 34 | # pyprophet (or one of its runtime deps) is not available or incompatible 35 | # Export placeholders so importing this module does not raise — commands 36 | # that require pyprophet should check HAS_PYPROPHET at runtime. 37 | pemp = None 38 | qvalue = None 39 | pi0est = None 40 | compute_model_fdr = None 41 | 42 | # plotting 43 | from scipy.stats import gaussian_kde 44 | from numpy import linspace, concatenate 45 | from seaborn import lmplot 46 | 47 | 48 | def plot(path, title, targets, decoys): 49 | plt.figure(figsize=(10, 5)) 50 | plt.subplots_adjust(hspace=0.5) 51 | 52 | plt.subplot(121) 53 | plt.title("group score distributions") 54 | plt.xlabel("score") 55 | plt.ylabel("# of groups") 56 | plt.hist( 57 | [targets, decoys], 58 | 20, 59 | color=["g", "r"], 60 | label=["target", "decoy"], 61 | histtype="bar", 62 | ) 63 | plt.legend(loc=2) 64 | 65 | plt.subplot(122) 66 | tdensity = gaussian_kde(targets) 67 | tdensity.covariance_factor = lambda: 0.25 68 | tdensity._compute_covariance() 69 | ddensity = gaussian_kde(decoys) 70 | ddensity.covariance_factor = lambda: 0.25 71 | ddensity._compute_covariance() 72 | xs = linspace( 73 | min(concatenate((targets, decoys))), max(concatenate((targets, decoys))), 200 74 | ) 75 | plt.title("group score densities") 76 | plt.xlabel("score") 77 | plt.ylabel("density") 78 | plt.plot(xs, tdensity(xs), color="g", label="target") 79 | plt.plot(xs, ddensity(xs), color="r", label="decoy") 80 | plt.legend(loc=2) 81 | 82 | plt.suptitle(title) 83 | plt.savefig(path) 84 | plt.close() 85 | 86 | 87 | def peptide_fdr(psms, peptide_fdr_threshold, pi0_lambda, plot_path, nofdr): 88 | pi0_method = "bootstrap" 89 | pi0_smooth_df = 3 90 | pi0_smooth_log_pi0 = False 91 | pfdr = False 92 | 93 | if nofdr: 94 | peptides = ( 95 | psms.groupby(["modified_peptide", "decoy", "q_value"])["pp"] 96 | .max() 97 | .reset_index() 98 | ) 99 | targets = peptides[~peptides["decoy"]].copy() 100 | decoys = peptides[peptides["decoy"]].copy() 101 | 102 | else: 103 | peptides = psms.groupby(["modified_peptide", "decoy"])["pp"].max().reset_index() 104 | targets = peptides[~peptides["decoy"]].copy() 105 | decoys = peptides[peptides["decoy"]].copy() 106 | 107 | targets["p_value"] = pemp(targets["pp"], decoys["pp"]) 108 | targets["q_value"] = qvalue( 109 | targets["p_value"], 110 | pi0est( 111 | targets["p_value"], 112 | pi0_lambda, 113 | pi0_method, 114 | pi0_smooth_df, 115 | pi0_smooth_log_pi0, 116 | )["pi0"], 117 | pfdr, 118 | ) 119 | 120 | plot(plot_path, "global peptide scores", targets["pp"], decoys["pp"]) 121 | 122 | return targets[targets["q_value"] < peptide_fdr_threshold][ 123 | "modified_peptide" 124 | ], np.min(targets[targets["q_value"] < peptide_fdr_threshold]["pp"]) 125 | 126 | 127 | def protein_fdr(psms, protein_fdr_threshold, pi0_lambda, plot_path, nofdr): 128 | pi0_method = "bootstrap" 129 | pi0_smooth_df = 3 130 | pi0_smooth_log_pi0 = False 131 | pfdr = False 132 | 133 | if nofdr: 134 | proteins = ( 135 
| psms.groupby(["protein_id", "decoy", "q_value"])["pp"].max().reset_index() 136 | ) 137 | targets = proteins[~proteins["decoy"]].copy() 138 | decoys = proteins[proteins["decoy"]].copy() 139 | 140 | else: 141 | proteins = psms.groupby(["protein_id", "decoy"])["pp"].max().reset_index() 142 | targets = proteins[~proteins["decoy"]].copy() 143 | decoys = proteins[proteins["decoy"]].copy() 144 | 145 | targets["p_value"] = pemp(targets["pp"], decoys["pp"]) 146 | targets["q_value"] = qvalue( 147 | targets["p_value"], 148 | pi0est( 149 | targets["p_value"], 150 | pi0_lambda, 151 | pi0_method, 152 | pi0_smooth_df, 153 | pi0_smooth_log_pi0, 154 | )["pi0"], 155 | pfdr, 156 | ) 157 | 158 | plot(plot_path, "global protein scores", targets["pp"], decoys["pp"]) 159 | 160 | return targets[targets["q_value"] < protein_fdr_threshold]["protein_id"], np.min( 161 | targets[targets["q_value"] < protein_fdr_threshold]["pp"] 162 | ) 163 | 164 | 165 | def process_psms( 166 | psms, 167 | psmtsv, 168 | peptidetsv, 169 | psm_fdr_threshold, 170 | peptide_fdr_threshold, 171 | protein_fdr_threshold, 172 | pi0_lambda, 173 | peptide_plot_path, 174 | protein_plot_path, 175 | proteotypic, 176 | nofdr, 177 | ): 178 | # Append columns 179 | psms["base_name"] = psms["run_id"].apply( 180 | lambda x: os.path.splitext(os.path.basename(x))[0] 181 | ) 182 | 183 | if None not in (psmtsv, peptidetsv): 184 | # Read psm.tsv and peptide.tsv 185 | peptidetsv_df = pd.read_csv( 186 | peptidetsv, 187 | index_col=False, 188 | sep="\t", 189 | usecols=["Peptide", "Gene", "Protein ID"], 190 | ) 191 | psmtsv_df = pd.read_csv( 192 | psmtsv, 193 | index_col=False, 194 | sep="\t", 195 | usecols=["Spectrum", "Spectrum File", "Peptide"], 196 | ) 197 | 198 | # Filter out PSMs whose peptides are not in peptide.tsv 199 | psmtsv_df = psmtsv_df[psmtsv_df["Peptide"].isin(peptidetsv_df["Peptide"])] 200 | 201 | # Generate a group_id column 202 | temp_df = psmtsv_df["Spectrum"].str.split(".", expand=True) 203 | psmtsv_df["group_id"] = ( 204 | temp_df.iloc[:, 0] 205 | + "_" 206 | + pd.to_numeric(temp_df.iloc[:, -2]).astype(str) 207 | + psmtsv_df["Spectrum File"] 208 | .apply(lambda x: posixpath.basename(ntpath.basename(x))) 209 | .str.extract("(_rank[0-9]+)", expand=False) 210 | .apply(lambda x: "" if pd.isna(x) else x) 211 | ) 212 | 213 | # Filter psm dataframe 214 | psms = psms[psms["group_id"].isin(psmtsv_df["group_id"])] 215 | 216 | # Update gene_id and protein_id 217 | psms = psms.merge( 218 | peptidetsv_df, how="left", left_on="peptide_sequence", right_on="Peptide" 219 | ) 220 | psms.drop(["gene_id", "protein_id"], inplace=True, axis=1) 221 | psms.rename( 222 | columns={"Gene": "gene_id", "Protein ID": "protein_id"}, inplace=True 223 | ) 224 | psms["num_tot_proteins"] = 1 225 | timestamped_echo( 226 | "Info: %s redundant PSMs identified after filtering with %s and %s" 227 | % (psms.shape[0], psmtsv, peptidetsv) 228 | ) 229 | else: 230 | # Filter proteotypic peptides 231 | if proteotypic: 232 | psms = psms[psms["num_tot_proteins"] == 1].copy() 233 | else: 234 | raise click.ClickException( 235 | "Support for non-proteotypic peptides is not yet implemented." 
236 |         )
237 | 
238 |     # Generate canonical set of protein identifiers
239 |     proteinset = psms[["peptide_sequence", "protein_id"]].drop_duplicates()
240 |     proteinset["protein_id"] = proteinset["protein_id"].astype(str)
241 |     proteinset_canonical = (
242 |         proteinset.groupby("peptide_sequence")
243 |         .apply(
244 |             lambda x: ";".join(
245 |                 sorted(
246 |                     list(
247 |                         set(
248 |                             [
249 |                                 a
250 |                                 for b in x["protein_id"].str.split(";").tolist()
251 |                                 for a in b
252 |                             ]
253 |                         )
254 |                     )
255 |                 )
256 |             )
257 |         )
258 |         .reset_index(name="protein_id")
259 |     )
260 | 
261 |     psms = pd.merge(
262 |         psms.drop(columns="protein_id"), proteinset_canonical, on="peptide_sequence"
263 |     )
264 | 
265 |     # Prepare PeptideProphet / iProphet results
    |     # pyprophet is optional at import time (see HAS_PYPROPHET above); it is only
    |     # dispensable here when nofdr is set and q-values are already provided
    |     if not HAS_PYPROPHET and not (nofdr and "q_value" in psms.columns):
    |         raise click.ClickException(
    |             "pyprophet could not be imported but is required for FDR estimation."
    |         )
266 |     if "q_value" not in psms.columns:
267 |         psms["q_value"] = compute_model_fdr(psms["pep"].values)
268 | 
269 |     # Confident peptides and proteins in global context
270 |     peptides, peptide_pp_threshold = peptide_fdr(
271 |         psms, peptide_fdr_threshold, pi0_lambda, peptide_plot_path, nofdr
272 |     )
273 |     timestamped_echo(
274 |         "Info: %s modified peptides identified (q-value < %s; PP threshold = %s)"
275 |         % (len(peptides), peptide_fdr_threshold, peptide_pp_threshold)
276 |     )
277 |     proteins, protein_pp_threshold = protein_fdr(
278 |         psms, protein_fdr_threshold, pi0_lambda, protein_plot_path, nofdr
279 |     )
280 |     timestamped_echo(
281 |         "Info: %s proteins identified (q-value < %s; PP threshold = %s)"
282 |         % (len(proteins), protein_fdr_threshold, protein_pp_threshold)
283 |     )
284 | 
285 |     # Filter peptides and proteins
286 |     psms = psms[psms["modified_peptide"].isin(peptides)]
287 |     psms = psms[psms["protein_id"].isin(proteins)]
288 | 
289 |     # Filter PSMs
290 |     psms = psms[psms["q_value"] < psm_fdr_threshold]
291 | 
292 |     # Remove decoys
293 |     psms = psms[~psms["decoy"]]
294 | 
295 |     timestamped_echo(
296 |         "Info: %s redundant PSMs identified (q-value < %s; PP threshold = %s)"
297 |         % (psms.shape[0], psm_fdr_threshold, np.min(1 - psms["pep"]))
298 |     )
299 | 
300 |     return psms
301 | 
302 | 
303 | def lowess_iso(x, y, lowess_frac):
304 |     with warnings.catch_warnings():
305 |         warnings.filterwarnings(
306 |             "ignore", message="invalid value encountered in ", category=RuntimeWarning
307 |         )
308 |         lwf = sm.nonparametric.lowess(y, x.ravel(), frac=lowess_frac)
309 |         while pd.isna(lwf[:, 1]).any():
310 |             lowess_frac *= 2
311 |             lwf = sm.nonparametric.lowess(y, x.ravel(), frac=lowess_frac)
312 |     lwf_x = lwf[:, 0]
313 |     ir = (
314 |         sklearn.isotonic.IsotonicRegression()
315 |     )  # make the regression strictly increasing
316 |     lwf_y = ir.fit_transform(lwf_x, lwf[:, 1])
317 |     mask = np.concatenate([[True], np.diff(lwf_y) != 0])  # remove non-increasing (flat) points
318 |     try:
319 |         return interp1d(
320 |             lwf_x[mask], lwf_y[mask], bounds_error=False, fill_value="extrapolate"
321 |         )
322 |     except ValueError as e:
323 |         timestamped_echo(e)
324 |         return interp1d(lwf_x, lwf_y, bounds_error=False, fill_value="extrapolate")
325 | 
326 | 
327 | class LowessIsoEstimator:
328 |     def __init__(self, lowess_frac):
329 |         self.lowess_frac = lowess_frac
330 | 
331 |     def fit(self, x, y):
332 |         self.lwi = lowess_iso(x, y, self.lowess_frac)
333 |         return self
334 | 
335 |     def get_params(self, deep=False):
336 |         return {"lowess_frac": self.lowess_frac}
337 | 
338 |     def set_params(self, lowess_frac):
339 |         self.lowess_frac = lowess_frac
340 |         return self
341 | 
342 |     def score(self, x, y):
343 |         resid = self.lwi(x.ravel()) - y
344 |         return 1 / resid.dot(resid)
345 | 
346 |     def predict(self, x):
347 |         return self.lwi(x.ravel())
348 | 
349 |     
def __repr__(self): 350 | return str(self.get_params()) 351 | 352 | 353 | def lowess_iso_predictor(filename, x, y, xpred): 354 | gsc = sklearn.model_selection.GridSearchCV( 355 | LowessIsoEstimator(None), 356 | {"lowess_frac": [0.01, 0.02, 0.04, 0.08]}, 357 | cv=sklearn.model_selection.KFold(4, shuffle=True, random_state=0), 358 | n_jobs=min(os.cpu_count(), 61), 359 | ) 360 | 361 | gsc.fit(x.reshape(-1, 1), y) 362 | timestamped_echo( 363 | f"Info: {filename}; Lowess fraction used: {gsc.best_params_['lowess_frac']}." 364 | ) 365 | return gsc.best_estimator_.predict(xpred) 366 | 367 | 368 | def lowess( 369 | run, 370 | reference_run, 371 | xcol, 372 | ycol, 373 | lowess_frac, 374 | psm_fdr_threshold, 375 | min_peptides, 376 | filename, 377 | main_path, 378 | ): 379 | # Filter alignment data 380 | run_alignment = run[run["q_value"] < psm_fdr_threshold] if "q_value" in run else run 381 | if "q_value" in reference_run: 382 | reference_run_alignment = reference_run[ 383 | reference_run["q_value"] < psm_fdr_threshold 384 | ] 385 | else: 386 | reference_run_alignment = reference_run 387 | 388 | dfm = pd.merge( 389 | run_alignment, 390 | reference_run_alignment[["modified_peptide", "precursor_charge", ycol]], 391 | on=["modified_peptide", "precursor_charge"], 392 | ) 393 | timestamped_echo( 394 | f"Info: {filename}; Peptide overlap between run and reference: {dfm.shape[0]}." 395 | ) 396 | if dfm.shape[0] <= min_peptides: 397 | timestamped_echo( 398 | f"Info: {filename}; Skipping run because not enough peptides could be found for alignment." 399 | ) 400 | return pd.DataFrame() 401 | 402 | if dfm.shape[0] < 50: # use linear regression for small reference size 403 | linreg = sklearn.linear_model.LinearRegression().fit( 404 | dfm[xcol].to_numpy().reshape(-1, 1), dfm[ycol] 405 | ) 406 | run[ycol] = linreg.predict(run[xcol].to_numpy().reshape(-1, 1)) 407 | else: 408 | # Fit and apply the lowess model 409 | run[ycol] = ( 410 | lowess_iso_predictor( 411 | filename, 412 | dfm[xcol].to_numpy(), 413 | dfm[ycol].to_numpy(), 414 | run[xcol].to_numpy(), 415 | ) 416 | if lowess_frac == 0 417 | else lowess_iso(dfm[xcol].to_numpy(), dfm[ycol].to_numpy(), lowess_frac)( 418 | run[xcol].to_numpy() 419 | ) 420 | ) 421 | 422 | # Plot regression 423 | plt.plot(dfm[xcol].to_numpy(), dfm[ycol].to_numpy(), "o") 424 | run1 = run[[xcol, ycol]].sort_values(xcol) 425 | plt.plot(run1[xcol].to_numpy(), run1[ycol].to_numpy()) 426 | plt.xlabel(xcol) 427 | plt.ylabel(ycol) 428 | plt.savefig(os.path.join(main_path, filename + ".pdf")) 429 | plt.close() 430 | run1.to_pickle(os.path.join(main_path, filename + ".alignment_pkl")) 431 | return run 432 | 433 | 434 | def remove_rank_suffix(x): 435 | """ 436 | 437 | :param x: 438 | :return: 439 | 440 | >>> remove_rank_suffix('23aug2017_hela_serum_timecourse_4mz_narrow_6_rank4') 441 | '23aug2017_hela_serum_timecourse_4mz_narrow_6' 442 | >>> remove_rank_suffix('23aug2017_hela_serum_timecourse_4mz_narrow_6_rank44') 443 | '23aug2017_hela_serum_timecourse_4mz_narrow_6' 444 | >>> remove_rank_suffix('23aug2017_hela_serum_timecourse_4mz_narrow_6') 445 | '23aug2017_hela_serum_timecourse_4mz_narrow_6' 446 | """ 447 | import re 448 | 449 | return re.compile("(.+?)(?:_rank[0-9]+)?").fullmatch(x).group(1) 450 | 451 | 452 | def unify_modified_peptide_masses(mod_pep, transform=None): 453 | if not hasattr(mod_pep, "str"): 454 | return mod_pep, transform 455 | if transform is None: 456 | import collections 457 | 458 | float_list = {ee for e in mod_pep.str.findall("\\[(.+?)\\]") for ee in e} 459 | d = 
collections.defaultdict(list) 460 | current_group = None 461 | for i, (v, k) in enumerate(sorted((float(e), e) for e in float_list)): 462 | if current_group is None: 463 | current_group = d[i] 464 | else: 465 | if abs(current_group[-1][0] / v - 1) > 0.001: 466 | current_group = d[i] 467 | current_group.append((v, k)) 468 | transform = {s: l[0][1] for _, l in d.items() for f, s in l} 469 | 470 | def transform_func(mo): 471 | ret = mo.group(0) 472 | for k, v in transform.items(): 473 | ret = ret.replace(k, v) 474 | return ret 475 | 476 | return mod_pep.str.replace( 477 | "(?<=\\[).+?(?=\\])", transform_func, regex=True 478 | ), transform 479 | 480 | 481 | def generate( 482 | files, 483 | outfile, 484 | psmtsv, 485 | peptidetsv, 486 | perform_rt_calibration, 487 | rt_referencefile, 488 | rt_reference_run_path, 489 | rt_filter, 490 | perform_im_calibration, 491 | im_referencefile, 492 | im_reference_run_path, 493 | im_filter, 494 | psm_fdr_threshold, 495 | peptide_fdr_threshold, 496 | protein_fdr_threshold, 497 | rt_lowess_frac, 498 | rt_psm_fdr_threshold, 499 | im_lowess_frac, 500 | im_psm_fdr_threshold, 501 | pi0_lambda, 502 | peptide_plot_path, 503 | protein_plot_path, 504 | min_peptides, 505 | proteotypic, 506 | consensus, 507 | nofdr, 508 | diannpqp, 509 | ): 510 | # Parse input arguments 511 | psm_files = [] 512 | spectra = [] 513 | 514 | if len(files) == 1 and files[0].endswith(".txt"): 515 | files = pathlib.Path(files[0]).read_text().splitlines() 516 | 517 | for file in files: 518 | if "psmpkl" in file: 519 | psm_files.append(file) 520 | if "peakpkl" in file: 521 | spectra.append(file) 522 | 523 | if len(psm_files) == 0: 524 | raise click.ClickException( 525 | "No PSMs files present. Need to have tag 'psmpkl' in filename." 526 | ) 527 | 528 | if len(spectra) == 0: 529 | raise click.ClickException( 530 | "No spectrum files present. Need to have tag 'peakpkl' in filename." 531 | ) 532 | 533 | if peptidetsv is not None and psmtsv is None: 534 | raise click.ClickException("There is a peptide.tsv but no psm.tsv.") 535 | elif peptidetsv is None and psmtsv is not None: 536 | raise click.ClickException("There is a psm.tsv but no peptide.tsv.") 537 | 538 | if None not in (psmtsv, peptidetsv): 539 | timestamped_echo( 540 | "Info: There are psm.tsv and peptide.tsv. Will ignore --psm_fdr_threshold, --peptide_fdr_threshold, --protein_fdr_threshold, --pi0_lambda, --proteotypic, and --no-proteotypic." 541 | ) 542 | 543 | # Read all PSM files 544 | psms_list = [] 545 | for psm_file in psm_files: 546 | timestamped_echo("Info: Reading file %s." % psm_file) 547 | psm_tab = pd.read_pickle(psm_file) 548 | if psm_tab.shape[0] > 0: 549 | psms_list.append(psm_tab) 550 | psms = pd.concat(psms_list).reset_index(drop=True) 551 | psms["pp"] = 1 - psms["pep"] 552 | psms["modified_peptide"], transform_mass = unify_modified_peptide_masses( 553 | psms["modified_peptide"] 554 | ) 555 | 556 | timestamped_echo("Info: In total %s PSMs loaded." 
% psms.shape[0]) 557 | 558 | pepid = process_psms( 559 | psms, 560 | psmtsv, 561 | peptidetsv, 562 | psm_fdr_threshold, 563 | peptide_fdr_threshold, 564 | protein_fdr_threshold, 565 | pi0_lambda, 566 | peptide_plot_path, 567 | protein_plot_path, 568 | proteotypic, 569 | nofdr, 570 | ) 571 | 572 | # Get main path for figures 573 | main_path = os.path.dirname(os.path.abspath(peptide_plot_path)) 574 | 575 | # Generate set of best replicate identifications per run 576 | pepidr = pepid.loc[ 577 | pepid.groupby(["base_name", "modified_peptide", "precursor_charge"])[ 578 | "pp" 579 | ].idxmax() 580 | ].sort_index() 581 | 582 | aligned_runs = pepidr # this variable will store the aligned runs 583 | # Prepare reference iRT list (if enabled) 584 | if perform_rt_calibration: 585 | rt_reference_run_columns = ["modified_peptide", "precursor_charge", "irt"] 586 | 587 | if rt_referencefile is not None: 588 | # Read reference file if present 589 | rt_reference_run = pd.read_csv(rt_referencefile, index_col=False, sep="\t") 590 | if not set(rt_reference_run_columns).issubset(rt_reference_run.columns): 591 | raise click.ClickException( 592 | "Reference iRT file has wrong format. Requires columns 'modified_peptide', 'precursor_charge' and 'irt'." 593 | ) 594 | if rt_reference_run.shape[0] < 10: 595 | raise click.ClickException( 596 | "Reference iRT file has too few data points. Requires at least 10." 597 | ) 598 | else: 599 | # Select reference run 600 | pepidr_stats = ( 601 | pepidr.groupby("base_name")[["modified_peptide"]].count().reset_index() 602 | ) 603 | timestamped_echo(pepidr_stats) 604 | 605 | if rt_filter is not None: 606 | timestamped_echo( 607 | "Info: Filter candidate RT reference runs by tag '%s'." % rt_filter 608 | ) 609 | pepidr_stats = pepidr_stats[ 610 | pepidr_stats["base_name"].str.contains(rt_filter) 611 | ] 612 | timestamped_echo(pepidr_stats) 613 | 614 | rt_reference_run_base_name = pepidr_stats.loc[ 615 | pepidr_stats["modified_peptide"].idxmax() 616 | ]["base_name"] 617 | 618 | rt_reference_run = pepidr[ 619 | pepidr["base_name"] == rt_reference_run_base_name 620 | ].copy() 621 | 622 | # Normalize RT of reference run 623 | min_max_scaler = preprocessing.MinMaxScaler() 624 | rt_reference_run["irt"] = ( 625 | min_max_scaler.fit_transform(rt_reference_run[["retention_time"]]) * 100 626 | ) 627 | rt_reference_run[rt_reference_run_columns].to_csv( 628 | rt_reference_run_path, sep="\t", index=False 629 | ) 630 | 631 | # Normalize RT of all runs against reference 632 | aligned_runs = aligned_runs.groupby( 633 | "base_name", as_index=False, group_keys=False 634 | ).apply( 635 | lambda x: lowess( 636 | x, 637 | rt_reference_run, 638 | "retention_time", 639 | "irt", 640 | rt_lowess_frac, 641 | rt_psm_fdr_threshold, 642 | min_peptides, 643 | "easypqp_rt_alignment_" + x.name, 644 | main_path, 645 | ) 646 | ) 647 | 648 | else: # in this case no rt_calibration is performed, we just scale the retention time 649 | aligned_runs = pepidr 650 | min_max_scaler = preprocessing.MinMaxScaler() 651 | aligned_runs["irt"] = ( 652 | min_max_scaler.fit_transform(aligned_runs[["retention_time"]]) * 100 653 | ) 654 | 655 | # Determine if IM is present in the search data 656 | if pepidr["ion_mobility"].isnull().all(): 657 | enable_im = False 658 | else: 659 | enable_im = True 660 | 661 | if perform_im_calibration and enable_im: 662 | # Prepare reference IM list 663 | im_reference_run_columns = ["modified_peptide", "precursor_charge", "im"] 664 | 665 | if im_referencefile is not None: 666 | # Read reference file if 
present 667 | im_reference_run = pd.read_csv(im_referencefile, index_col=False, sep="\t") 668 | if not set(im_reference_run_columns).issubset(im_reference_run.columns): 669 | raise click.ClickException( 670 | "Reference IM file has wrong format. Requires columns 'modified_peptide', 'precursor_charge' and 'im'." 671 | ) 672 | if im_reference_run.shape[0] < 10: 673 | raise click.ClickException( 674 | "Reference IM file has too few data points. Requires at least 10." 675 | ) 676 | 677 | else: 678 | # Select reference run 679 | pepidr_stats = ( 680 | pepidr.groupby("base_name")[["modified_peptide"]].count().reset_index() 681 | ) 682 | timestamped_echo(pepidr_stats) 683 | 684 | if im_filter is not None: 685 | timestamped_echo( 686 | "Info: Filter candidate IM reference runs by tag '%s'." % im_filter 687 | ) 688 | pepidr_stats = pepidr_stats[ 689 | pepidr_stats["base_name"].str.contains(im_filter) 690 | ] 691 | timestamped_echo(pepidr_stats) 692 | 693 | im_reference_run_base_name = pepidr_stats.loc[ 694 | pepidr_stats["modified_peptide"].idxmax() 695 | ]["base_name"] 696 | 697 | im_reference_run = pepidr[ 698 | pepidr["base_name"] == im_reference_run_base_name 699 | ].copy() 700 | 701 | # Set IM of reference run 702 | im_reference_run["im"] = im_reference_run["ion_mobility"] 703 | im_reference_run[im_reference_run_columns].to_csv( 704 | im_reference_run_path, sep="\t", index=False 705 | ) 706 | 707 | # perform IM calibration 708 | aligned_runs = aligned_runs.groupby("base_name", as_index=False).apply( 709 | lambda x: lowess( 710 | x, 711 | im_reference_run, 712 | "ion_mobility", 713 | "im", 714 | im_lowess_frac, 715 | im_psm_fdr_threshold, 716 | min_peptides, 717 | "easypqp_im_alignment_" + x.name, 718 | main_path, 719 | ) 720 | ) 721 | 722 | elif enable_im: # if no calibration just transfer information as is 723 | aligned_runs["im"] = aligned_runs["ion_mobility"] 724 | else: 725 | pass 726 | 727 | pepida = aligned_runs 728 | 729 | if pepida.empty or "irt" not in pepida.columns: 730 | timestamped_echo( 731 | "Info: Not enough peptides could be found for alignment. There will be a blank spectral library." 732 | ) 733 | return 734 | 735 | # Remove peptides without valid iRT 736 | pepida = pepida.loc[np.isfinite(pepida["irt"])] 737 | 738 | # Remove peptides without valid IM 739 | if enable_im: 740 | pepida = pepida.loc[np.isfinite(pepida["im"])] 741 | else: 742 | pepida.loc[:, "im"] = np.nan 743 | 744 | if pepida.empty: 745 | timestamped_echo( 746 | "Info: Not enough peptides could be found for alignment. There will be a blank spectral library." 747 | ) 748 | return 749 | 750 | # Generate set of non-redundant global best replicate identifications 751 | pepidb = pepida.loc[ 752 | pepida.groupby(["modified_peptide", "precursor_charge"])["pp"].idxmax() 753 | ].sort_index() 754 | 755 | # Prepare ID mzML pairing 756 | peak_files = pd.DataFrame({"path": spectra}) 757 | peak_files["base_name"] = peak_files["path"].apply( 758 | lambda x: remove_rank_suffix(os.path.splitext(os.path.basename(x))[0]) 759 | ) 760 | 761 | # Parse mzXML to retrieve peaks and store results in peak files 762 | replicate_pqp = [] 763 | for idx, peak_file in peak_files.iterrows(): 764 | timestamped_echo("Info: Parsing file %s." 
% peak_file["path"]) 765 | meta_run = pepida[pepida["base_name"] == peak_file["base_name"]] 766 | if meta_run.shape[0] > 0: 767 | meta_global = pepidb[pepidb["base_name"] == peak_file["base_name"]] 768 | peaks = pd.read_pickle(peak_file["path"]) 769 | peaks["modified_peptide"], _ = unify_modified_peptide_masses( 770 | peaks["modified_peptide"], transform_mass 771 | ) 772 | # Generate run-specific PQP files for OpenSWATH alignment 773 | if consensus or ("_Q1" in peak_file["base_name"]): 774 | run_pqp = pd.merge( 775 | meta_run, 776 | peaks, 777 | on=["modified_peptide", "precursor_charge", "scan_id"], 778 | )[ 779 | [ 780 | "precursor_mz", 781 | "product_mz", 782 | "fragment", 783 | "intensity", 784 | "irt", 785 | "im", 786 | "protein_id", 787 | "gene_id", 788 | "peptide_sequence", 789 | "modified_peptide", 790 | "precursor_charge", 791 | ] 792 | ] 793 | run_pqp.columns = [ 794 | "PrecursorMz", 795 | "ProductMz", 796 | "Annotation", 797 | "LibraryIntensity", 798 | "NormalizedRetentionTime", 799 | "PrecursorIonMobility", 800 | "ProteinId", 801 | "GeneName", 802 | "PeptideSequence", 803 | "ModifiedPeptideSequence", 804 | "PrecursorCharge", 805 | ] 806 | run_pqp["PrecursorCharge"] = run_pqp["PrecursorCharge"].astype(int) 807 | run_pqp_path = os.path.splitext(peak_file["path"])[0] + "_run_peaks.tsv" 808 | run_pqp.to_csv(run_pqp_path, sep="\t", index=False) 809 | if consensus: 810 | replicate_pqp.append(run_pqp) 811 | 812 | # Generate global non-redundant PQP files 813 | if not consensus: 814 | global_pqp = pd.merge( 815 | meta_global, 816 | peaks, 817 | on=["modified_peptide", "precursor_charge", "scan_id"], 818 | )[ 819 | [ 820 | "precursor_mz", 821 | "product_mz", 822 | "fragment", 823 | "intensity", 824 | "irt", 825 | "im", 826 | "protein_id", 827 | "gene_id", 828 | "peptide_sequence", 829 | "modified_peptide", 830 | "precursor_charge", 831 | ] 832 | ] 833 | global_pqp.columns = [ 834 | "PrecursorMz", 835 | "ProductMz", 836 | "Annotation", 837 | "LibraryIntensity", 838 | "NormalizedRetentionTime", 839 | "PrecursorIonMobility", 840 | "ProteinId", 841 | "GeneName", 842 | "PeptideSequence", 843 | "ModifiedPeptideSequence", 844 | "PrecursorCharge", 845 | ] 846 | global_pqp["PrecursorCharge"] = global_pqp["PrecursorCharge"].astype( 847 | int 848 | ) 849 | replicate_pqp.append(global_pqp) 850 | 851 | # Aggregate consensus spectra 852 | pqp = pd.concat(replicate_pqp) 853 | if consensus: 854 | pqp_irt = ( 855 | pqp[ 856 | [ 857 | "ModifiedPeptideSequence", 858 | "PrecursorCharge", 859 | "NormalizedRetentionTime", 860 | "PrecursorIonMobility", 861 | ] 862 | ] 863 | .drop_duplicates() 864 | .groupby(["ModifiedPeptideSequence", "PrecursorCharge"])[ 865 | ["NormalizedRetentionTime", "PrecursorIonMobility"] 866 | ] 867 | .median() 868 | .reset_index() 869 | ) 870 | # group by modified peptide sequence before product m/z to avoid intermixing fragments of modified peptide positional isomers (e.g., T[80]PEPTIDE and TPEPT[80]IDE) 871 | pqp_mass = ( 872 | pqp.groupby( 873 | [ 874 | "PrecursorMz", 875 | "ModifiedPeptideSequence", 876 | "ProductMz", 877 | "Annotation", 878 | "ProteinId", 879 | "GeneName", 880 | "PeptideSequence", 881 | "PrecursorCharge", 882 | ], 883 | dropna=False, 884 | )["LibraryIntensity"] 885 | .median() 886 | .reset_index() 887 | ) 888 | pqp_mass = pqp_mass[ 889 | [ 890 | "PrecursorMz", 891 | "ProductMz", 892 | "Annotation", 893 | "ProteinId", 894 | "GeneName", 895 | "PeptideSequence", 896 | "ModifiedPeptideSequence", 897 | "PrecursorCharge", 898 | "LibraryIntensity", 899 | ] 900 | ] # 
rearrange columns back to the normal output order
 901 |         pqp = pd.merge(
 902 |             pqp_mass, pqp_irt, on=["ModifiedPeptideSequence", "PrecursorCharge"]
 903 |         )
 904 | 
 905 |     # Generate DIA-NN2 compatible PQP file
 906 |     if diannpqp:
 907 |         pqp["FragmentLossType"] = np.nan
 908 |         pqp["FragmentType"] = pqp["Annotation"].str[0]
     |         # extract the full fragment ordinal so multi-digit series numbers
     |         # (e.g. y12) are not truncated to their first digit
 909 |         pqp["FragmentSeriesNumber"] = pqp["Annotation"].str.extract(
     |             r"(\d+)", expand=False
     |         ).astype(int)
 910 |         pqp["FragmentCharge"] = pqp["Annotation"].str.split("^").str[1].astype(int)
     |         # a peptide shared by several proteins (semicolon-separated IDs) is not proteotypic
 911 |         pqp["Proteotypic"] = [
 912 |             0 if ";" in prot_id else 1 for prot_id in pqp["ProteinId"]
 913 |         ]
 914 |         # Remove redundant columns
 915 |         pqp = pqp.drop(["Annotation"], axis=1)
 916 | 
 917 |     # Write output TSV file
 918 |     pqp.to_csv(outfile, sep="\t", index=False)
 919 | 
--------------------------------------------------------------------------------
/easypqp/sage.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import re
4 | import numpy as np
5 | import pandas as pd
6 | from typing import Optional, Tuple, Dict, List
7 | 
8 | from .util import timestamped_echo
9 | from .convert import unimod as UniModHelper
10 | 
11 | 
12 | def _basename_wo_ext(p: str) -> str:
13 |     """
14 |     Return the basename without extensions, handling common compression/archive
15 |     extensions so that e.g. 'file.mzML.gz' -> 'file'.
16 | 
17 |     Strategy:
18 |     - take the basename
19 |     - if the final extension is a known compression/archive suffix ('.gz', '.bz2',
20 |       '.zst', '.tgz', etc.) remove it
21 |     - then remove one remaining extension (the common data file extension)
22 |     - return the resulting stem
23 |     """
24 |     name = os.path.basename(p or "")
   |     # keep this set in sync with the suffixes named in the docstring above
25 |     comp_suffixes = {
26 |         ".gz",
   |         ".bz2",
27 |         ".zst",
   |         ".tgz",
28 |         ".tar",
29 |     }
30 | 
31 |     root, ext = os.path.splitext(name)
32 |     if ext and ext.lower() in comp_suffixes:
33 |         name = root
34 | 
35 |     stem, _ = os.path.splitext(name)
36 |     return stem
37 | 
38 | 
39 | def _get_first_existing(df: pd.DataFrame, cols: List[str], cast=None, default=None):
40 |     """
41 |     Return the first existing column from a DataFrame as a pandas Series, with optional numeric casting,
42 |     or a Series filled with a default value if none of the columns exist.
43 | 
44 |     Parameters
45 |     ----------
46 |     df : pandas.DataFrame
47 |         The DataFrame to search for columns.
48 |     cols : list[str]
49 |         Ordered list of column names to look for. The function returns the first name in this list
50 |         that is present in df.columns.
51 |     cast : Any, optional
52 |         If None (the default), the matched column is returned unchanged (the Series as stored in df).
53 |         If not None, the matched column is converted to numeric using pandas.to_numeric(..., errors="coerce")
54 |         before being returned. Note: the provided value is used only as a flag; it is not called/applied.
55 |     default : Any, optional
56 |         If no column from cols is present in df and default is None, the function returns None.
57 |         If default is not None, the function returns a pandas.Series of length len(df) where every
58 |         element equals default.
59 | 
60 |     Returns
61 |     -------
62 |     pandas.Series or None
63 |         - If a matching column is found: the corresponding Series from df (possibly converted to numeric).
64 |         - If no matching column is found and default is provided: a Series filled with default values.
65 |         - If no matching column is found and default is None: None.
66 | 
67 |     Notes
68 |     -----
69 |     - Column lookup is an exact string membership check against df.columns.
70 |     - When cast is not None, non-convertible values in the selected column become NaN due to
71 |       errors="coerce" in pandas.to_numeric.
72 |     - The function does not modify the input DataFrame.
73 |     - If df is empty and default is provided, an empty Series (length 0) of the default value is returned.
74 | 
75 |     Examples
76 |     --------
77 |     - If cols = ["a", "b"] and df has column "b" but not "a", the function returns df["b"] (or its numeric cast).
78 |     - If none of the cols exist and default=0, the function returns a Series of zeros with length len(df).
79 |     """
80 |     for c in cols:
81 |         if c in df.columns:
82 |             return df[c] if cast is None else pd.to_numeric(df[c], errors="coerce")
83 |     if default is None:
84 |         return None
85 |     return pd.Series([default] * len(df))
86 | 
87 | 
88 | def _read_table(path: str) -> pd.DataFrame:
89 |     """Read a TSV or Parquet file into a DataFrame; raise for unsupported extensions."""
90 |     p = (path or "").lower()
91 |     if p.endswith((".parquet", ".pq")):
92 |         try:
93 |             return pd.read_parquet(path)
94 |         except Exception as e:
95 |             raise RuntimeError(f"Failed to read Parquet file: {path}\n{e}")
96 |     if p.endswith(".tsv"):
97 |         try:
98 |             return pd.read_csv(path, sep="\t", dtype=str)
99 |         except Exception as e:
100 |             raise RuntimeError(f"Failed to read TSV file: {path}\n{e}")
    |     # fail loudly instead of silently returning None for unknown extensions
    |     raise ValueError(f"Unsupported input file extension: {path} (expected .tsv, .parquet or .pq)")
101 | 
102 | 
103 | class SagePSMParser:
104 |     """
105 |     Parse results.sage.tsv to EasyPQP PSM schema (subset used by library.generate)
106 | 
107 |     Output columns:
108 |         run_id, scan_id, hit_rank, massdiff, precursor_charge, retention_time,
109 |         ion_mobility, peptide_sequence, protein_id, gene_id, num_tot_proteins,
110 |         decoy, pep, modified_peptide, group_id, precursor_mz
111 |     """
112 | 
113 |     PROTON = 1.0072764
114 |     NEUTRON = 1.00335
115 |     # Sage bracket delta pattern: A[+15.9949], C[-0.9840], etc.
116 |     BRACKET_RE = re.compile(r"([A-Z])\[(?P<delta>[+-]?\d+(?:\.\d+)?)\]")
117 |     # Uniprot token pattern: db|ACCESSION|ENTRY_NAME (e.g., sp|P01903|DRA_HUMAN)
118 |     _ACC_ENTRY_RE = re.compile(r"^[A-Za-z]{2}\|(?P<acc>[^|]+)\|(?P<entry>[^|]+)$")
119 |     # Common decoy prefixes occasionally carried into protein tokens (we still rely on label for decoy)
120 |     _DECOY_PREFIX_RE = re.compile(r"^(?:decoy_|rev_)+", flags=re.IGNORECASE)
121 | 
122 |     def __init__(
123 |         self,
124 |         results_tsv: str,
125 |         unimod_xml: Optional[str],
126 |         max_delta_unimod: float = 0.02,
127 |         mz_precision_digits: int = 6,
128 |     ):
129 |         self.results_tsv = results_tsv
130 |         self.um = UniModHelper(unimod_xml, max_delta_unimod) if unimod_xml else None
131 |         self.max_delta_unimod = max_delta_unimod
132 |         self.mz_precision_digits = mz_precision_digits
133 | 
134 |     @staticmethod
135 |     def _uniq_preserve(seq):
136 |         """De-duplicate while preserving order."""
137 |         seen = set()
138 |         out = []
139 |         for x in seq:
140 |             if x not in seen:
141 |                 seen.add(x)
142 |                 out.append(x)
143 |         return out
144 | 
145 |     def _clean_token(self, tok: str) -> str:
146 |         """Strip decoy prefixes and whitespace from an individual protein token."""
147 |         tok = (tok or "").strip()
148 |         return self._DECOY_PREFIX_RE.sub("", tok)
149 | 
150 |     def _parse_protein_token(self, tok: str) -> Tuple[str, str]:
151 |         """
152 |         Extract (accession, entry_name) from a single token.
153 |         Falls back gracefully if format isn't db|ACC|ENTRY.
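   | 
   |         Illustrative examples (accession as in tests/data/Q99536.fasta):
   | 
   |             'sp|Q99536|VAT1_HUMAN'     -> ('Q99536', 'VAT1_HUMAN')
   |             'rev_sp|Q99536|VAT1_HUMAN' -> ('Q99536', 'VAT1_HUMAN')   # decoy prefix stripped
   |             'Q99536'                   -> ('Q99536', '')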
154 | """ 155 | t = self._clean_token(tok) 156 | m = self._ACC_ENTRY_RE.match(t) 157 | if m: 158 | return m.group("acc"), m.group("entry") 159 | # Fallbacks: 160 | if "|" in t: 161 | parts = t.split("|") 162 | if len(parts) >= 3: 163 | return parts[1] or "", parts[2] or "" 164 | # unknown pipe-y format: best-effort 165 | return parts[-1] or "", "" 166 | # No pipes at all: treat token as accession-only 167 | return t, "" 168 | 169 | def _split_accessions_and_entries( 170 | self, proteins: pd.Series 171 | ) -> Tuple[pd.Series, pd.Series, pd.Series]: 172 | """ 173 | Vectorized split of Sage protein strings into: 174 | - accessions (semicolon-joined) 175 | - entry names (semicolon-joined) 176 | - count of unique accessions (for num_tot_proteins) 177 | """ 178 | acc_list = [] 179 | entry_list = [] 180 | counts = [] 181 | for s in proteins.astype(str): 182 | if not s or s == "nan": 183 | accs, entries = [], [] 184 | else: 185 | toks = [t for t in s.split(";") if t.strip()] 186 | pairs = [self._parse_protein_token(t) for t in toks] 187 | accs = self._uniq_preserve([a for a, _ in pairs if a]) 188 | entries = self._uniq_preserve([e for _, e in pairs if e]) 189 | 190 | acc_list.append(";".join(accs)) 191 | entry_list.append(";".join(entries)) 192 | counts.append(len(accs)) 193 | 194 | return pd.Series(acc_list), pd.Series(entry_list), pd.Series(counts) 195 | 196 | def _annotate_unimod(self, pep: str) -> str: 197 | """ 198 | Convert Sage bracket deltas (e.g., M[+15.9949]) to (UniMod:). 199 | Tries position-specific contexts (N-term / C-term) before 'Anywhere'. 200 | Falls back to leaving the numeric delta if nothing matches. 201 | """ 202 | if self.um is None or "[" not in pep: 203 | return pep 204 | 205 | # 1) get clean sequence and site->delta map from Sage string 206 | seq = re.sub(r"\[[-+0-9.]+\]", "", pep) 207 | site2delta: Dict[int, float] = {} 208 | site = 0 209 | i = 0 210 | while i < len(pep): 211 | ch = pep[i] 212 | if ch.isalpha(): 213 | site += 1 214 | i += 1 215 | if i < len(pep) and pep[i] == "[": 216 | j = pep.find("]", i + 1) 217 | site2delta[site] = float(pep[i + 1 : j]) 218 | i = j + 1 219 | else: 220 | i += 1 221 | 222 | # 2) position preference helper 223 | def positions_for_site(idx: int, length: int): 224 | if idx == 1: 225 | # try N-terminus flavors first, then Anywhere 226 | return ["Any N-term", "Protein N-term", "Anywhere"] 227 | if idx == length: 228 | # try C-terminus flavors first, then Anywhere 229 | return ["Any C-term", "Protein C-term", "Anywhere"] 230 | return ["Anywhere"] 231 | 232 | # 3) very small fallback table for the most common N-term losses 233 | # (used only if UniMod lookup fails) 234 | def fallback_unimod(aa: str, idx: int, delta: float, tol=0.02) -> int: 235 | if idx == 1 and aa == "Q" and abs(delta - (-17.026549)) <= tol: 236 | return 28 # Gln->pyro-Glu (N-term) 237 | if idx == 1 and aa == "E" and abs(delta - (-18.010565)) <= tol: 238 | return 27 # Glu->pyro-Glu (N-term) 239 | return -1 240 | 241 | # 4) build output by injecting (UniMod:) after the modified residue 242 | out = list(seq) 243 | L = len(seq) 244 | for idx in sorted(site2delta.keys(), reverse=True): 245 | delta = site2delta[idx] 246 | aa = seq[idx - 1] 247 | rec_id = -1 248 | 249 | # Try position-specific contexts first 250 | for pos in positions_for_site(idx, L): 251 | rid = self.um.get_id(aa, pos, delta) 252 | if isinstance(rid, tuple): 253 | rid = rid[0] 254 | if rid != -1: 255 | rec_id = rid 256 | break 257 | 258 | # Fallback: known N-term conversions (pyro-Glu/Q,E) 259 | if rec_id == 
260 |                 rec_id = fallback_unimod(aa, idx, delta, self.max_delta_unimod)
261 | 
262 |             insert = f"(UniMod:{rec_id})" if rec_id != -1 else f"[{delta:+.6f}]"
263 |             out.insert(idx, insert)
264 | 
265 |         return "".join(out)
266 | 
267 |     def parse(self) -> pd.DataFrame:
268 |         df = _read_table(self.results_tsv).fillna("")
269 | 
270 |         filename = _get_first_existing(
271 |             df, ["filename", "file", "rawfile", "raw_file", "source_file"]
272 |         )
273 |         if filename is None:
274 |             raise ValueError("results.sage.tsv is missing a filename/raw file column.")
275 |         run_id = filename.astype(str).apply(_basename_wo_ext)
276 | 
277 |         scan_id = (
278 |             _get_first_existing(
279 |                 df,
280 |                 ["scannr", "scan", "scan_id", "spectrum_index"],
281 |                 cast=float,
282 |                 default=np.nan,
283 |             )
284 |             .fillna(1)
285 |             .astype(int)
286 |         )
287 |         hit_rank = (
288 |             _get_first_existing(df, ["rank", "hit_rank"], cast=float, default=1)
289 |             .fillna(1)
290 |             .astype(int)
291 |         )
292 |         z = (
293 |             _get_first_existing(
294 |                 df, ["precursor_charge", "charge", "z"], cast=float, default=2
295 |             )
296 |             .fillna(2)
297 |             .astype(int)
298 |         )
299 | 
300 |         rt = _get_first_existing(
301 |             df,
302 |             ["rt", "retention_time", "retention", "retention_time_sec"],
303 |             cast=float,
304 |             default=np.nan,
305 |         )
306 |         im = _get_first_existing(
307 |             df, ["ion_mobility", "mobility", "ccs", "k0"], cast=float, default=np.nan
308 |         )
309 |         # If im is all 0s, set to NaN
310 |         if im.eq(0).all():
311 |             im = pd.Series([np.nan] * len(df))
312 | 
313 |         pep_seq = df["peptide"].astype(str)
314 |         proteins_raw = _get_first_existing(df, ["proteins", "protein", "protein_id"])
315 |         proteins_raw = (
316 |             proteins_raw.astype(str)
317 |             if proteins_raw is not None
318 |             else pd.Series([""] * len(df))
319 |         )
320 |         protein_ids, gene_ids, num_prot = self._split_accessions_and_entries(
321 |             proteins_raw
322 |         )
323 | 
324 |         if "label" in df.columns:
325 |             # decoy detection from label
326 |             # Sage TSV: label == -1 (decoy), +1 (target)
327 |             label_series = pd.to_numeric(df["label"], errors="coerce")
328 |             decoy = label_series == -1
329 |         elif "is_decoy" in df.columns:
330 |             # The parquet format uses a boolean is_decoy column
331 |             decoy = df["is_decoy"]
    |         else:
    |             # neither column present: assume targets only (mirrors parse_df below)
    |             decoy = pd.Series([False] * len(df))
332 | 
333 |         # posterior error probability plus spectrum-, peptide- and protein-level q-values
334 |         pep = (
335 |             pd.to_numeric(df["posterior_error"], errors="coerce")
336 |             if "posterior_error" in df.columns
337 |             else pd.Series([np.nan] * len(df))
338 |         )
339 |         spectrum_q = (
340 |             pd.to_numeric(df["spectrum_q"], errors="coerce")
341 |             if "spectrum_q" in df.columns
342 |             else pd.Series([np.nan] * len(df))
343 |         )
344 |         peptide_q = (
345 |             pd.to_numeric(df["peptide_q"], errors="coerce")
346 |             if "peptide_q" in df.columns
347 |             else pd.Series([np.nan] * len(df))
348 |         )
349 |         protein_q = (
350 |             pd.to_numeric(df["protein_q"], errors="coerce")
351 |             if "protein_q" in df.columns
352 |             else pd.Series([np.nan] * len(df))
353 |         )
354 | 
355 |         # compute precursor m/z from the neutral mass, using the theoretically calculated peptide mass
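   |         # Worked example (illustrative): for calcmass = 1926.08 Da at z = 3,
   |         # prec_mz = (1926.08 + 3 * 1.0072764) / 3 ≈ 643.0339 m/z
   |         # (values as in tests/data/results.sage.tsv).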
356 | calcmass = _get_first_existing(df, ["calcmass"], cast=float, default=np.nan) 357 | prec_mz = pd.Series(np.nan, index=df.index, dtype=float) 358 | mask_calc = calcmass.notna() & (z > 0) 359 | prec_mz.loc[mask_calc] = (calcmass[mask_calc] + z[mask_calc] * self.PROTON) / z[ 360 | mask_calc 361 | ] 362 | 363 | ## If we wanted to compute from experimental mass instead: 364 | # expmass = _get_first_existing(df, ['expmass'], cast=float, default=np.nan) 365 | # iso_err = _get_first_existing( 366 | # df, ["isotope_error", "isotope"], cast=float, default=0.0 367 | # ).fillna(0.0) 368 | # mask_exp = prec_mz.isna() & expmass.notna() & (z > 0) 369 | # mz_exp = (expmass[mask_exp] + z[mask_exp] * PROTON) / z[mask_exp] 370 | # prec_mz.loc[mask_exp] = mz_exp - (iso_err[mask_exp] * NEUTRON) / z[mask_exp] 371 | 372 | ## set precision 373 | prec_mz = prec_mz.round(self.mz_precision_digits) 374 | 375 | # modified peptide 376 | modpep = pep_seq.apply(self._annotate_unimod) 377 | 378 | # group id (same style as convert paths) 379 | group_id = ( 380 | run_id 381 | + "_" 382 | + scan_id.astype(str) 383 | + np.where(hit_rank > 1, "_rank" + hit_rank.astype(str), "") 384 | ) 385 | 386 | out = pd.DataFrame( 387 | { 388 | "run_id": run_id, 389 | "scan_id": scan_id, 390 | "hit_rank": hit_rank, 391 | "massdiff": 0.0, 392 | "precursor_charge": z, 393 | "retention_time": rt, 394 | "ion_mobility": im, 395 | "peptide_sequence": pep_seq.str.replace( 396 | r"\[[-+0-9.]+\]", "", regex=True 397 | ), 398 | "protein_id": protein_ids.fillna(""), 399 | "gene_id": gene_ids.fillna(""), 400 | "num_tot_proteins": num_prot.fillna(0).astype(int), 401 | "decoy": decoy.astype(bool), 402 | "modified_peptide": modpep, 403 | "group_id": group_id, 404 | "precursor_mz": prec_mz, 405 | "pep": pep, 406 | "q_value": spectrum_q, 407 | "peptide_q": peptide_q, 408 | "protein_q": protein_q, 409 | } 410 | ) 411 | return out 412 | 413 | def parse_df( 414 | self, df: pd.DataFrame, psm_id_series: Optional[pd.Series] = None 415 | ) -> pd.DataFrame: 416 | """ 417 | Parse a provided DataFrame slice (same logic as `parse` but works on an 418 | already-loaded DataFrame). This is useful for chunked/streaming flows. 419 | 420 | If `psm_id_series` is provided it will be attached to the returned 421 | DataFrame as a `psm_id` column (preserving positional alignment). 
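   | 
   |         Illustrative example (hypothetical `parser` and `chunk` names), carrying
   |         psm_id through a chunk:
   | 
   |             out = parser.parse_df(chunk, psm_id_series=chunk["psm_id"])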
422 | """ 423 | df = df.fillna("") 424 | 425 | filename = _get_first_existing( 426 | df, ["filename", "file", "rawfile", "raw_file", "source_file"] 427 | ) 428 | if filename is None: 429 | raise ValueError("results.sage.tsv is missing a filename/raw file column.") 430 | run_id = filename.astype(str).apply(_basename_wo_ext) 431 | 432 | scan_id = ( 433 | _get_first_existing( 434 | df, 435 | ["scannr", "scan", "scan_id", "spectrum_index"], 436 | cast=float, 437 | default=np.nan, 438 | ) 439 | .fillna(1) 440 | .astype(int) 441 | ) 442 | hit_rank = ( 443 | _get_first_existing(df, ["rank", "hit_rank"], cast=float, default=1) 444 | .fillna(1) 445 | .astype(int) 446 | ) 447 | z = ( 448 | _get_first_existing( 449 | df, ["precursor_charge", "charge", "z"], cast=float, default=2 450 | ) 451 | .fillna(2) 452 | .astype(int) 453 | ) 454 | 455 | rt = _get_first_existing( 456 | df, 457 | ["rt", "retention_time", "retention", "retention_time_sec"], 458 | cast=float, 459 | default=np.nan, 460 | ) 461 | im = _get_first_existing( 462 | df, ["ion_mobility", "mobility", "ccs", "k0"], cast=float, default=np.nan 463 | ) 464 | # If im is all 0s, set to NaN 465 | if im.eq(0).all(): 466 | im = pd.Series([np.nan] * len(df)) 467 | 468 | pep_seq = df["peptide"].astype(str) 469 | proteins_raw = _get_first_existing(df, ["proteins", "protein", "protein_id"]) 470 | proteins_raw = ( 471 | proteins_raw.astype(str) 472 | if proteins_raw is not None 473 | else pd.Series([""] * len(df)) 474 | ) 475 | protein_ids, gene_ids, num_prot = self._split_accessions_and_entries( 476 | proteins_raw 477 | ) 478 | 479 | if "label" in df.columns: 480 | label_series = pd.to_numeric(df["label"], errors="coerce") 481 | decoy = label_series == -1 482 | elif "is_decoy" in df.columns: 483 | decoy = df["is_decoy"] 484 | else: 485 | decoy = pd.Series([False] * len(df)) 486 | 487 | pep = ( 488 | pd.to_numeric(df["posterior_error"], errors="coerce") 489 | if "posterior_error" in df.columns 490 | else pd.Series([np.nan] * len(df)) 491 | ) 492 | spectrum_q = ( 493 | pd.to_numeric(df["spectrum_q"], errors="coerce") 494 | if "spectrum_q" in df.columns 495 | else pd.Series([np.nan] * len(df)) 496 | ) 497 | peptide_q = ( 498 | pd.to_numeric(df["peptide_q"], errors="coerce") 499 | if "peptide_q" in df.columns 500 | else pd.Series([np.nan] * len(df)) 501 | ) 502 | protein_q = ( 503 | pd.to_numeric(df["protein_q"], errors="coerce") 504 | if "protein_q" in df.columns 505 | else pd.Series([np.nan] * len(df)) 506 | ) 507 | 508 | calcmass = _get_first_existing(df, ["calcmass"], cast=float, default=np.nan) 509 | prec_mz = pd.Series(np.nan, index=df.index, dtype=float) 510 | mask_calc = calcmass.notna() & (z > 0) 511 | prec_mz.loc[mask_calc] = (calcmass[mask_calc] + z[mask_calc] * self.PROTON) / z[ 512 | mask_calc 513 | ] 514 | prec_mz = prec_mz.round(self.mz_precision_digits) 515 | 516 | modpep = pep_seq.apply(self._annotate_unimod) 517 | 518 | group_id = ( 519 | run_id 520 | + "_" 521 | + scan_id.astype(str) 522 | + np.where(hit_rank > 1, "_rank" + hit_rank.astype(str), "") 523 | ) 524 | 525 | out = pd.DataFrame( 526 | { 527 | "run_id": run_id, 528 | "scan_id": scan_id, 529 | "hit_rank": hit_rank, 530 | "massdiff": 0.0, 531 | "precursor_charge": z, 532 | "retention_time": rt, 533 | "ion_mobility": im, 534 | "peptide_sequence": pep_seq.str.replace( 535 | r"\[[-+0-9.]+\]", "", regex=True 536 | ), 537 | "protein_id": protein_ids.fillna(""), 538 | "gene_id": gene_ids.fillna(""), 539 | "num_tot_proteins": num_prot.fillna(0).astype(int), 540 | "decoy": 
decoy.astype(bool),
541 |                 "modified_peptide": modpep,
542 |                 "group_id": group_id,
543 |                 "precursor_mz": prec_mz,
544 |                 "pep": pep,
545 |                 "q_value": spectrum_q,
546 |                 "peptide_q": peptide_q,
547 |                 "protein_q": protein_q,
548 |             }
549 |         )
550 | 
551 |         if psm_id_series is not None:
552 |             out = out.reset_index(drop=True)
553 |             out["psm_id"] = psm_id_series.astype(str).str.strip().reset_index(drop=True)
554 | 
555 |         return out
556 | 
557 | 
558 | class SageFragmentParser:
559 |     """
560 |     Parse matched_fragments.sage.tsv to EasyPQP 'peaks' table:
561 |     columns: run_id, scan_id, modified_peptide, precursor_charge, precursor_mz, fragment, product_mz, intensity
562 |     """
563 | 
564 |     def __init__(self, frags_tsv: str, mz_precision_digits: int = 6):
565 |         self.frags_tsv = frags_tsv
566 |         self.mz_precision_digits = mz_precision_digits
567 | 
568 |     @staticmethod
569 |     def _ann(ftype: str, ord_: int, z: int) -> str:
570 |         return f"{ftype}{ord_}^{z}"
571 | 
572 |     def parse(self, psms_with_psmid: pd.DataFrame) -> pd.DataFrame:
573 |         fr = _read_table(self.frags_tsv).fillna("")
574 | 
575 |         for c in [
576 |             "psm_id",
577 |             "fragment_ordinals",
578 |             "fragment_charge",
579 |             "fragment_mz_calculated",
580 |             "fragment_mz_experimental",
581 |             "fragment_intensity",
582 |         ]:
583 |             if c in fr.columns:
584 |                 fr[c] = pd.to_numeric(fr[c], errors="coerce")
585 |         if "psm_id" not in fr.columns:
586 |             raise ValueError(
587 |                 "matched_fragments.sage.tsv must contain a 'psm_id' column."
588 |             )
589 | 
590 |         fr["psm_id"] = fr["psm_id"].astype(str).str.strip()
591 | 
592 |         fr["fragment"] = fr.apply(
593 |             lambda r: self._ann(
594 |                 str(r["fragment_type"]),
595 |                 int(r["fragment_ordinals"]),
596 |                 int(r["fragment_charge"]),
597 |             ),
598 |             axis=1,
599 |         )
600 |         fr["product_mz"] = fr["fragment_mz_calculated"]
601 | 
602 |         # join to PSMs
603 |         join_cols = [
604 |             "psm_id",
605 |             "scan_id",
606 |             "modified_peptide",
607 |             "precursor_mz",
608 |             "precursor_charge",
609 |             "run_id",
610 |         ]
611 |         j = fr.merge(psms_with_psmid[join_cols], on="psm_id", how="inner")
612 | 
613 |         peaks = j[
614 |             [
615 |                 "run_id",
616 |                 "scan_id",
617 |                 "modified_peptide",
618 |                 "precursor_charge",
619 |                 "precursor_mz",
620 |                 "fragment",
621 |                 "product_mz",
622 |                 "fragment_intensity",
623 |             ]
624 |         ].copy()
625 |         peaks.rename(columns={"fragment_intensity": "intensity"}, inplace=True)
626 | 
627 |         # per-PSM normalization to 10,000 (matches convert paths)
628 |         peaks["intensity"] = peaks["intensity"].fillna(0.0)
629 |         grp = peaks.groupby(
630 |             ["run_id", "scan_id", "modified_peptide", "precursor_charge"], dropna=False
631 |         )["intensity"]
632 |         denom = grp.transform(lambda x: np.nanmax(x.values) if len(x) else np.nan)
633 |         peaks["intensity"] = (peaks["intensity"] / denom) * 10000.0
634 |         peaks["intensity"] = peaks["intensity"].fillna(0.0)
635 | 
636 |         # round and de-duplicate (keep most intense per exact fragment/product_mz)
637 |         peaks["product_mz"] = peaks["product_mz"].round(self.mz_precision_digits)
638 |         peaks["precursor_mz"] = peaks["precursor_mz"].round(self.mz_precision_digits)
639 |         peaks["intensity"] = peaks["intensity"].round(self.mz_precision_digits)
640 | 
641 |         peaks = peaks.groupby(
642 |             [
643 |                 "run_id",
644 |                 "scan_id",
645 |                 "modified_peptide",
646 |                 "precursor_charge",
647 |                 "precursor_mz",
648 |                 "fragment",
649 |                 "product_mz",
650 |             ],
651 |             as_index=False,
652 |         )["intensity"].max()
653 |         return peaks
654 | 
655 |     def parse_df(self, fr: pd.DataFrame, psms_with_psmid: pd.DataFrame) -> pd.DataFrame:
656 |         """
657 |         Parse a fragments DataFrame (filtered to 
relevant rows) and join to the 658 | provided PSM DataFrame. This mirrors `parse` but operates on in-memory 659 | DataFrames to support streaming. 660 | """ 661 | fr = fr.fillna("") 662 | for c in [ 663 | "psm_id", 664 | "fragment_ordinals", 665 | "fragment_charge", 666 | "fragment_mz_calculated", 667 | "fragment_mz_experimental", 668 | "fragment_intensity", 669 | ]: 670 | if c in fr.columns: 671 | fr[c] = pd.to_numeric(fr[c], errors="coerce") 672 | if "psm_id" not in fr.columns: 673 | raise ValueError( 674 | "matched_fragments.sage.tsv must contain a 'psm_id' column." 675 | ) 676 | 677 | fr["psm_id"] = fr["psm_id"].astype(str).str.strip() 678 | 679 | fr["fragment"] = fr.apply( 680 | lambda r: self._ann( 681 | str(r["fragment_type"]), 682 | int(r["fragment_ordinals"]), 683 | int(r["fragment_charge"]), 684 | ), 685 | axis=1, 686 | ) 687 | fr["product_mz"] = fr["fragment_mz_calculated"] 688 | 689 | join_cols = [ 690 | "psm_id", 691 | "scan_id", 692 | "modified_peptide", 693 | "precursor_mz", 694 | "precursor_charge", 695 | "run_id", 696 | ] 697 | j = fr.merge(psms_with_psmid[join_cols], on="psm_id", how="inner") 698 | 699 | peaks = j[ 700 | [ 701 | "run_id", 702 | "scan_id", 703 | "modified_peptide", 704 | "precursor_charge", 705 | "precursor_mz", 706 | "fragment", 707 | "product_mz", 708 | "fragment_intensity", 709 | ] 710 | ].copy() 711 | peaks.rename(columns={"fragment_intensity": "intensity"}, inplace=True) 712 | 713 | peaks["intensity"] = peaks["intensity"].fillna(0.0) 714 | grp = peaks.groupby( 715 | ["run_id", "scan_id", "modified_peptide", "precursor_charge"], dropna=False 716 | )["intensity"] 717 | denom = grp.transform(lambda x: np.nanmax(x.values) if len(x) else np.nan) 718 | peaks["intensity"] = (peaks["intensity"] / denom) * 10000.0 719 | peaks["intensity"] = peaks["intensity"].fillna(0.0) 720 | 721 | peaks["product_mz"] = peaks["product_mz"].round(self.mz_precision_digits) 722 | peaks["precursor_mz"] = peaks["precursor_mz"].round(self.mz_precision_digits) 723 | peaks["intensity"] = peaks["intensity"].round(self.mz_precision_digits) 724 | 725 | peaks = peaks.groupby( 726 | [ 727 | "run_id", 728 | "scan_id", 729 | "modified_peptide", 730 | "precursor_charge", 731 | "precursor_mz", 732 | "fragment", 733 | "product_mz", 734 | ], 735 | as_index=False, 736 | )["intensity"].max() 737 | return peaks 738 | 739 | 740 | def convert_sage( 741 | results_tsv: str, 742 | fragments_tsv: str, 743 | unimod_xml: Optional[str], 744 | max_delta_unimod: float = 0.02, 745 | mz_precision_digits: int = 6, 746 | *, 747 | force_streaming: Optional[bool] = None, 748 | streaming_threshold_bytes: int = 1_000_000_000, 749 | ) -> Optional[List[str]]: 750 | """ 751 | High-level conversion: Sage TSV/Parquet to EasyPQP PSM and peaks pickles written to disk. 752 | """ 753 | # Auto-switch to streaming mode when inputs are very large, unless caller 754 | # explicitly requested non-streaming via force_streaming=False. 
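   |     # Illustrative calls (hypothetical paths): with the default threshold, two
   |     # inputs totalling >= 1 GB stream automatically; passing force_streaming
   |     # overrides the heuristic in either direction, e.g.
   |     #
   |     #     convert_sage("results.sage.tsv", "matched_fragments.sage.tsv",
   |     #                  unimod_xml=None, force_streaming=False)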
755 |     try:
756 |         if force_streaming is None:
757 |             # determine combined input size (a file whose size cannot be read
758 |             # counts as 0, biasing toward the in-memory path)
758 |             try:
759 |                 rsize = os.path.getsize(results_tsv)
760 |             except Exception:
761 |                 rsize = 0
762 |             try:
763 |                 fsize = os.path.getsize(fragments_tsv)
764 |             except Exception:
765 |                 fsize = 0
766 |             use_stream = (rsize + fsize) >= streaming_threshold_bytes
767 |         else:
768 |             use_stream = bool(force_streaming)
769 |     except Exception:
770 |         use_stream = False
771 | 
772 |     if use_stream:
773 |         timestamped_echo("Info: Using streaming Sage conversion for low memory usage")
774 |         return convert_sage_streaming(
775 |             results_tsv,
776 |             fragments_tsv,
777 |             unimod_xml,
778 |             max_delta_unimod=max_delta_unimod,
779 |             mz_precision_digits=mz_precision_digits,
780 |         )
781 | 
782 |     # Read raw to extract psm_id for joining
783 |     timestamped_echo("Info: Reading Sage PSMs")
784 |     raw_res = _read_table(results_tsv)
785 |     if "psm_id" not in raw_res.columns:
786 |         raise ValueError(
787 |             "results.sage.tsv must contain a 'psm_id' for joining with matched fragments."
788 |         )
789 | 
790 |     raw_res["psm_id"] = raw_res["psm_id"].astype(str).str.strip()
791 | 
792 |     psms = SagePSMParser(
793 |         results_tsv, unimod_xml, max_delta_unimod, mz_precision_digits
794 |     ).parse()
795 |     psms = raw_res[["psm_id"]].join(psms)
796 | 
797 |     if psms.empty:
798 |         raise ValueError("No PSMs were parsed from the provided results.sage.tsv file.")
799 | 
800 |     timestamped_echo("Info: Reading Sage matched fragment peaks")
801 |     peaks = SageFragmentParser(fragments_tsv, mz_precision_digits).parse(psms)
802 | 
803 |     if peaks.empty:
804 |         raise ValueError(
805 |             "No fragment peaks were parsed from the provided matched_fragments.sage.tsv file."
806 |         )
807 | 
808 |     # Trim to minimal schema expected by library.generate
809 |     keep = [
810 |         "run_id",
811 |         "scan_id",
812 |         "hit_rank",
813 |         "massdiff",
814 |         "precursor_charge",
815 |         "retention_time",
816 |         "ion_mobility",
817 |         "peptide_sequence",
818 |         "protein_id",
819 |         "gene_id",
820 |         "num_tot_proteins",
821 |         "decoy",
822 |         "modified_peptide",
823 |         "group_id",
824 |         "pep",
825 |         "q_value",
826 |         "peptide_q",
827 |         "protein_q",
828 |     ]
829 |     psms_export = psms[keep].copy()
830 | 
831 |     runs = sorted(psms_export["run_id"].dropna().unique().tolist())
832 |     new_infiles = []
833 |     for run in runs:
834 |         psms_r = psms_export.loc[psms_export["run_id"] == run]
835 |         peaks_r = (
836 |             peaks.loc[peaks["run_id"] == run] if "run_id" in peaks.columns else peaks
837 |         )
838 | 
839 |         if psms_r.empty or peaks_r.empty:
840 |             timestamped_echo(
841 |                 f"Info: Skipping run {run}: psms={len(psms_r)}, peaks={len(peaks_r)}"
842 |             )
843 |             continue
844 | 
845 |         psmpkl = f"{run}.psmpkl"
846 |         peakpkl = f"{run}.peakpkl"
847 |         psms_r.to_pickle(psmpkl)
848 |         peaks_r.to_pickle(peakpkl)
849 |         timestamped_echo(f"Info: Wrote {psmpkl} and {peakpkl}")
850 |         new_infiles.extend([psmpkl, peakpkl])
851 | 
852 |     if len(new_infiles) == 0:
853 |         # click may not be available in all contexts; raise a generic error here
854 |         raise RuntimeError("No non-empty runs detected after Sage conversion.")
    | 
    |     return new_infiles
855 | 
856 | 
857 | def convert_sage_streaming(
858 |     results_tsv: str,
859 |     fragments_tsv: str,
860 |     unimod_xml: Optional[str],
861 |     max_delta_unimod: float = 0.02,
862 |     mz_precision_digits: int = 6,
863 |     chunksize: int = 800_000,
864 |     tmpdir: Optional[str] = None,
865 | ) -> List[str]:
866 |     """
867 |     Memory-efficient streaming conversion that processes one run at a time.
868 | 869 | PSMs are held in memory (they're relatively small); the *fragments* file is 870 | streamed twice per run: 871 | - pass 1: compute per-PSM-group max intensity (denom) 872 | - pass 2: normalize and aggregate peaks 873 | 874 | This is logically equivalent to the non-streaming convert_sage(). 875 | """ 876 | import tempfile 877 | 878 | # --- 1) Load all PSMs once (as in non-streaming convert_sage) --- 879 | timestamped_echo("Info: [streaming] Reading Sage PSMs (full table in memory)") 880 | 881 | raw_res = _read_table(results_tsv) 882 | if "psm_id" not in raw_res.columns: 883 | raise ValueError( 884 | "results.sage.tsv must contain a 'psm_id' for joining with matched fragments." 885 | ) 886 | raw_res["psm_id"] = raw_res["psm_id"].astype(str).str.strip() 887 | 888 | parser = SagePSMParser( 889 | results_tsv, unimod_xml, max_delta_unimod, mz_precision_digits 890 | ) 891 | psms_parsed = parser.parse() 892 | # align on index, same as non-streaming convert_sage 893 | psms_all = raw_res[["psm_id"]].join(psms_parsed) 894 | 895 | if psms_all.empty: 896 | raise ValueError("No PSMs were parsed from the provided results.sage.tsv file.") 897 | 898 | # Runs defined exactly as in non-streaming convert_sage 899 | runs = sorted(psms_all["run_id"].dropna().unique().tolist()) 900 | timestamped_echo(f"Info: [streaming] Discovered {len(runs)} runs") 901 | 902 | outfiles: List[str] = [] 903 | tmpdir = tmpdir or tempfile.mkdtemp(prefix="easypqp_sage_") 904 | 905 | # --- 2) Process one run at a time --- 906 | for run in runs: 907 | timestamped_echo(f"Info: [streaming] Processing run {run}") 908 | 909 | psms_run = psms_all.loc[psms_all["run_id"] == run].copy() 910 | if psms_run.empty: 911 | timestamped_echo(f"Info: Skipping run {run}: no PSMs") 912 | continue 913 | 914 | # Limit to columns needed for joins / normalization 915 | norm_cols = psms_run[ 916 | [ 917 | "psm_id", 918 | "run_id", 919 | "scan_id", 920 | "modified_peptide", 921 | "precursor_charge", 922 | "precursor_mz", 923 | ] 924 | ].copy() 925 | norm_cols["psm_id"] = norm_cols["psm_id"].astype(str).str.strip() 926 | 927 | run_psm_ids = set(norm_cols["psm_id"]) 928 | 929 | # --- PASS 1: build denom_map from joined fragments --- 930 | denom_map: Dict[str, float] = {} 931 | 932 | first_pass_matches = 0 933 | for fr_chunk in pd.read_csv( 934 | fragments_tsv, sep="\t", dtype=str, chunksize=chunksize 935 | ): 936 | if "psm_id" not in fr_chunk.columns: 937 | continue 938 | 939 | fr_chunk["psm_id"] = fr_chunk["psm_id"].astype(str).str.strip() 940 | mask = fr_chunk["psm_id"].isin(run_psm_ids) 941 | if not mask.any(): 942 | continue 943 | 944 | sub = fr_chunk.loc[mask].copy() 945 | first_pass_matches += int(mask.sum()) 946 | 947 | # convert intensity to numeric 948 | if "fragment_intensity" not in sub.columns: 949 | continue 950 | sub["fragment_intensity"] = pd.to_numeric( 951 | sub["fragment_intensity"], errors="coerce" 952 | ).fillna(0.0) 953 | 954 | # join to get run_id / scan_id / modified_peptide / charge 955 | j = sub.merge(norm_cols, on="psm_id", how="inner") 956 | if j.empty: 957 | continue 958 | 959 | # group key: exactly matches non-streaming normalization groups 960 | j["group_key"] = ( 961 | j["run_id"].astype(str) 962 | + "||" 963 | + j["scan_id"].astype(str) 964 | + "||" 965 | + j["modified_peptide"].astype(str) 966 | + "||" 967 | + j["precursor_charge"].astype(str) 968 | ) 969 | 970 | gb = j.groupby("group_key")["fragment_intensity"].max() 971 | for k, mx in gb.items(): 972 | prev = denom_map.get(k) 973 | if prev is None or mx > prev: 974 
| denom_map[k] = float(mx) 975 | 976 | timestamped_echo( 977 | f"Info: [streaming] Run {run}: PASS1 matched {first_pass_matches} fragment rows; " 978 | f"{len(denom_map)} normalization groups" 979 | ) 980 | 981 | if not denom_map: 982 | timestamped_echo(f"Info: Skipping run {run}: no fragment peaks") 983 | continue 984 | 985 | # --- PASS 2: normalize intensities and build peaks_run --- 986 | peaks_parts: List[pd.DataFrame] = [] 987 | second_pass_matches = 0 988 | 989 | join_cols = [ 990 | "psm_id", 991 | "scan_id", 992 | "modified_peptide", 993 | "precursor_mz", 994 | "precursor_charge", 995 | "run_id", 996 | ] 997 | join_psms = psms_run[join_cols].copy() 998 | join_psms["psm_id"] = join_psms["psm_id"].astype(str).str.strip() 999 | 1000 | for fr_chunk in pd.read_csv( 1001 | fragments_tsv, sep="\t", dtype=str, chunksize=chunksize 1002 | ): 1003 | if "psm_id" not in fr_chunk.columns: 1004 | continue 1005 | 1006 | fr_chunk["psm_id"] = fr_chunk["psm_id"].astype(str).str.strip() 1007 | mask = fr_chunk["psm_id"].isin(run_psm_ids) 1008 | if not mask.any(): 1009 | continue 1010 | 1011 | sub = fr_chunk.loc[mask].copy() 1012 | second_pass_matches += int(mask.sum()) 1013 | 1014 | # numeric conversions 1015 | for c in [ 1016 | "fragment_ordinals", 1017 | "fragment_charge", 1018 | "fragment_mz_calculated", 1019 | "fragment_mz_experimental", 1020 | "fragment_intensity", 1021 | ]: 1022 | if c in sub.columns: 1023 | sub[c] = pd.to_numeric(sub[c], errors="coerce") 1024 | 1025 | # annotate fragment + product_mz 1026 | sub["fragment"] = sub.apply( 1027 | lambda r: SageFragmentParser._ann( 1028 | str(r.get("fragment_type", "")), 1029 | int(r.get("fragment_ordinals", 0) or 0), 1030 | int(r.get("fragment_charge", 0) or 0), 1031 | ), 1032 | axis=1, 1033 | ) 1034 | sub["product_mz"] = sub.get("fragment_mz_calculated") 1035 | 1036 | # join to PSMs for this run 1037 | j = sub.merge(join_psms, on="psm_id", how="inner") 1038 | if j.empty: 1039 | continue 1040 | 1041 | peaks = j[ 1042 | [ 1043 | "run_id", 1044 | "scan_id", 1045 | "modified_peptide", 1046 | "precursor_charge", 1047 | "precursor_mz", 1048 | "fragment", 1049 | "product_mz", 1050 | "fragment_intensity", 1051 | ] 1052 | ].copy() 1053 | peaks.rename(columns={"fragment_intensity": "intensity"}, inplace=True) 1054 | 1055 | # normalize using denom_map 1056 | peaks["intensity"] = pd.to_numeric( 1057 | peaks["intensity"], errors="coerce" 1058 | ).fillna(0.0) 1059 | 1060 | peaks["group_key"] = ( 1061 | peaks["run_id"].astype(str) 1062 | + "||" 1063 | + peaks["scan_id"].astype(str) 1064 | + "||" 1065 | + peaks["modified_peptide"].astype(str) 1066 | + "||" 1067 | + peaks["precursor_charge"].astype(str) 1068 | ) 1069 | peaks["denom"] = peaks["group_key"].map(lambda x: denom_map.get(x, np.nan)) 1070 | peaks["intensity"] = (peaks["intensity"] / peaks["denom"]) * 10000.0 1071 | peaks["intensity"] = peaks["intensity"].fillna(0.0) 1072 | peaks.drop(columns=["denom", "group_key"], inplace=True) 1073 | 1074 | # round like non-streaming 1075 | peaks["product_mz"] = peaks["product_mz"].round(mz_precision_digits) 1076 | peaks["precursor_mz"] = peaks["precursor_mz"].round(mz_precision_digits) 1077 | peaks["intensity"] = peaks["intensity"].round(mz_precision_digits) 1078 | 1079 | peaks_parts.append(peaks) 1080 | 1081 | timestamped_echo( 1082 | f"Info: [streaming] Run {run}: PASS2 matched {second_pass_matches} fragment rows" 1083 | ) 1084 | 1085 | if not peaks_parts: 1086 | timestamped_echo( 1087 | f"Info: Skipping run {run}: no fragment peaks after processing" 1088 | ) 1089 | 
continue
1090 | 
1091 |         peaks_run = pd.concat(peaks_parts, ignore_index=True)
1092 | 
1093 |         # Final aggregation: identical grouping keys to non-streaming parse()
1094 |         peaks_run = peaks_run.groupby(
1095 |             [
1096 |                 "run_id",
1097 |                 "scan_id",
1098 |                 "modified_peptide",
1099 |                 "precursor_charge",
1100 |                 "precursor_mz",
1101 |                 "fragment",
1102 |                 "product_mz",
1103 |             ],
1104 |             as_index=False,
1105 |         )["intensity"].max()
1106 | 
1107 |         # --- PSM export (same schema as non-streaming convert_sage) ---
1108 |         keep = [
1109 |             "run_id",
1110 |             "scan_id",
1111 |             "hit_rank",
1112 |             "massdiff",
1113 |             "precursor_charge",
1114 |             "retention_time",
1115 |             "ion_mobility",
1116 |             "peptide_sequence",
1117 |             "protein_id",
1118 |             "gene_id",
1119 |             "num_tot_proteins",
1120 |             "decoy",
1121 |             "modified_peptide",
1122 |             "group_id",
1123 |             "pep",
1124 |             "q_value",
1125 |             "peptide_q",
1126 |             "protein_q",
1127 |         ]
1128 |         psms_export = psms_run[keep].copy()
1129 | 
1130 |         # Optional: de-duplicate by group_id here; it is omitted so that the
1131 |         # output stays 1:1 with the non-streaming convert_sage().
1132 |         # psms_export = psms_export.drop_duplicates(subset=["group_id"]).reset_index(drop=True)
1133 | 
1134 |         if psms_export.empty or peaks_run.empty:
1135 |             timestamped_echo(
1136 |                 f"Info: Skipping run {run}: psms={len(psms_export)}, peaks={len(peaks_run)}"
1137 |             )
1138 |             continue
1139 | 
1140 |         psmpkl = f"{run}.psmpkl"
1141 |         peakpkl = f"{run}.peakpkl"
1142 |         psms_export.to_pickle(psmpkl)
1143 |         peaks_run.to_pickle(peakpkl)
1144 | 
1145 |         timestamped_echo(
1146 |             f"Info: [streaming] Wrote {psmpkl} (n_psms={len(psms_export)}) "
1147 |             f"and {peakpkl} (n_peaks={len(peaks_run)})"
1148 |         )
1149 |         outfiles.extend([psmpkl, peakpkl])
1150 | 
1151 |     if not outfiles:
1152 |         raise RuntimeError(
1153 |             "No non-empty runs detected after Sage streaming conversion."
1154 |         )
1155 | 
1156 |     return outfiles
1157 | 
--------------------------------------------------------------------------------
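A minimal end-to-end usage sketch for easypqp/sage.py (illustrative; it assumes the two Sage output files sit in the working directory, named as in tests/data):

    import pandas as pd
    from easypqp.sage import convert_sage

    # Convert Sage search output into per-run EasyPQP pickles. Small inputs take
    # the in-memory path; inputs above the size threshold stream automatically.
    outfiles = convert_sage(
        "results.sage.tsv",
        "matched_fragments.sage.tsv",
        unimod_xml=None,
    )

    # Each run yields a <run>.psmpkl / <run>.peakpkl pair; inspect the PSM table.
    psms = pd.read_pickle(outfiles[0])
    print(psms[["run_id", "modified_peptide", "precursor_charge", "q_value"]].head())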