├── .gitattributes ├── .github └── workflows │ ├── sagepy-connector-publish.yml │ └── sagepy-publish.yml ├── .gitignore ├── LICENSE ├── README.md ├── qfdrust ├── Cargo.toml └── src │ ├── dataset.rs │ ├── intensity.rs │ ├── lib.rs │ ├── main.rs │ ├── picked.rs │ ├── psm.rs │ └── utility.rs ├── sagepy-connector ├── .idea │ ├── .gitignore │ ├── modules.xml │ ├── sagepy-connector.iml │ └── vcs.xml ├── Cargo.lock ├── Cargo.toml └── src │ ├── lib.rs │ ├── py_database.rs │ ├── py_enzyme.rs │ ├── py_fasta.rs │ ├── py_fdr.rs │ ├── py_intensity.rs │ ├── py_ion_series.rs │ ├── py_lfq.rs │ ├── py_mass.rs │ ├── py_mobility_model.rs │ ├── py_modification.rs │ ├── py_peptide.rs │ ├── py_qfdr.rs │ ├── py_retention_alignment.rs │ ├── py_retention_model.rs │ ├── py_scoring.rs │ ├── py_spectrum.rs │ ├── py_tmt.rs │ ├── py_unimod.rs │ ├── py_utility.rs │ └── utilities.rs ├── sagepy ├── README.md ├── examples │ ├── lfq │ │ ├── LFQ.ipynb │ │ └── helpers.py │ ├── property-prediction │ │ └── property_prediction.ipynb │ ├── readme │ │ └── readme_example.ipynb │ ├── rescoring │ │ ├── bayesian_and_frequentist_rescoring.ipynb │ │ ├── data │ │ │ └── psm_data.csv │ │ └── rescoring.ipynb │ ├── sage-fdr │ │ └── FDRControl.ipynb │ └── scoring │ │ └── scoring.ipynb ├── pyproject.toml ├── sagepy │ ├── __init__.py │ ├── core │ │ ├── __init__.py │ │ ├── database.py │ │ ├── enzyme.py │ │ ├── fasta.py │ │ ├── fdr.py │ │ ├── ion_series.py │ │ ├── lfq.py │ │ ├── mass.py │ │ ├── ml │ │ │ ├── __init__.py │ │ │ ├── mobility_model.py │ │ │ ├── pep.py │ │ │ ├── retention_alignment.py │ │ │ └── retention_model.py │ │ ├── modification.py │ │ ├── peptide.py │ │ ├── scoring.py │ │ ├── spectrum.py │ │ ├── tmt.py │ │ └── unimod.py │ ├── qfdr │ │ ├── __init__.py │ │ └── tdc.py │ ├── rescore │ │ ├── __init__.py │ │ ├── lda.py │ │ ├── rescore.py │ │ ├── rt_predictor.py │ │ └── utility.py │ └── utility.py └── sagepy_logo.png └── unimod ├── Cargo.toml └── src ├── lib.rs └── unimod ├── modification_atomic_composition.rs 
├── title_to_unimod_id.rs ├── unimod_quantized.rs └── unimod_to_mass.rs /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-vendored 2 | -------------------------------------------------------------------------------- /.github/workflows/sagepy-connector-publish.yml: -------------------------------------------------------------------------------- 1 | name: Build and Publish Rust Binding 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | build-and-publish: 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | matrix: 15 | os: [ubuntu-latest, windows-latest, macos-13, macos-14] 16 | python-version: ['3.11', '3.12'] 17 | include: 18 | - os: ubuntu-latest 19 | python-version: '3.11' 20 | publish: true 21 | - os: windows-latest 22 | python-version: '3.11' 23 | publish: true 24 | - os: macos-13 25 | python-version: '3.11' 26 | publish: true 27 | - os: macos-14 28 | python-version: '3.11' 29 | publish: true 30 | - os: ubuntu-latest 31 | python-version: '3.12' 32 | publish: true 33 | - os: windows-latest 34 | python-version: '3.12' 35 | publish: true 36 | - os: macos-13 37 | python-version: '3.12' 38 | publish: true 39 | - os: macos-14 40 | python-version: '3.12' 41 | publish: true 42 | 43 | steps: 44 | - uses: actions/checkout@v3 45 | 46 | - name: Set up Python 47 | uses: actions/setup-python@v3 48 | with: 49 | python-version: ${{ matrix.python-version }} 50 | 51 | - name: Install Maturin 52 | run: | 53 | python -m pip install --upgrade pip 54 | pip install maturin 55 | 56 | - name: Set up Rust 57 | uses: actions-rs/toolchain@v1 58 | with: 59 | profile: minimal 60 | toolchain: stable 61 | override: true 62 | 63 | - name: Change to sagepy-connector directory 64 | run: cd sagepy-connector 65 | 66 | - name: Clean Cargo Artifacts 67 | run: | 68 | cd sagepy-connector 69 | cargo clean 70 | 71 | - name: Build with Maturin 72 | run: | 73 | cd sagepy-connector 74 
| maturin build --release 75 | 76 | - name: Publish 77 | if: matrix.publish 78 | env: 79 | MATURIN_PYPI_TOKEN: ${{ secrets.SAGEPY_CONNECTOR_PYPI_API_TOKEN }} 80 | run: | 81 | cd sagepy-connector 82 | maturin publish --no-sdist 83 | -------------------------------------------------------------------------------- /.github/workflows/sagepy-publish.yml: -------------------------------------------------------------------------------- 1 | name: Build and Publish Python Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v3 16 | 17 | - name: Set up Python 18 | uses: actions/setup-python@v3 19 | with: 20 | python-version: '3.11' 21 | 22 | - name: Install Poetry 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install poetry 26 | 27 | - name: Change to sagepy directory 28 | run: cd sagepy 29 | 30 | - name: Build package 31 | run: | 32 | cd sagepy 33 | poetry build 34 | 35 | - name: Publish package 36 | env: 37 | POETRY_PYPI_TOKEN_PYPI: ${{ secrets.SAGEPY_PYPI_API_TOKEN }} 38 | run: | 39 | cd sagepy 40 | poetry config http-basic.pypi __token__ $POETRY_PYPI_TOKEN_PYPI 41 | poetry publish 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject 
date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 David Teschner 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sagepy 2 | A python interface to the [SAGE](https://github.com/lazear/sage) search engine for mass spectrometry proteomics. 3 | 4 | This repository hosts the main codebase for the sagepy project, which is dedicated to creating a fully functional Python interface for the powerful Sage search engine, originally written in Rust. 
5 | 6 | The project is structured as follows: 7 | 8 | * `sagepy-connector`: This crate creates a Python interface using [PyO3](https://github.com/PyO3) to bind Rust to Python. 9 | * `sagepy`: A pure Python, fully Pythonic wrapper around the exposed Rust code. 10 | * `qfdrust`: This crate implements basic false discovery rate (FDR) estimation using TDC, following the methods proposed by [Crema](https://github.com/Noble-Lab/crema). 11 | * `unimod`: A work-in-progress crate that bridges Sage-style PSM annotation with the UNIMOD standard. 12 | 13 | ## Quickstart 14 | Get started quickly by installing sagepy via pip: 15 | ``` 16 | pip install sagepy 17 | ``` 18 | Check out the tutorial notebooks to dive into [DB generation, searching, and FDR estimation](https://github.com/theGreatHerrLebert/sagepy/blob/main/sagepy/examples/scoring/scoring.ipynb), [peptide property prediction](https://github.com/theGreatHerrLebert/sagepy/blob/main/sagepy/examples/property-prediction/property_prediction.ipynb), and [re-scoring of results](https://github.com/theGreatHerrLebert/sagepy/blob/main/sagepy/examples/rescoring/rescoring.ipynb). 19 | 20 | ## Get involved 21 | Do you have any questions or want to contribute? Feel free to reach out at any time! 22 | 23 | 24 | ## Cite 25 | 26 | If you find sagepy useful, please cite the original SAGE publication and consider citing our paper on sagepy: 27 | 28 | Lazear, M. “Sage: An Open-Source Tool for Fast Proteomics Searching and Quantification at Scale.” [Journal of Proteome Research (2023)](https://pubs.acs.org/doi/10.1021/acs.jproteome.3c00486). 29 | 30 | Teschner, D et al. “Rustims: An Open-Source Framework for Rapid Development and Processing of timsTOF Data-Dependent Acquisition Data.” [Journal of Proteome Research (2025)]( https://pubs.acs.org/doi/full/10.1021/acs.jproteome.4c00966). 31 | 32 | Thanks for supporting free and open-source software and science! 
33 | -------------------------------------------------------------------------------- /qfdrust/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "qfdrust" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | rand = "0.9.0-beta.0" 10 | rustms = { git = "https://github.com/theGreatHerrLebert/rustims.git" } 11 | # rustms = { path = "../../rustims/rustms" } 12 | # sage-core = {path = "../../sage/crates/sage" } 13 | sage-core = { git = "https://github.com/theGreatHerrLebert/sage.git" } 14 | itertools = "0.13.0" 15 | serde = { version = "1.0.217", features = ["derive"] } 16 | ndarray = "0.16.1" 17 | rayon = "1.10.0" 18 | bincode = "2.0.0-rc.3" 19 | zstd = "0.13.2" 20 | -------------------------------------------------------------------------------- /qfdrust/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod utility; 2 | pub mod dataset; 3 | pub mod intensity; 4 | pub mod psm; 5 | pub mod picked; -------------------------------------------------------------------------------- /qfdrust/src/main.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | println!("Hello, world!"); 3 | } 4 | -------------------------------------------------------------------------------- /qfdrust/src/picked.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use itertools::Itertools; 3 | use crate::psm::Psm; 4 | use rayon::prelude::*; 5 | 6 | pub fn protein_id_from_psm(psm: &Psm, decoy_tag: &str, generate_decoys: bool) -> String { 7 | if psm.sage_feature.label == -1 { 8 | psm.proteins 9 | .iter() 10 | .map(|s| { 11 | if generate_decoys { 12 | format!("{}{}", decoy_tag, s) 13 | } else { 14 | s.to_string() 15 | } 16 | }) 17 | .join(";") 18 
| } else { 19 | psm.proteins.iter().join(";") 20 | } 21 | } 22 | 23 | #[derive(Default)] 24 | struct Row { 25 | ix: String, 26 | decoy: bool, 27 | score: f32, 28 | q: f32, 29 | } 30 | 31 | #[derive(Clone, Debug)] 32 | struct Competition { 33 | forward_ix: Option, 34 | forward: f32, 35 | reverse_ix: Option, 36 | reverse: f32, 37 | } 38 | 39 | impl Default for Competition { 40 | fn default() -> Self { 41 | Competition { 42 | forward_ix: None, 43 | forward: f32::MIN, 44 | reverse_ix: None, 45 | reverse: f32::MIN, 46 | } 47 | } 48 | } 49 | 50 | fn assign_q_value( 51 | scores: HashMap, 52 | ) -> HashMap { 53 | 54 | let mut q_values: HashMap = HashMap::new(); 55 | 56 | let mut scores = scores 57 | .into_par_iter() 58 | .flat_map(|(_, comp)| { 59 | [ 60 | (comp.forward_ix.clone(), false, comp.forward), 61 | (comp.reverse_ix.clone(), true, comp.reverse), 62 | ] 63 | }) 64 | .filter_map(|(ix, decoy, score)| { 65 | ix.map(|ix| Row { 66 | ix, 67 | decoy, 68 | score, 69 | q: 1.0, 70 | }) 71 | }) 72 | .collect::>(); 73 | 74 | scores.par_sort_by(|a, b| b.score.total_cmp(&a.score)); 75 | 76 | let mut decoy_count: f64 = 1.0; 77 | let mut target_count: f64 = 0.0; 78 | let mut q_values_list: Vec = Vec::new(); 79 | 80 | // First pass: Calculate the raw q-values 81 | for row in scores.iter() { 82 | if row.decoy { 83 | decoy_count += 1.0; 84 | } else { 85 | target_count += 1.0; 86 | } 87 | 88 | // Avoid division by zero 89 | if target_count == 0.0 { 90 | q_values_list.push(1.0); 91 | continue; 92 | } 93 | 94 | let q = decoy_count / target_count; 95 | q_values_list.push(q); 96 | } 97 | 98 | // Second pass: Compute the cumulative minimum from the end 99 | let mut q_min = 1.0; 100 | for (i, row) in scores.iter_mut().enumerate().rev() { 101 | let q = q_values_list[i]; 102 | if q < q_min { 103 | q_min = q; 104 | } 105 | row.q = q_min as f32; 106 | q_values.insert(row.ix.clone(), row.q as f64); 107 | } 108 | 109 | q_values 110 | } 111 | 112 | pub fn spectrum_q_value(scores: &Vec, 
use_hyper_score: bool) -> Vec { 113 | 114 | // create a collection of PSMs sorted by score and keep the index 115 | let mut indexed_inner_collection: Vec<(usize, Psm)> = scores.iter() 116 | .enumerate() 117 | .map(|(index, item)| (index, item.clone())) 118 | .collect(); 119 | 120 | // sort either by hyperscore or PSM re_score 121 | match use_hyper_score { 122 | // Sort by hyperscore 123 | true => { 124 | indexed_inner_collection.par_sort_unstable_by(|(_, a), (_, b)| b.sage_feature.hyperscore.total_cmp(&a.sage_feature.hyperscore)); 125 | } 126 | // Sort by PSM re_score 127 | false => { 128 | indexed_inner_collection.par_sort_unstable_by(|(_, a), (_, b)| b.re_score.unwrap().total_cmp(&a.re_score.unwrap())); 129 | } 130 | } 131 | 132 | // Calculate the spectrum q-value 133 | let mut decoy = 1; 134 | let mut target = 0; 135 | 136 | for (_, psm) in indexed_inner_collection.iter_mut() { 137 | match psm.sage_feature.label == -1 { 138 | true => decoy += 1, 139 | false => target += 1, 140 | } 141 | psm.sage_feature.spectrum_q = decoy as f32 / target as f32; 142 | } 143 | 144 | // Reverse slice, and calculate the cumulative minimum 145 | let mut q_min = 1.0f32; 146 | for (_, psm) in indexed_inner_collection.iter_mut().rev() { 147 | q_min = q_min.min(psm.sage_feature.spectrum_q); 148 | psm.sage_feature.spectrum_q = q_min; 149 | } 150 | 151 | // sort the q_values by the original index 152 | let mut q_values = vec![0.0; scores.len()]; 153 | for (sorted_index, psm) in indexed_inner_collection.iter() { 154 | q_values[*sorted_index] = psm.sage_feature.spectrum_q; 155 | } 156 | 157 | q_values 158 | } 159 | 160 | pub fn picked_peptide(features: &mut Vec, use_hyper_score: bool) -> HashMap { 161 | 162 | let mut map: HashMap = HashMap::default(); 163 | 164 | for feat in features.iter() { 165 | 166 | let peptide_sequence_key = match feat.sage_feature.label == -1 { 167 | true => feat.sequence_decoy.clone().unwrap().sequence, 168 | false => feat.sequence.clone().unwrap().sequence, 169 | 
}; 170 | 171 | let entry = map.entry(peptide_sequence_key).or_default(); 172 | 173 | match feat.sage_feature.label == -1 { 174 | true => { 175 | match use_hyper_score { 176 | true => { 177 | entry.reverse_ix = Some(feat.sequence_decoy.clone().unwrap().sequence); 178 | entry.reverse = entry.reverse.max(feat.sage_feature.hyperscore as f32); 179 | } 180 | false => { 181 | entry.reverse_ix = Some(feat.sequence_decoy.clone().unwrap().sequence); 182 | entry.reverse = entry.reverse.max(feat.re_score.unwrap() as f32); 183 | } 184 | } 185 | } 186 | false => { 187 | match use_hyper_score { 188 | true => { 189 | entry.forward_ix = Some(feat.sequence.clone().unwrap().sequence); 190 | entry.forward = entry.forward.max(feat.sage_feature.hyperscore as f32); 191 | } 192 | false => { 193 | entry.forward_ix = Some(feat.sequence.clone().unwrap().sequence); 194 | entry.forward = entry.forward.max(feat.re_score.unwrap() as f32); 195 | } 196 | } 197 | } 198 | } 199 | } 200 | 201 | let q_value_map = assign_q_value(map); 202 | 203 | q_value_map 204 | } 205 | 206 | pub fn picked_protein(features: &mut Vec, use_hyper_score: bool) -> HashMap { 207 | 208 | let mut map: HashMap = HashMap::default(); 209 | 210 | for feat in features.iter() { 211 | 212 | let protein_key = protein_id_from_psm(feat, "rev_", true); 213 | 214 | let entry = map.entry(protein_key).or_default(); 215 | 216 | match feat.sage_feature.label == -1 { 217 | true => { 218 | match use_hyper_score { 219 | true => { 220 | entry.reverse_ix = Some(protein_id_from_psm(feat, "rev_", true)); 221 | entry.reverse = entry.reverse.max(feat.sage_feature.hyperscore as f32); 222 | } 223 | false => { 224 | entry.reverse_ix = Some(protein_id_from_psm(feat, "rev_", true)); 225 | entry.reverse = entry.reverse.max(feat.re_score.unwrap() as f32); 226 | } 227 | } 228 | } 229 | false => { 230 | match use_hyper_score { 231 | true => { 232 | entry.forward_ix = Some(protein_id_from_psm(feat, "rev_", true)); 233 | entry.forward = 
entry.forward.max(feat.sage_feature.hyperscore as f32); 234 | } 235 | false => { 236 | entry.forward_ix = Some(protein_id_from_psm(feat, "rev_", true)); 237 | entry.forward = entry.forward.max(feat.re_score.unwrap() as f32); 238 | } 239 | } 240 | } 241 | } 242 | } 243 | 244 | let q_value_map = assign_q_value(map); 245 | 246 | q_value_map 247 | } -------------------------------------------------------------------------------- /qfdrust/src/psm.rs: -------------------------------------------------------------------------------- 1 | use rustms::chemistry::formula::calculate_mz; 2 | use rustms::proteomics::peptide::{PeptideSequence}; 3 | use sage_core::scoring::{Feature, Fragments}; 4 | use serde::{Deserialize, Serialize}; 5 | use bincode; 6 | use std::io; 7 | use bincode::config::standard; 8 | use crate::intensity::{prosit_intensities_to_fragments, FragmentIntensityPrediction}; 9 | use zstd::stream::encode_all; // For compression 10 | use bincode::{Encode, Decode}; 11 | use zstd::decode_all; 12 | 13 | #[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)] 14 | pub struct Psm { 15 | pub spec_idx: String, 16 | pub peptide_idx: u32, 17 | pub proteins: Vec, 18 | pub sage_feature: Feature, 19 | pub sequence: Option, 20 | pub sequence_modified: Option, 21 | pub sequence_decoy: Option, 22 | pub sequence_decoy_modified: Option, 23 | pub mono_mz_calculated: Option, 24 | pub intensity_ms1: Option, 25 | pub intensity_ms2: Option, 26 | pub collision_energy: Option, 27 | pub collision_energy_calibrated: Option, 28 | pub retention_time_projected: Option, 29 | pub prosit_predicted_intensities: Option>, 30 | pub re_score: Option, 31 | pub fragment_intensity_prediction: Option, 32 | } 33 | 34 | impl Psm { 35 | pub fn new( 36 | spec_idx: String, 37 | peptide_idx: u32, 38 | proteins: Vec, 39 | sage_feature: Feature, 40 | sequence: Option, 41 | sequence_modified: Option, 42 | sequence_decoy: Option, 43 | sequence_decoy_modified: Option, 44 | intensity_ms1: Option, 45 | 
intensity_ms2: Option, 46 | collision_energy: Option, 47 | collision_energy_calibrated: Option, 48 | retention_time_projected: Option, 49 | prosit_predicted_intensities: Option>, 50 | re_score: Option, 51 | ) -> Self { 52 | 53 | let peptide_sequence = match &sequence { 54 | Some(seq) => Some(PeptideSequence::new(seq.clone(), Some(peptide_idx as i32))), 55 | None => None, 56 | }; 57 | 58 | let sequence_decoy = match &sequence_decoy { 59 | Some(seq) => Some(PeptideSequence::new(seq.clone(), Some(peptide_idx as i32))), 60 | None => None, 61 | }; 62 | 63 | let sequence_modified = match &sequence_modified { 64 | Some(seq) => Some(PeptideSequence::new(seq.clone(), Some(peptide_idx as i32))), 65 | None => None, 66 | }; 67 | 68 | let sequence_decoy_modified = match &sequence_decoy_modified { 69 | Some(seq) => Some(PeptideSequence::new(seq.clone(), Some(peptide_idx as i32))), 70 | None => None, 71 | }; 72 | 73 | let mono_mz_calculated = match (peptide_sequence.clone(), sage_feature.charge as i32) { 74 | (Some(seq), charge) => Some(calculate_mz(seq.mono_isotopic_mass(), charge) as f32), 75 | (_, _) => None, 76 | }; 77 | 78 | Psm { 79 | spec_idx, 80 | peptide_idx, 81 | proteins, 82 | sage_feature, 83 | sequence: peptide_sequence, 84 | sequence_modified, 85 | sequence_decoy, 86 | sequence_decoy_modified, 87 | mono_mz_calculated, 88 | intensity_ms1, 89 | intensity_ms2, 90 | collision_energy, 91 | collision_energy_calibrated, 92 | retention_time_projected, 93 | prosit_predicted_intensities, 94 | re_score, 95 | fragment_intensity_prediction: None, 96 | } 97 | } 98 | 99 | pub fn get_fragment_intensity_prediction(&self) -> FragmentIntensityPrediction { 100 | FragmentIntensityPrediction::new( 101 | self.sage_feature.fragments.clone().unwrap(), 102 | self.prosit_predicted_intensities.clone().unwrap(), 103 | ) 104 | } 105 | 106 | pub fn calculate_fragment_intensity_prediction(&mut self) { 107 | self.fragment_intensity_prediction = Some(self.get_fragment_intensity_prediction()); 108 | 
} 109 | 110 | pub fn prosit_intensity_to_fragments(&self) -> Option { 111 | match &self.prosit_predicted_intensities { 112 | Some(intensities) => Some(prosit_intensities_to_fragments(intensities.clone())), 113 | None => None, 114 | } 115 | } 116 | 117 | pub fn get_feature_vector(&self) -> Vec { 118 | 119 | let sage_feature = &self.sage_feature; 120 | let mut feature_vector = Vec::new(); 121 | feature_vector.push(sage_feature.expmass as f64); 122 | feature_vector.push(sage_feature.calcmass as f64); 123 | feature_vector.push(sage_feature.charge as f64); 124 | feature_vector.push(sage_feature.rt as f64); 125 | feature_vector.push(sage_feature.aligned_rt as f64); 126 | feature_vector.push(sage_feature.predicted_rt as f64); 127 | feature_vector.push(sage_feature.delta_rt_model as f64); 128 | feature_vector.push(sage_feature.ims as f64); 129 | feature_vector.push(sage_feature.predicted_ims as f64); 130 | feature_vector.push(sage_feature.delta_ims_model as f64); 131 | feature_vector.push(sage_feature.delta_mass as f64); 132 | feature_vector.push(sage_feature.isotope_error as f64); 133 | feature_vector.push(sage_feature.average_ppm as f64); 134 | feature_vector.push(sage_feature.hyperscore); 135 | feature_vector.push(self.re_score.unwrap_or(0.0)); 136 | feature_vector.push(sage_feature.delta_next); 137 | feature_vector.push(sage_feature.delta_best); 138 | feature_vector.push(sage_feature.matched_peaks as f64); 139 | feature_vector.push(sage_feature.longest_b as f64); 140 | feature_vector.push(sage_feature.longest_y as f64); 141 | feature_vector.push(sage_feature.longest_y_pct as f64); 142 | feature_vector.push(sage_feature.missed_cleavages as f64); 143 | feature_vector.push(sage_feature.matched_intensity_pct as f64); 144 | feature_vector.push(sage_feature.scored_candidates as f64); 145 | feature_vector.push(sage_feature.poisson); 146 | feature_vector.push(sage_feature.discriminant_score as f64); 147 | feature_vector.push(sage_feature.posterior_error as f64); 148 | 
feature_vector.push(sage_feature.ms2_intensity as f64); 149 | feature_vector.push(sage_feature.rank as f64); 150 | 151 | feature_vector.push(self.intensity_ms1.unwrap_or(0.0) as f64); 152 | feature_vector.push(self.intensity_ms2.unwrap_or(0.0) as f64); 153 | feature_vector.push(self.collision_energy.unwrap_or(0.0) as f64); 154 | feature_vector.push(self.collision_energy_calibrated.unwrap_or(0.0) as f64); 155 | feature_vector.push(self.retention_time_projected.unwrap_or(0.0) as f64); 156 | 157 | let intensity_features = self.fragment_intensity_prediction.clone(); 158 | 159 | match intensity_features { 160 | Some(intensity_features) => { 161 | let features = intensity_features.get_feature_vector(0.00001, false); 162 | for feature in features { 163 | feature_vector.push(feature as f64); 164 | } 165 | }, 166 | 167 | None => { 168 | for _ in 0..5 { 169 | feature_vector.push(0.0); 170 | } 171 | } 172 | } 173 | 174 | feature_vector.push(sage_feature.delta_rt_model as f64); 175 | feature_vector.push(sage_feature.delta_ims_model as f64); 176 | 177 | feature_vector.push(sage_feature.label as f64); 178 | 179 | feature_vector.push(sage_feature.spectrum_q as f64); 180 | feature_vector.push(sage_feature.peptide_q as f64); 181 | feature_vector.push(sage_feature.protein_q as f64); 182 | 183 | feature_vector 184 | } 185 | 186 | pub fn get_feature_names(&self) -> Vec<&str> { 187 | vec![ 188 | "expmass", 189 | "calcmass", 190 | "charge", 191 | "rt", 192 | "aligned_rt", 193 | "predicted_rt", 194 | "delta_rt_model", 195 | "ims", 196 | "predicted_ims", 197 | "delta_ims_model", 198 | "delta_mass", 199 | "isotope_error", 200 | "average_ppm", 201 | "hyperscore", 202 | "re_score", 203 | "delta_next", 204 | "delta_best", 205 | "matched_peaks", 206 | "longest_b", 207 | "longest_y", 208 | "longest_y_pct", 209 | "missed_cleavages", 210 | "matched_intensity_pct", 211 | "scored_candidates", 212 | "poisson", 213 | "discriminant_score", 214 | "posterior_error", 215 | "ms2_intensity", 216 | "rank", 
217 | "intensity_ms1", 218 | "intensity_ms2", 219 | "collision_energy", 220 | "collision_energy_calibrated", 221 | "retention_time_projected", 222 | "cosine_similarity", 223 | "spectral_angle_similarity", 224 | "pearson_correlation", 225 | "spearman_correlation", 226 | "spectral_entropy_similarity", 227 | "delta_rt", 228 | "delta_ims", 229 | "decoy", 230 | "spectrum_q", 231 | "peptide_q", 232 | "protein_q", 233 | ] 234 | } 235 | } 236 | 237 | pub fn compress_psms(psms: &[Psm]) -> io::Result> { 238 | // Step 1: Configure bincode 239 | let config = standard(); 240 | // Step 2: Serialize with the configured bincode 241 | let serialized = bincode::encode_to_vec(psms, config).expect("Serialization failed"); 242 | // Step 3: Compress the serialized data using ZSTD 243 | let compressed = encode_all(serialized.as_slice(), 0).expect("Compression failed"); 244 | // Step 4: Return compressed binary data 245 | Ok(compressed) 246 | } 247 | 248 | pub fn decompress_psms(compressed_data: &[u8]) -> io::Result> { 249 | // Step 1: Decompress the data using ZSTD 250 | let decompressed = decode_all(compressed_data).expect("Decompression failed"); 251 | // Step 2: Configure bincode 252 | let config = standard(); 253 | // Step 3: Deserialize the decompressed data back into Psm structs 254 | let psms: Vec = bincode::decode_from_slice(&decompressed, config) 255 | .expect("Deserialization failed") 256 | .0; 257 | // Step 4: Return the deserialized data 258 | Ok(psms) 259 | } -------------------------------------------------------------------------------- /qfdrust/src/utility.rs: -------------------------------------------------------------------------------- 1 | use rand::prelude::*; 2 | 3 | /// Use target-decoy competition to calculate q-values 4 | /// 5 | /// # Arguments 6 | /// 7 | /// * `scores` - A vector of floats representing the scores 8 | /// * `target` - A vector of booleans representing the target/decoy status 9 | /// * `desc` - A boolean representing the sort order of the scores 
10 | /// 11 | /// # Returns 12 | /// 13 | /// * `Vec` - A vector of floats representing the q-values 14 | /// 15 | pub fn target_decoy_competition(scores: &Vec, target: &Vec, desc: bool) -> Vec { 16 | assert_eq!(scores.len(), target.len(), "Scores and target must be the same length"); 17 | 18 | // Create a vector of indices and sort by scores 19 | let mut indices: Vec = (0..scores.len()).collect(); 20 | if desc { 21 | indices.sort_by(|&i, &j| scores[j].partial_cmp(&scores[i]).unwrap()); 22 | } else { 23 | indices.sort_by(|&i, &j| scores[i].partial_cmp(&scores[j]).unwrap()); 24 | } 25 | 26 | // Apply sorted indices to scores and targets 27 | let sorted_scores: Vec = indices.iter().map(|&i| scores[i]).collect(); 28 | let sorted_target: Vec = indices.iter().map(|&i| target[i]).collect(); 29 | 30 | // Calculate cumulative sums for targets and decoys 31 | let mut cum_targets = 0; 32 | let mut cum_decoys = 0; 33 | let mut cum_targets_vec = Vec::new(); 34 | let mut cum_decoys_vec = Vec::new(); 35 | 36 | for &t in &sorted_target { 37 | if t { 38 | cum_targets += 1; 39 | } else { 40 | cum_decoys += 1; 41 | } 42 | cum_targets_vec.push(cum_targets); 43 | cum_decoys_vec.push(cum_decoys); 44 | } 45 | 46 | // Calculate FDR 47 | let mut fdr: Vec = cum_decoys_vec.iter() 48 | .zip(cum_targets_vec.iter()) 49 | .map(|(&d, &t)| if t > 0 { (d as f64 + 1.0) / t as f64 } else { 1.0 }) 50 | .collect(); 51 | 52 | // Calculate q-values 53 | fdr.reverse(); 54 | let reversed_scores: Vec = sorted_scores.iter().rev().cloned().collect(); 55 | let mut q_vals = fdr_to_q_value(&reversed_scores, &fdr); 56 | q_vals.reverse(); 57 | 58 | // Reorder q_vals to original order 59 | let mut final_q_vals = vec![0.0; scores.len()]; 60 | for (original_pos, &sorted_pos) in indices.iter().enumerate() { 61 | final_q_vals[sorted_pos] = q_vals[original_pos]; 62 | } 63 | 64 | final_q_vals 65 | } 66 | 67 | /// Convert FDR to q-values 68 | /// 69 | /// # Arguments 70 | /// 71 | /// * `scores` - A vector of floats 
representing the scores 72 | /// * `fdr` - A vector of floats representing the FDR 73 | /// 74 | /// # Returns 75 | /// 76 | /// * `Vec` - A vector of floats representing the q-values 77 | /// 78 | fn fdr_to_q_value(scores: &[f64], fdr: &[f64]) -> Vec { 79 | assert_eq!(scores.len(), fdr.len(), "Scores and FDR must be of the same length"); 80 | 81 | let mut min_q = 1.0; 82 | let mut qvals = vec![1.0; fdr.len()]; 83 | let mut start = 0; 84 | 85 | for (idx, &score) in scores.iter().enumerate() { 86 | // check if the next score is the same 87 | if idx < scores.len() - 1 && scores[idx + 1] == score { 88 | continue; 89 | } 90 | 91 | // update the minimum q-value 92 | if fdr[start] < min_q { 93 | min_q = fdr[start]; 94 | } 95 | 96 | for qval in &mut qvals[start..=idx] { 97 | *qval = min_q; 98 | } 99 | start = idx + 1; 100 | } 101 | 102 | qvals 103 | } 104 | 105 | fn _estimate_pi0(pval_list: &Vec) -> f64 { 106 | let num_lambda = 100; 107 | let max_lambda = 0.5; 108 | let num_boot = 100; 109 | let max_size = 1000; 110 | let mut rng = rand::rng(); 111 | 112 | let n_pval = pval_list.len(); 113 | let mut pi0s_list = Vec::new(); 114 | let mut lambda_list = Vec::new(); 115 | 116 | for idx in 0..num_lambda { 117 | let cur_lambda = ((idx + 1) as f64 / num_lambda as f64) * max_lambda; 118 | let start = pval_list.binary_search_by(|p| p.partial_cmp(&cur_lambda).unwrap()).unwrap_or_else(|pos| pos); 119 | let w1 = n_pval - start; 120 | let pi0 = w1 as f64 / n_pval as f64 / (1.0 - cur_lambda); 121 | 122 | if pi0 > 0.0 { 123 | lambda_list.push(cur_lambda); 124 | pi0s_list.push(pi0); 125 | } 126 | } 127 | 128 | assert!(!pi0s_list.is_empty(), "Error in the input data: too good separation between target and decoy PSMs."); 129 | 130 | let min_pi0 = *pi0s_list.iter().min_by(|a, b| a.partial_cmp(b).unwrap()).unwrap(); 131 | let mut mse_list = vec![0.0; pi0s_list.len()]; 132 | 133 | let dist = rand::distr::Uniform::new(0, n_pval).unwrap(); 134 | 135 | for _ in 0..num_boot { 136 | let num_draw = 
std::cmp::min(n_pval, max_size); 137 | let mut p_boot_list: Vec = (0..num_draw).map(|_| pval_list[dist.sample(&mut rng)]).collect(); 138 | p_boot_list.sort_by(|a, b| a.partial_cmp(b).unwrap()); 139 | 140 | for (idx, &lambda) in lambda_list.iter().enumerate() { 141 | let start = p_boot_list.binary_search_by(|p| p.partial_cmp(&lambda).unwrap()).unwrap_or_else(|pos| pos); 142 | let w1 = num_draw - start; 143 | let pi0_boot = w1 as f64 / num_draw as f64 / (1.0 - lambda); 144 | mse_list[idx] += (pi0_boot - min_pi0).powi(2); 145 | } 146 | } 147 | 148 | let min_idx = mse_list.iter().enumerate().min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap()).map(|(idx, _)| idx).unwrap(); 149 | 150 | pi0s_list[min_idx].clamp(0.0, 1.0) 151 | } 152 | 153 | #[cfg(test)] 154 | mod tests { 155 | use super::*; 156 | 157 | fn setup_desc_scores() -> (Vec, Vec, Vec) { 158 | let scores = vec![10.0, 10.0, 9.0, 8.0, 7.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0]; 159 | let target = vec![true, true, true, true, false, true, true, false, true, false, true, false, false, false, false, false]; 160 | let q_vals = vec![0.25, 0.25, 0.25, 0.25, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.42857142857142855, 0.42857142857142855, 0.5714285714285714, 0.625, 0.625, 1.0, 1.0, 1.0, 1.0]; 161 | (scores, target, q_vals) 162 | } 163 | 164 | #[test] 165 | fn test_tdc_descending() { 166 | let (scores, target, true_q_vals) = setup_desc_scores(); 167 | let q_vals = target_decoy_competition(&scores, &target, true); 168 | assert_eq!(q_vals, true_q_vals, "Q-values for descending scores are incorrect."); 169 | } 170 | 171 | #[test] 172 | fn test_tdc_ascending() { 173 | let (mut scores, target, true_q_vals) = setup_desc_scores(); 174 | scores = scores.into_iter().map(|x| -x).collect(); // Negate scores for ascending test 175 | let q_vals = target_decoy_competition(&scores, &target, false); 176 | assert_eq!(q_vals, true_q_vals, "Q-values for ascending scores are incorrect."); 177 | } 178 | } 
-------------------------------------------------------------------------------- /sagepy-connector/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | # GitHub Copilot persisted chat sessions 10 | /copilot/chatSessions 11 | -------------------------------------------------------------------------------- /sagepy-connector/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /sagepy-connector/.idea/sagepy-connector.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /sagepy-connector/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /sagepy-connector/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sagepy-connector" 3 | version = "0.3.12" 4 | edition = "2021" 5 | 6 | [lib] 7 | name = "sagepy_connector" 8 | crate-type = ["cdylib"] 9 | 10 | [dependencies] 11 | # sage-core = {path = "../../sage/crates/sage" } 12 | sage-core = { git = "https://github.com/theGreatHerrLebert/sage.git" } 13 | qfdrust = { path = "../qfdrust" } 14 | unimod = { path = "../unimod" } 15 | pyo3 = { version = "0.23.4", features = ["extension-module"] } 16 | numpy = "0.23.0" 17 | rayon = "1.10.0" 18 | 19 | serde = { version = "1.0.217", features = ["derive"] } 20 | bincode = "1.3.3" 21 | log = "0.4.22" 22 | 
itertools = "0.14.0" 23 | serde_json = "1.0.138" 24 | -------------------------------------------------------------------------------- /sagepy-connector/src/lib.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3::wrap_pymodule; 3 | 4 | pub mod py_database; 5 | pub mod py_enzyme; 6 | pub mod py_fasta; 7 | pub mod py_ion_series; 8 | pub mod py_mass; 9 | pub mod py_modification; 10 | pub mod py_peptide; 11 | pub mod py_scoring; 12 | pub mod py_spectrum; 13 | pub mod py_fdr; 14 | pub mod py_lfq; 15 | pub mod py_tmt; 16 | pub mod py_qfdr; 17 | pub mod py_utility; 18 | pub mod py_unimod; 19 | pub mod utilities; 20 | pub mod py_intensity; 21 | pub mod py_retention_model; 22 | pub mod py_retention_alignment; 23 | pub mod py_mobility_model; 24 | #[pymodule] 25 | fn sagepy_connector(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 26 | 27 | m.add_wrapped(wrap_pymodule!(py_mass::py_mass))?; 28 | m.add_wrapped(wrap_pymodule!(py_enzyme::py_enzyme))?; 29 | m.add_wrapped(wrap_pymodule!(py_fasta::py_fasta))?; 30 | m.add_wrapped(wrap_pymodule!(py_peptide::py_peptide))?; 31 | m.add_wrapped(wrap_pymodule!(py_ion_series::py_ion_series))?; 32 | m.add_wrapped(wrap_pymodule!(py_modification::py_modification))?; 33 | m.add_wrapped(wrap_pymodule!(py_database::py_database))?; 34 | m.add_wrapped(wrap_pymodule!(py_spectrum::py_spectrum))?; 35 | m.add_wrapped(wrap_pymodule!(py_scoring::py_scoring))?; 36 | m.add_wrapped(wrap_pymodule!(py_fdr::py_fdr))?; 37 | m.add_wrapped(wrap_pymodule!(py_lfq::py_lfq))?; 38 | m.add_wrapped(wrap_pymodule!(py_tmt::py_tmt))?; 39 | m.add_wrapped(wrap_pymodule!(py_qfdr::py_qfdr))?; 40 | m.add_wrapped(wrap_pymodule!(py_unimod::py_unimod))?; 41 | m.add_wrapped(wrap_pymodule!(py_utility::py_utility))?; 42 | m.add_wrapped(wrap_pymodule!(py_intensity::py_intensity))?; 43 | m.add_wrapped(wrap_pymodule!(py_retention_alignment::py_retention_alignment))?; 44 | 
m.add_wrapped(wrap_pymodule!(py_retention_model::py_retention_model))?; 45 | m.add_wrapped(wrap_pymodule!(py_mobility_model::py_mobility_model))?; 46 | 47 | Ok(()) 48 | } 49 | -------------------------------------------------------------------------------- /sagepy-connector/src/py_enzyme.rs: -------------------------------------------------------------------------------- 1 | use numpy::{IntoPyArray, PyArray2, PyArrayMethods}; 2 | use pyo3::prelude::*; 3 | use std::sync::Arc; 4 | 5 | use std::hash::Hash; 6 | 7 | use pyo3::exceptions::PyValueError; 8 | use pyo3::types::PyList; 9 | use sage_core::enzyme::{Digest, Enzyme, EnzymeParameters, Position}; 10 | use std::collections::hash_map::DefaultHasher; 11 | use std::hash::Hasher; 12 | 13 | #[pyclass] 14 | #[derive(Clone)] 15 | pub struct PyPosition { 16 | pub inner: Position, 17 | } 18 | 19 | #[pymethods] 20 | impl PyPosition { 21 | #[staticmethod] 22 | fn nterm() -> Self { 23 | PyPosition { 24 | inner: Position::Nterm, 25 | } 26 | } 27 | 28 | #[staticmethod] 29 | fn cterm() -> Self { 30 | PyPosition { 31 | inner: Position::Cterm, 32 | } 33 | } 34 | 35 | #[staticmethod] 36 | fn full() -> Self { 37 | PyPosition { 38 | inner: Position::Full, 39 | } 40 | } 41 | 42 | #[staticmethod] 43 | fn internal() -> Self { 44 | PyPosition { 45 | inner: Position::Internal, 46 | } 47 | } 48 | 49 | #[staticmethod] 50 | fn from_string(position_string: &str) -> PyResult { 51 | match position_string { 52 | "n_term" => Ok(PyPosition::nterm()), 53 | "c_term" => Ok(PyPosition::cterm()), 54 | "full" => Ok(PyPosition::full()), 55 | "internal" => Ok(PyPosition::internal()), 56 | _ => Err(PyValueError::new_err("Invalid position string")), 57 | } 58 | } 59 | 60 | #[getter] 61 | fn to_string(&self) -> String { 62 | format!("{:?}", self.inner) 63 | } 64 | } 65 | 66 | #[pyclass] 67 | #[derive(Clone)] 68 | pub struct PyDigest { 69 | pub inner: Digest, 70 | } 71 | 72 | #[pymethods] 73 | impl PyDigest { 74 | #[new] 75 | fn new( 76 | decoy: bool, 77 | 
sequence: &str, 78 | protein: &str, 79 | missed_cleavages: u8, 80 | position: PyPosition, 81 | semi_enzymatic: bool, 82 | ) -> Self { 83 | PyDigest { 84 | inner: Digest { 85 | decoy, 86 | sequence: sequence.to_string(), 87 | protein: Arc::from(protein.to_string()), 88 | missed_cleavages, 89 | position: position.inner, 90 | semi_enzymatic, 91 | }, 92 | } 93 | } 94 | 95 | #[getter] 96 | fn decoy(&self) -> bool { 97 | self.inner.decoy 98 | } 99 | 100 | #[getter] 101 | fn sequence(&self) -> &str { 102 | &self.inner.sequence 103 | } 104 | 105 | #[getter] 106 | fn protein(&self) -> &str { 107 | &self.inner.protein 108 | } 109 | 110 | #[getter] 111 | fn missed_cleavages(&self) -> u8 { 112 | self.inner.missed_cleavages 113 | } 114 | 115 | #[getter] 116 | fn position(&self) -> String { 117 | format!("{:?}", self.inner.position) 118 | } 119 | 120 | #[getter] 121 | fn semi_enzymatic(&self) -> bool { 122 | self.inner.semi_enzymatic 123 | } 124 | 125 | fn reverse(&self) -> PyResult { 126 | Ok(PyDigest { 127 | inner: self.inner.reverse(), 128 | }) 129 | } 130 | 131 | fn __eq__(&self, other: &PyDigest) -> bool { 132 | self.inner == other.inner 133 | } 134 | 135 | fn __hash__(&self) -> isize { 136 | let mut hasher = DefaultHasher::new(); 137 | self.inner.hash(&mut hasher); 138 | hasher.finish() as isize 139 | } 140 | } 141 | 142 | #[pyclass] 143 | #[derive(Clone)] 144 | pub struct PyEnzyme { 145 | pub inner: Enzyme, 146 | } 147 | 148 | #[pymethods] 149 | impl PyEnzyme { 150 | #[new] 151 | #[pyo3(signature = (cleave, c_terminal, semi_enzymatic, skip_suffix=None))] 152 | fn new( 153 | cleave: &str, 154 | c_terminal: bool, 155 | semi_enzymatic: bool, 156 | skip_suffix: Option, 157 | ) -> PyResult { 158 | match Enzyme::new(cleave, skip_suffix, c_terminal, semi_enzymatic) { 159 | Some(enzyme) => Ok(PyEnzyme { inner: enzyme }), 160 | None => Err(PyValueError::new_err("Failed to create Enzyme")), 161 | } 162 | } 163 | 164 | #[getter] 165 | fn c_terminal(&self) -> bool { 166 | 
self.inner.c_terminal 167 | } 168 | 169 | #[getter] 170 | fn skip_suffix(&self) -> Option { 171 | self.inner.skip_suffix 172 | } 173 | 174 | #[getter] 175 | fn semi_enzymatic(&self) -> bool { 176 | self.inner.semi_enzymatic 177 | } 178 | 179 | fn cleavage_sites(&self, py: Python, sequence: &str) -> PyResult>> { 180 | // Call the original cleavage_sites method 181 | let sites = self.inner.cleavage_sites(sequence); 182 | 183 | // Convert the Vec> to Vec while flattening 184 | let sites_flat: Vec = sites 185 | .into_iter() 186 | .flat_map(|s| vec![s.site.start, s.site.end]) 187 | .collect(); 188 | 189 | let rows = sites_flat.len() / 2; 190 | let np_array: Py> = sites_flat 191 | .into_pyarray(py) 192 | .reshape([rows, 2])? 193 | .unbind(); 194 | 195 | Ok(np_array) 196 | } 197 | } 198 | 199 | #[pyclass] 200 | pub struct PyEnzymeParameters { 201 | pub inner: EnzymeParameters, 202 | } 203 | 204 | #[pymethods] 205 | impl PyEnzymeParameters { 206 | #[new] 207 | #[pyo3(signature = (missed_cleavages, min_len, max_len, enzyme=None))] 208 | fn new(missed_cleavages: u8, min_len: usize, max_len: usize, enzyme: Option) -> Self { 209 | PyEnzymeParameters { 210 | inner: EnzymeParameters { 211 | missed_cleavages, 212 | min_len, 213 | max_len, 214 | enyzme: enzyme.map(|e| e.inner), 215 | }, 216 | } 217 | } 218 | 219 | #[getter] 220 | fn missed_cleavages(&self) -> u8 { 221 | self.inner.missed_cleavages 222 | } 223 | 224 | #[getter] 225 | fn min_len(&self) -> usize { 226 | self.inner.min_len 227 | } 228 | 229 | #[getter] 230 | fn max_len(&self) -> usize { 231 | self.inner.max_len 232 | } 233 | 234 | #[getter] 235 | fn enzyme(&self, _py: Python) -> PyResult> { 236 | match &self.inner.enyzme { 237 | Some(enzyme) => Ok(Some(PyEnzyme { 238 | inner: enzyme.clone(), 239 | })), 240 | None => Ok(None), 241 | } 242 | } 243 | fn cleavage_sites(&self, py: Python, sequence: &str) -> PyResult>> { 244 | // Call the original cleavage_sites method 245 | let sites = self.inner.cleavage_sites(sequence); 
246 | 247 | // Convert the Vec> to Vec while flattening 248 | let sites_flat: Vec = sites 249 | .into_iter() 250 | .flat_map(|s| vec![s.site.start, s.site.end]) 251 | .collect(); 252 | 253 | let rows = sites_flat.len() / 2; 254 | let np_array: Py> = 255 | sites_flat.into_pyarray(py).reshape([rows, 2])?.unbind(); 256 | 257 | Ok(np_array) 258 | } 259 | 260 | pub fn digest(&self, py: Python, sequence: &str, protein: &str) -> PyResult> { 261 | let digests = self.inner.digest(sequence, Arc::from(protein.to_string())); 262 | 263 | // Create an empty Python list 264 | let list: Py = PyList::empty(py).into(); 265 | 266 | // Iterate over the digests and append them to the list 267 | for digest in digests { 268 | let py_digest = Py::new(py, PyDigest { inner: digest })?; 269 | list.bind(py).append(py_digest)?; 270 | } 271 | 272 | Ok(list.into()) 273 | } 274 | } 275 | 276 | #[pymodule] 277 | pub fn py_enzyme(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 278 | m.add_class::()?; 279 | m.add_class::()?; 280 | m.add_class::()?; 281 | m.add_class::()?; 282 | Ok(()) 283 | } 284 | -------------------------------------------------------------------------------- /sagepy-connector/src/py_fasta.rs: -------------------------------------------------------------------------------- 1 | use pyo3::IntoPyObjectExt; 2 | use sage_core::fasta::Fasta; 3 | 4 | use crate::py_enzyme::{PyDigest, PyEnzymeParameters}; 5 | use pyo3::prelude::*; 6 | 7 | #[pyclass] 8 | #[derive(Clone)] 9 | pub struct PyFasta { 10 | pub inner: Fasta, 11 | } 12 | 13 | #[pymethods] 14 | impl PyFasta { 15 | #[staticmethod] 16 | fn parse(contents: String, decoy_tag: String, generate_decoys: bool) -> PyResult { 17 | Ok(PyFasta { 18 | inner: Fasta::parse(contents, decoy_tag, generate_decoys), 19 | }) 20 | } 21 | 22 | fn digest(&self, py: Python, enzyme_params: &PyEnzymeParameters) -> PyResult { 23 | let digests = self.inner.digest(&enzyme_params.inner); 24 | let py_digests: Vec = 25 | digests.into_iter().map(|d| PyDigest 
{ inner: d }).collect(); 26 | Ok(py_digests.into_pyobject_or_pyerr(py)?.unbind()) 27 | } 28 | } 29 | 30 | #[pymodule] 31 | pub fn py_fasta(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 32 | m.add_class::()?; 33 | Ok(()) 34 | } 35 | -------------------------------------------------------------------------------- /sagepy-connector/src/py_fdr.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3::types::PyList; 3 | use qfdrust::psm::Psm; 4 | use sage_core::fdr::{Competition, picked_peptide, picked_protein}; 5 | 6 | use sage_core::database::{PeptideIx}; 7 | use sage_core::scoring::Feature; 8 | use crate::py_database::{PyIndexedDatabase, PyPeptideIx}; 9 | use crate::py_scoring::{PyFeature, PyPsm}; 10 | use rayon::prelude::*; 11 | 12 | #[pyclass] 13 | // TODO: Check if it makes sense to tie this to PeptideIx 14 | struct PyCompetitionPeptideIx { 15 | inner: Competition, 16 | } 17 | 18 | #[pymethods] 19 | impl PyCompetitionPeptideIx { 20 | #[new] 21 | #[pyo3(signature = (forward, reverse, forward_ix=None, reverse_ix=None))] 22 | fn new(forward: f32, reverse: f32, forward_ix: Option, reverse_ix: Option) -> Self { 23 | PyCompetitionPeptideIx { 24 | inner: Competition { 25 | forward, 26 | foward_ix: forward_ix.map(|ix| ix.inner), 27 | reverse, 28 | reverse_ix: reverse_ix.map(|ix| ix.inner), 29 | }, 30 | } 31 | } 32 | #[getter] 33 | fn forward(&self) -> f32 { 34 | self.inner.forward 35 | } 36 | 37 | #[getter] 38 | fn reverse(&self) -> f32 { 39 | self.inner.reverse 40 | } 41 | 42 | #[getter] 43 | fn forward_ix(&self) -> Option { 44 | self.inner.foward_ix.map(|ix| PyPeptideIx { inner: ix }) 45 | } 46 | 47 | #[getter] 48 | fn reverse_ix(&self) -> Option { 49 | self.inner.reverse_ix.map(|ix| PyPeptideIx { inner: ix }) 50 | } 51 | } 52 | 53 | #[pyfunction] 54 | pub fn py_sage_fdr(_py: Python, feature_collection: &Bound<'_, PyList>, indexed_database: &PyIndexedDatabase, use_hyper_score: bool) -> 
PyResult<()> { 55 | 56 | // Extract the inner collection of Feature objects along with their original indices 57 | let mut indexed_inner_collection: Vec<(usize, Feature)> = feature_collection.iter() 58 | .enumerate() 59 | .map(|(index, item)| { 60 | // Extract each item as a Bound 61 | let feature: Bound<'_, PyFeature> = item.extract().expect("Failed to extract PyFeature"); 62 | // Clone the inner Feature and keep the original index 63 | (index, feature.borrow().inner.clone()) 64 | }) 65 | .collect(); 66 | 67 | // Set discriminant score to hyper score 68 | indexed_inner_collection.par_iter_mut().for_each(|(_, feat)| { 69 | match use_hyper_score { 70 | false => { 71 | feat.discriminant_score = (-feat.poisson as f32).ln_1p() + feat.longest_y_pct / 3.0 72 | } 73 | true => { 74 | feat.discriminant_score = feat.hyperscore as f32; 75 | } 76 | } 77 | }); 78 | 79 | // Sort indexed_inner_collection by discriminant_score 80 | indexed_inner_collection.par_sort_unstable_by(|(_, a), (_, b)| b.discriminant_score.total_cmp(&a.discriminant_score)); 81 | 82 | // Extract the sorted indices 83 | let sorted_indices: Vec = indexed_inner_collection.iter().map(|(index, _)| *index).collect(); 84 | 85 | // Perform additional operations on the sorted inner_collection 86 | let mut inner_collection: Vec = indexed_inner_collection.into_iter().map(|(_, feat)| feat).collect(); 87 | let _ = sage_core::ml::qvalue::spectrum_q_value(&mut inner_collection); 88 | let _ = picked_peptide(&indexed_database.inner, &mut inner_collection); 89 | let _ = picked_protein(&indexed_database.inner, &mut inner_collection); 90 | 91 | // Update the original feature_collection according to the sorted order 92 | for (sorted_index, sorted_feature) in sorted_indices.iter().zip(inner_collection.iter()) { 93 | let feature: Bound<'_, PyFeature> = feature_collection.get_item(*sorted_index).expect("Failed to get PyFeature").extract()?; 94 | let mut feature_borrow = feature.borrow_mut(); 95 | // Update the feature's fields 96 
| feature_borrow.inner.discriminant_score = sorted_feature.discriminant_score; 97 | feature_borrow.inner.spectrum_q = sorted_feature.spectrum_q; 98 | feature_borrow.inner.peptide_q = sorted_feature.peptide_q; 99 | feature_borrow.inner.protein_q = sorted_feature.protein_q; 100 | } 101 | 102 | Ok(()) 103 | } 104 | 105 | #[pyfunction] 106 | pub fn py_sage_fdr_psm(_py: Python, psm_collection: &Bound<'_, PyList>, indexed_database: &PyIndexedDatabase, use_hyper_score: bool) -> PyResult<()> { 107 | 108 | // Extract the inner collection of Feature objects along with their original indices 109 | let mut indexed_inner_collection: Vec<(usize, Psm)> = psm_collection.iter() 110 | .enumerate() 111 | .map(|(index, item)| { 112 | // Extract each item as a Bound 113 | let feature: Bound<'_, PyPsm> = item.extract().expect("Failed to extract PyFeature"); 114 | // Clone the inner Feature and keep the original index 115 | (index, feature.borrow().inner.clone()) 116 | }) 117 | .collect(); 118 | 119 | // Set discriminant score to hyper score 120 | indexed_inner_collection.par_iter_mut().for_each(|(_, feat)| { 121 | match use_hyper_score { 122 | false => { 123 | feat.sage_feature.discriminant_score = feat.re_score.unwrap_or(0.0) as f32; 124 | } 125 | true => { 126 | feat.sage_feature.discriminant_score = feat.sage_feature.hyperscore as f32; 127 | } 128 | } 129 | }); 130 | 131 | // Sort indexed_inner_collection by discriminant_score 132 | indexed_inner_collection.par_sort_unstable_by(|(_, a), (_, b)| b.sage_feature.discriminant_score.total_cmp(&a.sage_feature.discriminant_score)); 133 | 134 | // Extract the sorted indices 135 | let sorted_indices: Vec = indexed_inner_collection.iter().map(|(index, _)| *index).collect(); 136 | 137 | // Perform additional operations on the sorted inner_collection 138 | let mut inner_collection: Vec = indexed_inner_collection.into_iter().map(|(_, feat)| feat.sage_feature).collect(); 139 | let _ = sage_core::ml::qvalue::spectrum_q_value(&mut inner_collection); 
140 | let _ = picked_peptide(&indexed_database.inner, &mut inner_collection); 141 | let _ = picked_protein(&indexed_database.inner, &mut inner_collection); 142 | 143 | // Update the original psm_collection according to the sorted order 144 | for (sorted_index, sorted_feature) in sorted_indices.iter().zip(inner_collection.iter()) { 145 | let feature: Bound<'_, PyPsm> = psm_collection.get_item(*sorted_index).expect("Failed to get PyFeature").extract()?; 146 | let mut feature_borrow = feature.borrow_mut(); 147 | feature_borrow.inner.sage_feature.discriminant_score = sorted_feature.discriminant_score; 148 | feature_borrow.inner.sage_feature.spectrum_q = sorted_feature.spectrum_q; 149 | feature_borrow.inner.sage_feature.peptide_q = sorted_feature.peptide_q; 150 | feature_borrow.inner.sage_feature.protein_q = sorted_feature.protein_q; 151 | } 152 | 153 | Ok(()) 154 | } 155 | 156 | #[pymodule] 157 | pub fn py_fdr(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 158 | m.add_class::()?; 159 | m.add_function(wrap_pyfunction!(py_sage_fdr, m)?)?; 160 | m.add_function(wrap_pyfunction!(py_sage_fdr_psm, m)?)?; 161 | Ok(()) 162 | } -------------------------------------------------------------------------------- /sagepy-connector/src/py_intensity.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | use pyo3::prelude::*; 3 | use qfdrust::intensity::FragmentIntensityPrediction; 4 | use crate::py_scoring::PyFragments; 5 | 6 | #[pyclass] 7 | #[derive(Clone, Debug)] 8 | pub struct PyFragmentIntensityPrediction { 9 | pub inner: FragmentIntensityPrediction, 10 | } 11 | 12 | #[pymethods] 13 | impl PyFragmentIntensityPrediction { 14 | #[new] 15 | fn new( 16 | fragments: PyFragments, 17 | prosit_intensity_predicted: Vec, 18 | ) -> Self { 19 | PyFragmentIntensityPrediction { 20 | inner: FragmentIntensityPrediction { 21 | fragments: fragments.inner.clone(), 22 | prosit_intensity_predicted, 23 | }, 24 | } 25 | } 26 | 
27 | #[getter] 28 | fn prosit_intensity_predicted(&self) -> Vec { 29 | self.inner.prosit_intensity_predicted.clone() 30 | } 31 | 32 | #[setter] 33 | fn set_prosit_intensity_predicted(&mut self, prosit_intensity_predicted: Vec) { 34 | self.inner.prosit_intensity_predicted = prosit_intensity_predicted; 35 | } 36 | 37 | fn cosine_similarity(&self, epsilon: f32, reduce_matched: bool) -> f32 { 38 | self.inner.cosine_similarity(epsilon, reduce_matched).unwrap() 39 | } 40 | 41 | fn spectral_angle_similarity(&self, epsilon: f32, reduce_matched: bool) -> f32 { 42 | self.inner.spectral_angle_similarity(epsilon, reduce_matched) 43 | } 44 | 45 | fn pearson_correlation(&self, epsilon: f32, reduce_matched: bool) -> f32 { 46 | self.inner.pearson_correlation(epsilon, reduce_matched) 47 | } 48 | 49 | fn spearman_correlation(&self, epsilon: f32, reduce_matched: bool) -> f32 { 50 | self.inner.spearman_correlation(epsilon, reduce_matched) 51 | } 52 | 53 | fn spectral_entropy_similarity(&self, epsilon: f32, reduce_matched: bool) -> f32 { 54 | self.inner.spectral_entropy_similarity(epsilon, reduce_matched) 55 | } 56 | 57 | fn observed_intensity_map(&self) -> BTreeMap<(u32, i32, i32), f32> { 58 | self.inner.observed_intensity_to_fragments_map() 59 | } 60 | 61 | fn predicted_intensity_map(&self) -> BTreeMap<(u32, i32, i32), f32> { 62 | self.inner.prosit_intensity_to_fragments_map() 63 | } 64 | 65 | fn prosit_intensity_to_fragments(&self) -> PyFragments { 66 | PyFragments { 67 | inner: self.inner.prosit_intensity_to_fragments(), 68 | } 69 | } 70 | } 71 | 72 | #[pymodule] 73 | pub fn py_intensity(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 74 | m.add_class::()?; 75 | Ok(()) 76 | } -------------------------------------------------------------------------------- /sagepy-connector/src/py_ion_series.rs: -------------------------------------------------------------------------------- 1 | use crate::py_peptide::PyPeptide; 2 | use pyo3::prelude::*; 3 | use sage_core::ion_series::{Ion, 
Kind}; 4 | use sage_core::mass::monoisotopic; 5 | 6 | #[pyclass] 7 | #[derive(Clone)] 8 | pub struct PyKind { 9 | pub inner: Kind, 10 | } 11 | 12 | #[pymethods] 13 | impl PyKind { 14 | #[new] 15 | fn new(kind: String) -> PyResult { 16 | match kind.to_lowercase().as_str() { 17 | "a" => Ok(PyKind { inner: Kind::A }), 18 | "b" => Ok(PyKind { inner: Kind::B }), 19 | "c" => Ok(PyKind { inner: Kind::C }), 20 | "x" => Ok(PyKind { inner: Kind::X }), 21 | "y" => Ok(PyKind { inner: Kind::Y }), 22 | "z" => Ok(PyKind { inner: Kind::Z }), 23 | _ => Err(PyErr::new::(format!( 24 | "Invalid Kind value: {}", 25 | kind 26 | ))), 27 | } 28 | } 29 | pub fn kind_as_string(&self) -> String { 30 | format!("{:?}", self.inner) 31 | } 32 | } 33 | 34 | #[pyclass] 35 | pub struct PyIon { 36 | pub inner: Ion, 37 | } 38 | 39 | #[pymethods] 40 | impl PyIon { 41 | #[new] 42 | fn new(kind: PyKind, monoisotopic_mass: f32) -> PyResult { 43 | let inner_ion = Ion { 44 | kind: kind.inner, // Conversion from PyKind to Rust Kind 45 | monoisotopic_mass, 46 | }; 47 | Ok(PyIon { inner: inner_ion }) 48 | } 49 | 50 | // Getter methods for accessing Ion properties 51 | #[getter] 52 | fn kind(&self) -> PyResult { 53 | Ok(PyKind { 54 | inner: self.inner.kind, 55 | }) 56 | } 57 | 58 | #[getter] 59 | fn monoisotopic_mass(&self) -> PyResult { 60 | Ok(self.inner.monoisotopic_mass) 61 | } 62 | } 63 | 64 | #[pyclass] 65 | pub struct PyIonSeries { 66 | pub kind: PyKind, 67 | pub cumulative_mass: f32, 68 | pub peptide: PyPeptide, 69 | } 70 | 71 | #[pymethods] 72 | impl PyIonSeries { 73 | #[new] 74 | pub fn new(_py: Python, peptide: PyPeptide, kind: PyKind) -> PyResult { 75 | const C: f32 = 12.0; 76 | const O: f32 = 15.994914; 77 | const H: f32 = 1.007825; 78 | const PRO: f32 = 1.0072764; 79 | const N: f32 = 14.003074; 80 | const NH3: f32 = N + H * 3.0 + PRO; 81 | 82 | let cumulative_mass = match kind.inner { 83 | Kind::A => peptide.inner.nterm.unwrap_or_default() - (C + O), 84 | Kind::B => 
peptide.inner.nterm.unwrap_or_default(), 85 | Kind::C => peptide.inner.nterm.unwrap_or_default() + NH3, 86 | 87 | Kind::X => { 88 | peptide.inner.monoisotopic - peptide.inner.nterm.unwrap_or_default() 89 | + (C + O - NH3 + N + H) 90 | } 91 | Kind::Y => peptide.inner.monoisotopic - peptide.inner.nterm.unwrap_or_default(), 92 | Kind::Z => peptide.inner.monoisotopic - peptide.inner.nterm.unwrap_or_default() - NH3, 93 | }; 94 | 95 | Ok(Self { 96 | kind, 97 | cumulative_mass, 98 | peptide, 99 | }) 100 | } 101 | 102 | #[getter] 103 | fn kind(&self) -> PyResult { 104 | Ok(self.kind.clone()) 105 | } 106 | 107 | #[getter] 108 | fn cumulative_mass(&self) -> PyResult { 109 | Ok(self.cumulative_mass) 110 | } 111 | 112 | #[getter] 113 | fn peptide(&self) -> PyResult { 114 | Ok(self.peptide.clone()) 115 | } 116 | 117 | pub fn get_ion_series(&self) -> PyResult> { 118 | let mut ions = Vec::new(); 119 | let mut cm = self.cumulative_mass; 120 | 121 | for idx in 0..self.peptide.inner.sequence.len() - 1 { 122 | let r = self.peptide.inner.sequence[idx]; 123 | let m = self.peptide.inner.modifications.get(idx).unwrap_or(&0.0); 124 | 125 | cm += match self.kind.inner { 126 | Kind::A | Kind::B | Kind::C => monoisotopic(r) + m, 127 | Kind::X | Kind::Y | Kind::Z => -(monoisotopic(r) + m), 128 | }; 129 | 130 | ions.push(PyIon { 131 | inner: Ion { 132 | kind: self.kind.inner.clone(), 133 | monoisotopic_mass: cm, 134 | }, 135 | }); 136 | } 137 | Ok(ions) 138 | } 139 | } 140 | 141 | #[pymodule] 142 | pub fn py_ion_series(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 143 | m.add_class::()?; 144 | m.add_class::()?; 145 | m.add_class::()?; 146 | Ok(()) 147 | } 148 | -------------------------------------------------------------------------------- /sagepy-connector/src/py_mass.rs: -------------------------------------------------------------------------------- 1 | use pyo3::exceptions::PyValueError; 2 | use pyo3::prelude::*; 3 | use pyo3::types::PyList; 4 | 5 | use sage_core::mass::{ 6 | 
composition, monoisotopic, Composition, Tolerance, H2O, NEUTRON, NH3, PROTON, 7 | }; 8 | 9 | #[pyfunction] 10 | fn h2o() -> f32 { 11 | H2O 12 | } 13 | 14 | #[pyfunction] 15 | fn proton() -> f32 { 16 | PROTON 17 | } 18 | 19 | #[pyfunction] 20 | fn neutron() -> f32 { 21 | NEUTRON 22 | } 23 | 24 | #[pyfunction] 25 | fn nh3() -> f32 { 26 | NH3 27 | } 28 | 29 | #[pyfunction] 30 | fn py_monoisotopic(aa: &str) -> PyResult { 31 | if aa.len() == 1 && aa.chars().next().unwrap().is_ascii_uppercase() { 32 | let aa_u8 = aa.as_bytes()[0]; 33 | Ok(monoisotopic(aa_u8)) 34 | } else { 35 | Err(PyErr::new::( 36 | "Input must be a single uppercase ASCII character.", 37 | )) 38 | } 39 | } 40 | 41 | #[pyclass] 42 | #[derive(Clone)] 43 | pub struct PyComposition { 44 | inner: Composition, 45 | } 46 | 47 | #[pymethods] 48 | impl PyComposition { 49 | #[new] 50 | pub fn new(carbon: u16, sulfur: u16) -> Self { 51 | PyComposition { 52 | inner: Composition::new(carbon, 0, sulfur), 53 | } 54 | } 55 | 56 | // Exposing fields for Python access 57 | #[getter] 58 | pub fn carbon(&self) -> u16 { 59 | self.inner.carbon 60 | } 61 | 62 | #[getter] 63 | pub fn sulfur(&self) -> u16 { 64 | self.inner.sulfur 65 | } 66 | 67 | // Static method to sum compositions 68 | #[staticmethod] 69 | pub fn sum(compositions: &Bound<'_, PyList>) -> PyResult { 70 | let mut total_composition = Composition::new(0, 0, 0); 71 | 72 | for comp in compositions.iter() { 73 | let py_comp: PyComposition = comp.extract()?; 74 | total_composition.carbon += py_comp.inner.carbon; 75 | total_composition.sulfur += py_comp.inner.sulfur; 76 | } 77 | 78 | Ok(PyComposition { 79 | inner: total_composition, 80 | }) 81 | } 82 | 83 | #[staticmethod] 84 | fn py_composition(aa: &str) -> PyResult { 85 | // Ensure the string is exactly one character long 86 | if aa.chars().count() == 1 { 87 | // Extract the first character 88 | let aa_char = aa.chars().next().unwrap(); // Safe to use unwrap here as we know it has exactly one character 89 | 
Ok(PyComposition { 90 | inner: composition(aa_char as u8), 91 | }) 92 | } else { 93 | // Return an error if the string is not a single character 94 | Err(PyErr::new::( 95 | "Expected a single character string", 96 | )) 97 | } 98 | } 99 | } 100 | 101 | #[pyclass] 102 | #[derive(Clone)] 103 | pub struct PyTolerance { 104 | pub inner: Tolerance, 105 | } 106 | 107 | #[pymethods] 108 | impl PyTolerance { 109 | #[new] 110 | #[pyo3(signature = (da=None, ppm=None))] 111 | fn new(da: Option<(f32, f32)>, ppm: Option<(f32, f32)>) -> PyResult { 112 | let tolerance = match (da, ppm) { 113 | (Some((lo, hi)), None) => Tolerance::Da(lo, hi), 114 | (None, Some((lo, hi))) => Tolerance::Ppm(lo, hi), 115 | _ => { 116 | return Err(PyValueError::new_err( 117 | "Provide either da or ppm values, not both.", 118 | )) 119 | } 120 | }; 121 | 122 | Ok(PyTolerance { inner: tolerance }) 123 | } 124 | 125 | #[getter] 126 | fn da(&self) -> Option<(f32, f32)> { 127 | match self.inner { 128 | Tolerance::Da(lo, hi) => Some((lo, hi)), 129 | _ => None, 130 | } 131 | } 132 | 133 | #[getter] 134 | fn ppm(&self) -> Option<(f32, f32)> { 135 | match self.inner { 136 | Tolerance::Ppm(lo, hi) => Some((lo, hi)), 137 | _ => None, 138 | } 139 | } 140 | 141 | fn bounds(&self, center: f32) -> (f32, f32) { 142 | self.inner.bounds(center) 143 | } 144 | 145 | fn contains(&self, center: f32, target: f32) -> bool { 146 | self.inner.contains(center, target) 147 | } 148 | 149 | #[staticmethod] 150 | fn ppm_to_delta_mass(center: f32, ppm: f32) -> f32 { 151 | Tolerance::ppm_to_delta_mass(center, ppm) 152 | } 153 | 154 | fn __mul__(&self, rhs: f64) -> PyResult { 155 | let result = self.inner.clone() * rhs as f32; 156 | Ok(PyTolerance { inner: result }) 157 | } 158 | } 159 | 160 | #[pymodule] 161 | pub fn py_mass(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 162 | m.add_function(wrap_pyfunction!(h2o, m)?)?; 163 | m.add_function(wrap_pyfunction!(proton, m)?)?; 164 | m.add_function(wrap_pyfunction!(neutron, m)?)?; 
165 | m.add_function(wrap_pyfunction!(nh3, m)?)?; 166 | m.add_function(wrap_pyfunction!(py_monoisotopic, m)?)?; 167 | m.add_class::()?; 168 | m.add_class::()?; 169 | Ok(()) 170 | } 171 | -------------------------------------------------------------------------------- /sagepy-connector/src/py_mobility_model.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3::types::PyList; 3 | use pyo3::exceptions::PyRuntimeError; 4 | use sage_core::ml::mobility_model::predict; 5 | use sage_core::scoring::Feature; 6 | use crate::py_database::PyIndexedDatabase; 7 | use crate::py_scoring::{PyPsm}; 8 | 9 | #[pyfunction] 10 | pub fn py_predict_im( 11 | _py: Python, 12 | psm_collection: &Bound<'_, PyList>, 13 | indexed_database: &PyIndexedDatabase, 14 | ) -> PyResult<()> { 15 | 16 | let indexed_feats: Vec<(usize, Feature)> = psm_collection.iter() 17 | .enumerate() 18 | .map(|(idx, item)| { 19 | let psm: Bound<'_, PyPsm> = item 20 | .extract() 21 | .expect("Failed to extract PyPsm"); 22 | // clone just the inner Feature (sage_feature) 23 | (idx, psm.borrow().inner.sage_feature.clone()) 24 | }) 25 | .collect(); 26 | 27 | let mut feats: Vec = indexed_feats.iter() 28 | .map(|(_, feat)| feat.clone()) 29 | .collect(); 30 | 31 | if predict(&indexed_database.inner, &mut feats).is_none() { 32 | return Err(PyRuntimeError::new_err( 33 | "Retention model fit failed: not enough data or R² < 0.7" 34 | )); 35 | } 36 | 37 | // 3) write back the two mutated fields 38 | for ((orig_idx, _), updated) in indexed_feats.iter().zip(feats.iter()) { 39 | let psm: Bound<'_, PyPsm> = psm_collection 40 | .get_item(*orig_idx) 41 | .expect("Failed to get PyPsm") 42 | .extract()?; 43 | let mut psm_borrow = psm.borrow_mut(); 44 | psm_borrow.inner.sage_feature.predicted_ims = updated.predicted_ims; 45 | psm_borrow.inner.sage_feature.delta_ims_model = updated.delta_ims_model; 46 | } 47 | 48 | Ok(()) 49 | } 50 | 51 | 52 | #[pymodule] 53 | pub fn 
py_mobility_model(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 54 | m.add_function(wrap_pyfunction!(py_predict_im, m)?)?; 55 | Ok(()) 56 | } -------------------------------------------------------------------------------- /sagepy-connector/src/py_modification.rs: -------------------------------------------------------------------------------- 1 | use pyo3::exceptions::PyValueError; 2 | use pyo3::prelude::*; 3 | use pyo3::types::PyDict; 4 | use sage_core::modification::{validate_mods, InvalidModification, ModificationSpecificity}; 5 | use std::collections::HashMap; 6 | use std::str::FromStr; 7 | 8 | #[pyclass] 9 | #[derive(Clone, Debug, PartialEq, Hash)] 10 | pub struct PyModificationSpecificity { 11 | pub inner: ModificationSpecificity, 12 | } 13 | 14 | #[pymethods] 15 | impl PyModificationSpecificity { 16 | #[new] 17 | pub fn new(s: &str) -> PyResult { 18 | match ModificationSpecificity::from_str(s) { 19 | Ok(m) => Ok(PyModificationSpecificity { inner: m }), 20 | Err(InvalidModification::Empty) => { 21 | Err(PyValueError::new_err("Empty modification string")) 22 | } 23 | Err(InvalidModification::InvalidResidue(c)) => Err(PyValueError::new_err(format!( 24 | "Invalid modification string: unrecognized residue ({})", 25 | c 26 | ))), 27 | Err(InvalidModification::TooLong(s)) => Err(PyValueError::new_err(format!( 28 | "Invalid modification string: {} is too long", 29 | s 30 | ))), 31 | } 32 | } 33 | 34 | #[getter] 35 | pub fn as_string(&self) -> String { 36 | self.inner.to_string() 37 | } 38 | } 39 | 40 | impl Eq for PyModificationSpecificity {} 41 | 42 | #[pyfunction] 43 | #[pyo3(signature = (input=None))] 44 | pub fn py_validate_mods(input: Option<&Bound<'_, PyDict>>) -> HashMap { 45 | // unwrap the input 46 | let input = input.map(|d| d.extract::>().unwrap()); 47 | // validate the mods 48 | let output = validate_mods(input); 49 | // convert to a py dict 50 | let py_validated_mods = output 51 | .iter() 52 | .map(|(k, v)| (PyModificationSpecificity { inner: 
k.clone() }, *v)) 53 | .collect::>(); 54 | 55 | py_validated_mods 56 | } 57 | 58 | #[pyfunction] 59 | #[pyo3(signature = (input=None))] 60 | pub fn py_validate_var_mods( 61 | input: Option<&Bound<'_, PyDict>>, 62 | ) -> HashMap> { 63 | // unwrap the input 64 | let input = input.map(|d| d.extract::>>().unwrap()); 65 | let mut output: HashMap> = HashMap::new(); 66 | 67 | if let Some(input) = input { 68 | for (s, mass) in input { 69 | match ModificationSpecificity::from_str(&s) { 70 | Ok(m) => { 71 | output.insert(PyModificationSpecificity { inner: m }, mass); 72 | } 73 | Err(InvalidModification::Empty) => { 74 | log::error!("Skipping invalid modification string: empty") 75 | } 76 | Err(InvalidModification::InvalidResidue(c)) => { 77 | log::error!( 78 | "Skipping invalid modification string: unrecognized residue ({})", 79 | c 80 | ) 81 | } 82 | Err(InvalidModification::TooLong(s)) => { 83 | log::error!("Skipping invalid modification string: {} is too long", s) 84 | } 85 | } 86 | } 87 | } 88 | output 89 | } 90 | 91 | #[pymodule] 92 | pub fn py_modification(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 93 | m.add_class::()?; 94 | m.add_wrapped(wrap_pyfunction!(py_validate_mods))?; 95 | m.add_wrapped(wrap_pyfunction!(py_validate_var_mods))?; 96 | Ok(()) 97 | } 98 | -------------------------------------------------------------------------------- /sagepy-connector/src/py_peptide.rs: -------------------------------------------------------------------------------- 1 | use pyo3::exceptions::PyValueError; 2 | use pyo3::prelude::*; 3 | use std::sync::Arc; 4 | 5 | use crate::py_enzyme::{PyDigest, PyPosition}; 6 | use sage_core::peptide::Peptide; 7 | 8 | #[pyclass] 9 | #[derive(Clone)] 10 | pub struct PyPeptide { 11 | pub inner: Peptide, 12 | } 13 | 14 | #[pymethods] 15 | impl PyPeptide { 16 | #[new] 17 | #[pyo3(signature = (decoy, sequence, modifications, mono_isotopic, missed_cleavages, position, proteins, semi_enzymatic, n_term=None, c_term=None))] 18 | pub fn new( 
19 | decoy: bool, 20 | sequence: String, 21 | modifications: Vec, 22 | mono_isotopic: f32, 23 | missed_cleavages: u8, 24 | position: PyPosition, 25 | proteins: Vec, 26 | semi_enzymatic: bool, 27 | n_term: Option, 28 | c_term: Option, 29 | ) -> PyResult { 30 | let sequence_bytes = sequence.into_bytes(); // Convert the string to Vec 31 | let boxed_sequence = sequence_bytes.into_boxed_slice(); // Convert Vec to Box<[u8]> 32 | let arc_sequence = Arc::from(boxed_sequence); // Convert Box<[u8]> to Arc<[u8]> without dereferencing 33 | 34 | // Convert Python list of strings to Vec> 35 | let arc_proteins = proteins.into_iter().map(Arc::from).collect(); 36 | 37 | Ok(PyPeptide { 38 | inner: Peptide { 39 | decoy, 40 | sequence: arc_sequence, 41 | modifications: modifications, 42 | nterm: n_term, 43 | cterm: c_term, 44 | monoisotopic: mono_isotopic, 45 | missed_cleavages, 46 | position: position.inner, 47 | proteins: arc_proteins, 48 | semi_enzymatic, 49 | }, 50 | }) 51 | } 52 | 53 | #[staticmethod] 54 | fn try_new_from_digest(digest: &PyDigest) -> PyResult { 55 | let peptide = Peptide::try_from(digest.inner.clone()) 56 | .map_err(|_e| PyErr::new::(format!("Error creating peptide.")))?; 57 | Ok(PyPeptide { inner: peptide }) 58 | } 59 | 60 | #[getter] 61 | pub fn decoy(&self) -> bool { 62 | self.inner.decoy 63 | } 64 | 65 | #[getter] 66 | pub fn sequence(&self) -> &str { 67 | std::str::from_utf8(&self.inner.sequence).unwrap() 68 | } 69 | 70 | #[getter] 71 | pub fn modifications(&self) -> Vec { 72 | self.inner.modifications.clone() 73 | } 74 | 75 | #[getter] 76 | pub fn n_term(&self) -> Option { 77 | self.inner.nterm 78 | } 79 | 80 | #[getter] 81 | pub fn c_term(&self) -> Option { 82 | self.inner.cterm 83 | } 84 | 85 | #[getter] 86 | pub fn monoisotopic(&self) -> f32 { 87 | self.inner.monoisotopic 88 | } 89 | 90 | #[getter] 91 | pub fn missed_cleavages(&self) -> u8 { 92 | self.inner.missed_cleavages 93 | } 94 | 95 | #[getter] 96 | pub fn position(&self) -> PyPosition { 97 | 
PyPosition { 98 | inner: self.inner.position, 99 | } 100 | } 101 | 102 | #[getter] 103 | pub fn proteins(&self) -> Vec { 104 | self.inner.proteins.iter().map(|s| s.to_string()).collect() 105 | } 106 | 107 | #[getter] 108 | pub fn semi_enzymatic(&self) -> bool { 109 | self.inner.semi_enzymatic 110 | } 111 | 112 | #[pyo3(signature = (keep_ends=None))] 113 | pub fn reverse(&self, keep_ends: Option) -> PyPeptide { 114 | PyPeptide { inner: self.inner.reverse(keep_ends.unwrap_or(true)), } 115 | } 116 | 117 | #[pyo3(signature = (keep_ends=None))] 118 | pub fn shuffle(&self, keep_ends: Option) -> PyPeptide { 119 | PyPeptide { inner: self.inner.shuffle(keep_ends.unwrap_or(true)), } 120 | } 121 | } 122 | 123 | #[pymodule] 124 | pub fn py_peptide(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 125 | m.add_class::()?; 126 | Ok(()) 127 | } 128 | -------------------------------------------------------------------------------- /sagepy-connector/src/py_qfdr.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3::types::PyList; 3 | use qfdrust::dataset::TDCMethod; 4 | use qfdrust::picked::{protein_id_from_psm, spectrum_q_value, picked_peptide, picked_protein}; 5 | use qfdrust::psm::Psm; 6 | use crate::py_scoring::PyPsm; 7 | 8 | #[pyclass] 9 | #[derive(Clone)] 10 | pub struct PyTDCMethod { 11 | pub inner: TDCMethod, 12 | } 13 | 14 | #[pymethods] 15 | impl PyTDCMethod { 16 | #[new] 17 | fn new(method: &str) -> Self { 18 | PyTDCMethod { 19 | inner: TDCMethod::from_str(method), 20 | } 21 | } 22 | pub fn to_str(&self) -> &str { 23 | self.inner.to_str() 24 | } 25 | } 26 | 27 | #[pyfunction] 28 | pub fn target_decoy_competition( 29 | method: &PyTDCMethod, 30 | spectra_idx: Vec, 31 | match_idx: Vec, 32 | target: Vec, 33 | scores: Vec, 34 | match_identity_candidates: Vec>>, 35 | ) -> (Vec, Vec, Vec>, Vec, Vec, Vec) { 36 | let method = method.inner.clone(); 37 | 38 | let (spec_idx, match_idx, match_identity, decoy, 
scores, q_values) = 39 | qfdrust::dataset::target_decoy_competition( 40 | method, 41 | spectra_idx, 42 | match_idx, 43 | target, 44 | scores, 45 | match_identity_candidates, 46 | ); 47 | 48 | (spec_idx, match_idx, match_identity, decoy, scores, q_values) 49 | } 50 | 51 | #[pyfunction] 52 | pub fn assign_spectrum_q(_py: Python, psm_collection: &Bound<'_, PyList>, use_hyper_score: bool) -> PyResult<()> { 53 | let inner_collection: Vec = psm_collection 54 | .iter() 55 | .map(|item| { 56 | let feature: Bound<'_, PyPsm> = item.extract().expect("Failed to extract PyPsm"); 57 | feature.borrow().inner.clone() 58 | }) 59 | .collect(); 60 | 61 | let q_values = spectrum_q_value(&inner_collection, use_hyper_score); 62 | 63 | for (index, q_value) in q_values.iter().enumerate() { 64 | let feature: Bound<'_, PyPsm> = psm_collection.get_item(index).expect("Failed to get PyPsm").extract()?; 65 | let mut feature_borrow = feature.borrow_mut(); 66 | feature_borrow.inner.sage_feature.spectrum_q = *q_value as f32; 67 | } 68 | 69 | Ok(()) 70 | } 71 | 72 | #[pyfunction] 73 | pub fn assign_peptide_q(_py: Python, psm_collection: &Bound<'_, PyList>, use_hyper_score: bool) -> PyResult<()> { 74 | let mut inner_collection: Vec = psm_collection 75 | .iter() 76 | .map(|item| { 77 | let feature: Bound<'_, PyPsm> = item.extract().expect("Failed to extract PyPsm"); 78 | feature.borrow().inner.clone() 79 | }) 80 | .collect(); 81 | 82 | let q_values = picked_peptide(&mut inner_collection, use_hyper_score); 83 | 84 | for (index, _) in psm_collection.iter().enumerate() { 85 | let feature: Bound<'_, PyPsm> = psm_collection.get_item(index).expect("Failed to get PyPsm").extract()?; 86 | let mut feature_borrow = feature.borrow_mut(); 87 | 88 | let key = match feature_borrow.inner.sage_feature.label { 89 | -1 => feature_borrow.inner.sequence_decoy.clone().unwrap().sequence.clone(), 90 | _ => feature_borrow.inner.sequence.clone().unwrap().sequence.clone(), 91 | }; 92 | 93 | 
feature_borrow.inner.sage_feature.peptide_q = *q_values.get(&key).unwrap_or(&1.0) as f32; 94 | } 95 | 96 | Ok(()) 97 | } 98 | 99 | #[pyfunction] 100 | pub fn assign_protein_q(_py: Python, psm_collection: &Bound<'_, PyList>, use_hyper_score: bool) -> PyResult<()> { 101 | let mut inner_collection: Vec = psm_collection 102 | .iter() 103 | .map(|item| { 104 | let feature: Bound<'_, PyPsm> = item.extract().expect("Failed to extract PyPsm"); 105 | feature.borrow().inner.clone() 106 | }) 107 | .collect(); 108 | 109 | let q_values = picked_protein(&mut inner_collection, use_hyper_score); 110 | 111 | for (index, _) in psm_collection.iter().enumerate() { 112 | let feature: Bound<'_, PyPsm> = psm_collection.get_item(index).expect("Failed to get PyPsm").extract()?; 113 | let mut feature_borrow = feature.borrow_mut(); 114 | 115 | let key = protein_id_from_psm(&feature_borrow.inner, "rev_", true); 116 | 117 | feature_borrow.inner.sage_feature.protein_q = *q_values.get(&key).unwrap_or(&1.0) as f32; 118 | } 119 | 120 | Ok(()) 121 | } 122 | 123 | #[pymodule] 124 | pub fn py_qfdr(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 125 | m.add_class::()?; 126 | m.add_function(wrap_pyfunction!(target_decoy_competition, m)?)?; 127 | m.add_function(wrap_pyfunction!(assign_spectrum_q, m)?)?; 128 | m.add_function(wrap_pyfunction!(assign_peptide_q, m)?)?; 129 | m.add_function(wrap_pyfunction!(assign_protein_q, m)?)?; 130 | Ok(()) 131 | } -------------------------------------------------------------------------------- /sagepy-connector/src/py_retention_alignment.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3::types::PyList; 3 | use sage_core::ml::retention_alignment::{Alignment, global_alignment}; 4 | 5 | use sage_core::scoring::Feature; 6 | use crate::py_scoring::{PyFeature, PyPsm}; 7 | 8 | #[pyclass] 9 | #[derive(Clone)] 10 | pub struct PyAlignment { 11 | pub inner: Alignment, 12 | } 13 | 14 | #[pymethods] 15 
| impl PyAlignment { 16 | #[new] 17 | pub fn new( 18 | file_id: usize, 19 | max_rt: f32, 20 | slope: f32, 21 | intercept: f32, 22 | ) -> Self { 23 | PyAlignment { 24 | inner: Alignment { 25 | file_id, 26 | max_rt, 27 | slope, 28 | intercept, 29 | }, 30 | } 31 | } 32 | #[getter] 33 | pub fn file_id(&self) -> usize { 34 | self.inner.file_id 35 | } 36 | #[getter] 37 | pub fn max_rt(&self) -> f32 { 38 | self.inner.max_rt 39 | } 40 | #[getter] 41 | pub fn slope(&self) -> f32 { 42 | self.inner.slope 43 | } 44 | #[getter] 45 | pub fn intercept(&self) -> f32 { 46 | self.inner.intercept 47 | } 48 | } 49 | 50 | #[pyfunction] 51 | pub fn py_global_alignment( 52 | features: &Bound<'_, PyList>, 53 | n_files: usize, 54 | ) -> Vec { 55 | 56 | let mut inner_features: Vec = features.iter() 57 | .map(|item| { 58 | let feature: Bound<'_, PyFeature> = item.extract().expect("Failed to extract PyFeature"); 59 | feature.borrow().inner.clone() 60 | }) 61 | .collect(); 62 | 63 | global_alignment(&mut inner_features, n_files) 64 | .into_iter() 65 | .map(|alignment| PyAlignment { inner: alignment }) 66 | .collect() 67 | } 68 | 69 | #[pyfunction] 70 | pub fn py_global_alignment_psm( 71 | psms: &Bound<'_, PyList>, 72 | n_files: usize, 73 | ) -> Vec { 74 | // Step 1: clone out features + remember original index 75 | let indexed_psms: Vec<(usize, Feature)> = psms.iter() 76 | .enumerate() 77 | .map(|(i, item)| { 78 | let psm: Bound<'_, PyPsm> = item.extract().expect("Failed to extract PyPsm"); 79 | (i, psm.borrow().inner.sage_feature.clone()) 80 | }) 81 | .collect(); 82 | 83 | // Step 2: collect features to pass into alignment 84 | let mut features: Vec = indexed_psms.iter().map(|(_, feat)| feat.clone()).collect(); 85 | 86 | // Step 3: run global alignment on copied data 87 | let alignments = global_alignment(&mut features, n_files); 88 | 89 | // Step 4: write aligned_rt back into PyPsm 90 | for ((i, _), updated_feat) in indexed_psms.iter().zip(features.iter()) { 91 | let psm: Bound<'_, PyPsm> = 
psms.get_item(*i) 92 | .expect("Index out of range") 93 | .extract() 94 | .expect("Failed to extract PyPsm"); 95 | let mut psm_mut = psm.borrow_mut(); 96 | psm_mut.inner.sage_feature.aligned_rt = updated_feat.aligned_rt; 97 | } 98 | 99 | // Step 5: return the alignment parameters 100 | alignments 101 | .into_iter() 102 | .map(|alignment| PyAlignment { inner: alignment }) 103 | .collect() 104 | } 105 | 106 | #[pymodule] 107 | pub fn py_retention_alignment(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 108 | m.add_class::()?; 109 | m.add_function(wrap_pyfunction!(py_global_alignment, m)?)?; 110 | m.add_function(wrap_pyfunction!(py_global_alignment_psm, m)?)?; 111 | Ok(()) 112 | } -------------------------------------------------------------------------------- /sagepy-connector/src/py_retention_model.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3::types::PyList; 3 | use pyo3::exceptions::PyRuntimeError; 4 | use sage_core::ml::retention_model::predict; 5 | use sage_core::scoring::Feature; 6 | use crate::py_database::PyIndexedDatabase; 7 | use crate::py_scoring::{PyPsm}; 8 | 9 | #[pyfunction] 10 | pub fn py_predict_rt( 11 | _py: Python, 12 | psm_collection: &Bound<'_, PyList>, 13 | indexed_database: &PyIndexedDatabase, 14 | ) -> PyResult<()> { 15 | 16 | let indexed_feats: Vec<(usize, Feature)> = psm_collection.iter() 17 | .enumerate() 18 | .map(|(idx, item)| { 19 | let psm: Bound<'_, PyPsm> = item 20 | .extract() 21 | .expect("Failed to extract PyPsm"); 22 | // clone just the inner Feature (sage_feature) 23 | (idx, psm.borrow().inner.sage_feature.clone()) 24 | }) 25 | .collect(); 26 | 27 | let mut feats: Vec = indexed_feats.iter() 28 | .map(|(_, feat)| feat.clone()) 29 | .collect(); 30 | 31 | if predict(&indexed_database.inner, &mut feats).is_none() { 32 | return Err(PyRuntimeError::new_err( 33 | "Retention model fit failed: not enough data or R² < 0.7" 34 | )); 35 | } 36 | 37 | // 3) 
write back the two mutated fields 38 | for ((orig_idx, _), updated) in indexed_feats.iter().zip(feats.iter()) { 39 | let psm: Bound<'_, PyPsm> = psm_collection 40 | .get_item(*orig_idx) 41 | .expect("Failed to get PyPsm") 42 | .extract()?; 43 | let mut psm_borrow = psm.borrow_mut(); 44 | psm_borrow.inner.sage_feature.predicted_rt = updated.predicted_rt; 45 | psm_borrow.inner.sage_feature.delta_rt_model = updated.delta_rt_model; 46 | } 47 | 48 | Ok(()) 49 | } 50 | 51 | 52 | #[pymodule] 53 | pub fn py_retention_model(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 54 | m.add_function(wrap_pyfunction!(py_predict_rt, m)?)?; 55 | Ok(()) 56 | } -------------------------------------------------------------------------------- /sagepy-connector/src/py_tmt.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use sage_core::tmt::{Isobaric, Purity, TmtQuant}; 3 | use crate::py_scoring::PyFeature; 4 | use crate::py_spectrum::{PyPeak, PyProcessedSpectrum}; 5 | 6 | #[pyclass] 7 | pub struct PyIsobaric { 8 | pub inner: Isobaric, 9 | } 10 | 11 | #[pymethods] 12 | impl PyIsobaric { 13 | #[new] 14 | pub fn new( 15 | type_name: &str, 16 | ) -> Self { 17 | PyIsobaric { 18 | inner: match type_name { 19 | "tmt6" => Isobaric::Tmt6, 20 | "tmt10" => Isobaric::Tmt10, 21 | "tmt11" => Isobaric::Tmt11, 22 | "tmt16" => Isobaric::Tmt16, 23 | "tmt18" => Isobaric::Tmt18, 24 | _ => panic!("Invalid isobaric type"), 25 | }, 26 | } 27 | } 28 | #[getter] 29 | pub fn type_name(&self) -> String { 30 | match self.inner { 31 | Isobaric::Tmt6 => "tmt6".to_string(), 32 | Isobaric::Tmt10 => "tmt10".to_string(), 33 | Isobaric::Tmt11 => "tmt11".to_string(), 34 | Isobaric::Tmt16 => "tmt16".to_string(), 35 | Isobaric::Tmt18 => "tmt18".to_string(), 36 | _ => panic!("Invalid isobaric type"), 37 | } 38 | } 39 | 40 | pub fn modification_mass(&self) -> Option { 41 | self.inner.modification_mass() 42 | } 43 | } 44 | 45 | #[pyclass] 46 | 
#[derive(Clone)] 47 | pub struct PyPurity { 48 | pub inner: Purity, 49 | } 50 | 51 | #[pymethods] 52 | impl PyPurity { 53 | #[new] 54 | pub fn new(ratio: f32, correct_precursors: usize, incorrect_precursors: usize, ) -> Self { 55 | PyPurity { 56 | inner: Purity { 57 | ratio, 58 | correct_precursors, 59 | incorrect_precursors, 60 | }, 61 | } 62 | } 63 | 64 | #[getter] 65 | pub fn ratio(&self) -> f32 { 66 | self.inner.ratio 67 | } 68 | 69 | #[getter] 70 | pub fn correct_precursors(&self) -> usize { 71 | self.inner.correct_precursors 72 | } 73 | 74 | #[getter] 75 | pub fn incorrect_precursors(&self) -> usize { 76 | self.inner.incorrect_precursors 77 | } 78 | } 79 | 80 | #[pyclass] 81 | #[derive(Clone)] 82 | pub struct PyQuant { 83 | pub hit: PyFeature, 84 | pub hit_purity: PyPurity, 85 | pub spectrum: PyProcessedSpectrum, 86 | pub chimera: Option, 87 | pub chimera_purity: Option, 88 | pub intensities: Vec>, 89 | } 90 | 91 | #[pymethods] 92 | impl PyQuant { 93 | #[new] 94 | #[pyo3(signature = (hit, hit_purity, spectrum, intensities, chimera=None, chimera_purity=None))] 95 | pub fn new( 96 | hit: PyFeature, 97 | hit_purity: PyPurity, 98 | spectrum: PyProcessedSpectrum, 99 | intensities: Vec>, 100 | chimera: Option, 101 | chimera_purity: Option, 102 | ) -> Self { 103 | PyQuant { 104 | hit, 105 | hit_purity, 106 | spectrum, 107 | chimera, 108 | chimera_purity, 109 | intensities, 110 | } 111 | } 112 | 113 | #[getter] 114 | pub fn hit(&self) -> PyFeature { 115 | self.hit.clone() 116 | } 117 | 118 | #[getter] 119 | pub fn hit_purity(&self) -> PyPurity { 120 | self.hit_purity.clone() 121 | } 122 | 123 | #[getter] 124 | pub fn spectrum(&self) -> PyProcessedSpectrum { 125 | self.spectrum.clone() 126 | } 127 | 128 | #[getter] 129 | pub fn chimera(&self) -> Option { 130 | self.chimera.clone() 131 | } 132 | 133 | #[getter] 134 | pub fn chimera_purity(&self) -> Option { 135 | self.chimera_purity.clone() 136 | } 137 | 138 | #[getter] 139 | pub fn intensities(&self) -> Vec> { 140 | 
self.intensities.clone() 141 | } 142 | } 143 | 144 | #[pyclass] 145 | #[derive(Clone)] 146 | pub struct PyTmtQuant { 147 | pub inner: TmtQuant, 148 | } 149 | 150 | #[pymethods] 151 | impl PyTmtQuant { 152 | #[new] 153 | pub fn new( 154 | spec_id: String, 155 | file_id: usize, 156 | ion_injection_time: f32, 157 | peaks: Vec 158 | ) -> Self { 159 | PyTmtQuant { 160 | inner: TmtQuant { 161 | spec_id, 162 | file_id, 163 | ion_injection_time, 164 | peaks, 165 | }, 166 | } 167 | } 168 | 169 | #[getter] 170 | pub fn spec_id(&self) -> String { 171 | self.inner.spec_id.clone() 172 | } 173 | 174 | #[getter] 175 | pub fn file_id(&self) -> usize { 176 | self.inner.file_id 177 | } 178 | 179 | #[getter] 180 | pub fn ion_injection_time(&self) -> f32 { 181 | self.inner.ion_injection_time 182 | } 183 | 184 | #[getter] 185 | pub fn peaks(&self) -> Vec { 186 | self.inner.peaks.clone() 187 | } 188 | } 189 | 190 | 191 | #[pymodule] 192 | pub fn py_tmt(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 193 | m.add_class::()?; 194 | m.add_class::()?; 195 | m.add_class::()?; 196 | m.add_class::()?; 197 | Ok(()) 198 | } 199 | -------------------------------------------------------------------------------- /sagepy-connector/src/py_unimod.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use pyo3::prelude::*; 3 | use unimod::unimod::{unimod_modifications_mass_numerical, unimod_modifications_mass, quantized_mass_to_unimod, quanzie_mass, title_to_unimod_id, modification_atomic_composition}; 4 | 5 | #[pyfunction] 6 | fn unimod_modification_to_mass_numerical() -> HashMap { 7 | unimod_modifications_mass_numerical() 8 | } 9 | 10 | #[pyfunction] 11 | fn unimod_modification_to_mass() -> HashMap<&'static str, f64> { 12 | unimod_modifications_mass() 13 | } 14 | 15 | #[pyfunction] 16 | fn quantized_mass_to_unimod_candidates() -> HashMap> { 17 | quantized_mass_to_unimod() 18 | } 19 | 20 | #[pyfunction] 21 | fn 
quanzied_mass(mass: f32) -> i32 { 22 | quanzie_mass(mass) 23 | } 24 | 25 | #[pyfunction] 26 | fn title_to_unimod_ids() -> HashMap<&'static str, &'static str> { 27 | title_to_unimod_id() 28 | } 29 | 30 | #[pyfunction] 31 | fn modification_atomic_compositions() -> HashMap> { 32 | modification_atomic_composition() 33 | } 34 | 35 | #[pymodule] 36 | pub fn py_unimod(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 37 | m.add_function(wrap_pyfunction!(unimod_modification_to_mass_numerical, m)?)?; 38 | m.add_function(wrap_pyfunction!(unimod_modification_to_mass, m)?)?; 39 | m.add_function(wrap_pyfunction!(quantized_mass_to_unimod_candidates, m)?)?; 40 | m.add_function(wrap_pyfunction!(quanzied_mass, m)?)?; 41 | m.add_function(wrap_pyfunction!(title_to_unimod_ids, m)?)?; 42 | m.add_function(wrap_pyfunction!(modification_atomic_compositions, m)?)?; 43 | Ok(()) 44 | } -------------------------------------------------------------------------------- /sagepy-connector/src/py_utility.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use std::collections::{BTreeMap, HashMap, HashSet}; 3 | use qfdrust::psm::{compress_psms, decompress_psms, Psm}; 4 | use rayon::prelude::*; 5 | use rayon::ThreadPoolBuilder; 6 | use sage_core::ion_series::Kind; 7 | use sage_core::scoring::Fragments; 8 | use crate::py_scoring::{PyFragments, PyPsm}; 9 | use crate::utilities::sage_sequence_to_unimod_sequence; 10 | 11 | /// Converts a cosine similarity to an angle similarity. 12 | /// The angle similarity is calculated as 1 - angle / pi. 13 | /// 14 | /// # Arguments 15 | /// 16 | /// * `cosim` - A f32 representing the cosine similarity. 17 | /// 18 | /// # Returns 19 | /// 20 | /// * A f32 representing the angle similarity. 
21 | /// 22 | #[pyfunction] 23 | pub fn cosim_to_spectral_angle(cosim: f32) -> f32 { 24 | let angle = (1.0 - cosim).acos(); 25 | 1.0 - angle / std::f32::consts::PI 26 | } 27 | 28 | /// Reshape the flat prosit array into a 3D array of shape (29, 2, 3) 29 | /// 30 | /// # Arguments 31 | /// 32 | /// * `flat_array` - a vector of f64 representing the flat prosit array 33 | /// 34 | /// # Returns 35 | /// 36 | /// * `Vec>>` - a 3D array of shape (29, 2, 3) 37 | /// 38 | pub fn reshape_prosit_array(flat_array: Vec) -> Vec>> { 39 | let mut array_return: Vec>> = vec![vec![vec![0.0; 3]; 2]; 29]; 40 | let mut ptr = 0; 41 | 42 | for c in 0..3 { 43 | for row in 0..29 { 44 | // Fill in the Y ion values 45 | array_return[row][0][c] = flat_array[ptr]; 46 | ptr += 1; 47 | } 48 | for row in 0..29 { 49 | // Fill in the B ion values 50 | array_return[row][1][c] = flat_array[ptr]; 51 | ptr += 1; 52 | } 53 | } 54 | 55 | array_return 56 | } 57 | 58 | #[pyfunction] 59 | pub fn flat_prosit_array_to_fragments_map(flat_intensities: Vec) -> BTreeMap<(u32, i32, i32), f32> { 60 | // Reshape the flat prosit array into a 3D array of shape (29, 2, 3) 61 | let reshaped_intensities = reshape_prosit_array(flat_intensities); 62 | 63 | // create hashmap of (kind, charge, ordinal) -> intensity 64 | let mut fragments: BTreeMap<(u32, i32, i32), f32> = BTreeMap::new(); 65 | for z in 1..=3 { 66 | let intensity_b: Vec = reshaped_intensities[..].iter().map(|x| x[1][z as usize - 1]).collect(); 67 | for i in 1..=29 { 68 | let intensity = intensity_b[i as usize - 1]; 69 | if intensity >= 0.0 { 70 | fragments.insert((0, z, i), intensity); 71 | } 72 | } 73 | 74 | let intensity_y: Vec = reshaped_intensities[..].iter().map(|x| x[0][z as usize - 1]).collect(); 75 | for i in 1..=29 { 76 | let intensity = intensity_y[i as usize - 1]; 77 | if intensity >= 0.0 { 78 | fragments.insert((1, z, i), intensity); 79 | } 80 | } 81 | } 82 | fragments 83 | } 84 | 85 | #[pyfunction] 86 | pub fn 
py_fragments_to_fragments_map(fragments: &PyFragments, normalize: bool) -> BTreeMap<(u32, i32, i32), f32> { 87 | let mut fragments_map: BTreeMap<(u32, i32, i32), f32> = BTreeMap::new(); 88 | 89 | let max_intensity = fragments.inner.intensities.iter().cloned().fold(f32::NEG_INFINITY, f32::max); 90 | 91 | for i in 0..fragments.inner.mz_calculated.len() { 92 | let kind = match fragments.inner.kinds[i] { 93 | Kind::B => 0, 94 | Kind::Y => 1, 95 | _ => panic!("Invalid ion kind"), 96 | }; 97 | 98 | let intensity = if normalize { 99 | fragments.inner.intensities[i] / max_intensity 100 | } else { 101 | fragments.inner.intensities[i] 102 | }; 103 | 104 | fragments_map.insert((kind, 105 | fragments.inner.charges[i], 106 | fragments.inner.fragment_ordinals[i]), intensity); 107 | } 108 | fragments_map 109 | } 110 | 111 | pub fn _map_to_py_fragments(fragments: &HashMap<(u32, i32, i32), f32>, 112 | mz_calculated: Vec, mz_experimental: Vec) -> PyFragments { 113 | 114 | let mut kinds: Vec = Vec::new(); 115 | let mut ordinals: Vec = Vec::new(); 116 | let mut charges: Vec = Vec::new(); 117 | let mut intensities: Vec = Vec::new(); 118 | 119 | for (kind, ordinal, charge) in fragments.keys() { 120 | let intensity = fragments.get(&(*kind, *charge, *ordinal)).unwrap(); 121 | let kind = match kind { 122 | 0 => Kind::B, 123 | 1 => Kind::Y, 124 | _ => panic!("Invalid ion kind"), 125 | }; 126 | kinds.push(kind); 127 | ordinals.push(*ordinal); 128 | charges.push(*charge); 129 | intensities.push(*intensity); 130 | } 131 | 132 | let fragments = Fragments { 133 | mz_calculated, 134 | mz_experimental, 135 | kinds, 136 | fragment_ordinals: ordinals, 137 | charges, 138 | intensities, 139 | }; 140 | 141 | PyFragments { 142 | inner: fragments, 143 | } 144 | } 145 | 146 | #[pyfunction] 147 | pub fn psms_to_json(psms: Vec, num_threads: usize) -> Vec { 148 | let thread_pool = ThreadPoolBuilder::new().num_threads(num_threads).build().unwrap(); 149 | 150 | thread_pool.install(|| { 151 | 
psms.par_iter().map(|psm| { 152 | serde_json::to_string(&psm.inner).unwrap() 153 | }).collect() 154 | }) 155 | } 156 | 157 | #[pyfunction] 158 | pub fn psms_to_json_bin(psms: Vec) -> Vec { 159 | let inner_psms = psms.iter().map(|psm| psm.inner.clone()).collect::>(); 160 | bincode::serialize(&inner_psms).unwrap() 161 | } 162 | 163 | #[pyfunction] 164 | pub fn json_bin_to_psms(json_bin: Vec) -> Vec { 165 | let inner_psms: Vec = bincode::deserialize(&json_bin).unwrap(); 166 | inner_psms.iter().map(|psm| PyPsm { 167 | inner: psm.clone(), 168 | }).collect() 169 | } 170 | 171 | #[pyfunction] 172 | pub fn sage_sequence_to_unimod(sequence: String, modifications: Vec, expected_modifications: HashSet) -> String { 173 | sage_sequence_to_unimod_sequence(sequence, &modifications, &expected_modifications) 174 | } 175 | 176 | #[pyfunction] 177 | pub fn psms_to_feature_matrix(psms: Vec, num_threads: usize) -> Vec> { 178 | let thread_pool = ThreadPoolBuilder::new().num_threads(num_threads).build().unwrap(); 179 | 180 | thread_pool.install(|| { 181 | psms.par_iter().map(|psm| { 182 | psm.inner.get_feature_vector() 183 | } 184 | ).collect() 185 | }) 186 | } 187 | 188 | #[pyfunction] 189 | pub fn get_psm_sequences_par(psms: Vec, num_threads: usize) -> Vec { 190 | let thread_pool = ThreadPoolBuilder::new().num_threads(num_threads).build().unwrap(); 191 | 192 | thread_pool.install(|| { 193 | psms.par_iter().map(|psm| { 194 | psm.inner.sequence.clone().unwrap().sequence 195 | }).collect() 196 | }) 197 | } 198 | 199 | #[pyfunction] 200 | pub fn get_psm_peptide_idx_par(psms: Vec, num_threads: usize) -> Vec { 201 | let thread_pool = ThreadPoolBuilder::new().num_threads(num_threads).build().unwrap(); 202 | 203 | thread_pool.install(|| { 204 | psms.par_iter().map(|psm| { 205 | psm.inner.sage_feature.peptide_idx.0.clone() 206 | }).collect() 207 | }) 208 | } 209 | 210 | #[pyfunction] 211 | pub fn get_psm_sequences_modified_par(psms: Vec, num_threads: usize) -> Vec { 212 | let thread_pool = 
ThreadPoolBuilder::new().num_threads(num_threads).build().unwrap(); 213 | 214 | thread_pool.install(|| { 215 | psms.par_iter().map(|psm| { 216 | psm.inner.sequence_modified.clone().unwrap().sequence 217 | }).collect() 218 | }) 219 | } 220 | 221 | #[pyfunction] 222 | pub fn get_psm_sequences_decoy_par(psms: Vec, num_threads: usize) -> Vec { 223 | let thread_pool = ThreadPoolBuilder::new().num_threads(num_threads).build().unwrap(); 224 | 225 | thread_pool.install(|| { 226 | psms.par_iter().map(|psm| { 227 | psm.inner.sequence_decoy.clone().unwrap().sequence 228 | }).collect() 229 | }) 230 | } 231 | 232 | #[pyfunction] 233 | pub fn get_psm_sequences_decoy_modified_par(psms: Vec, num_threads: usize) -> Vec { 234 | let thread_pool = ThreadPoolBuilder::new().num_threads(num_threads).build().unwrap(); 235 | 236 | thread_pool.install(|| { 237 | psms.par_iter().map(|psm| { 238 | 239 | let sequence = match &psm.inner.sequence_decoy_modified { 240 | Some(seq) => seq.sequence.clone(), 241 | None => "".to_string(), 242 | }; 243 | 244 | sequence 245 | 246 | }).collect() 247 | }) 248 | } 249 | 250 | #[pyfunction] 251 | pub fn get_psm_spec_idx_par(psms: Vec, num_threads: usize) -> Vec { 252 | let thread_pool = ThreadPoolBuilder::new().num_threads(num_threads).build().unwrap(); 253 | 254 | thread_pool.install(|| { 255 | psms.par_iter().map(|psm| { 256 | psm.inner.spec_idx.clone() 257 | }).collect() 258 | }) 259 | } 260 | 261 | #[pyfunction] 262 | pub fn get_psm_proteins_par(psms: Vec, num_threads: usize) -> Vec> { 263 | let thread_pool = ThreadPoolBuilder::new().num_threads(num_threads).build().unwrap(); 264 | 265 | thread_pool.install(|| { 266 | psms.par_iter().map(|psm| { 267 | psm.inner.proteins.clone() 268 | }).collect() 269 | }) 270 | } 271 | 272 | #[pyfunction] 273 | pub fn py_compress_psms(psms: Vec) -> Vec { 274 | let inner_psms = psms.iter().map(|psm| psm.inner.clone()).collect::>(); 275 | compress_psms(&inner_psms).unwrap() 276 | } 277 | 278 | #[pyfunction] 279 | pub fn 
py_decompress_psms(psms_bin: Vec) -> Vec { 280 | let inner_psms: Vec = decompress_psms(&psms_bin.as_slice()).unwrap(); 281 | inner_psms.iter().map(|psm| PyPsm { 282 | inner: psm.clone(), 283 | }).collect() 284 | } 285 | 286 | #[pymodule] 287 | pub fn py_utility(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 288 | m.add_function(wrap_pyfunction!(flat_prosit_array_to_fragments_map, m)?)?; 289 | m.add_function(wrap_pyfunction!(py_fragments_to_fragments_map, m)?)?; 290 | m.add_function(wrap_pyfunction!(psms_to_json, m)?)?; 291 | m.add_function(wrap_pyfunction!(psms_to_json_bin, m)?)?; 292 | m.add_function(wrap_pyfunction!(json_bin_to_psms, m)?)?; 293 | m.add_function(wrap_pyfunction!(cosim_to_spectral_angle, m)?)?; 294 | m.add_function(wrap_pyfunction!(sage_sequence_to_unimod, m)?)?; 295 | m.add_function(wrap_pyfunction!(psms_to_feature_matrix, m)?)?; 296 | m.add_function(wrap_pyfunction!(get_psm_sequences_par, m)?)?; 297 | m.add_function(wrap_pyfunction!(get_psm_sequences_modified_par, m)?)?; 298 | m.add_function(wrap_pyfunction!(get_psm_sequences_decoy_par, m)?)?; 299 | m.add_function(wrap_pyfunction!(get_psm_sequences_decoy_modified_par, m)?)?; 300 | m.add_function(wrap_pyfunction!(get_psm_spec_idx_par, m)?)?; 301 | m.add_function(wrap_pyfunction!(get_psm_peptide_idx_par, m)?)?; 302 | m.add_function(wrap_pyfunction!(get_psm_proteins_par, m)?)?; 303 | m.add_function(wrap_pyfunction!(py_compress_psms, m)?)?; 304 | m.add_function(wrap_pyfunction!(py_decompress_psms, m)?)?; 305 | Ok(()) 306 | } -------------------------------------------------------------------------------- /sagepy-connector/src/utilities.rs: -------------------------------------------------------------------------------- 1 | use unimod::unimod::{quanzie_mass, quantized_mass_to_unimod}; 2 | use std::collections::HashSet; 3 | 4 | /// Convert a Sage sequence and modifications to a Unimod sequence 5 | /// 6 | /// # Arguments 7 | /// 8 | /// * `sequence` - A string representing the amino acid 
sequence 9 | /// * `modifications` - A vector of floats representing the modifications 10 | /// 11 | /// # Returns 12 | /// 13 | /// * `String` - A string representing the Unimod sequence 14 | /// 15 | pub fn sage_sequence_to_unimod_sequence(sequence: String, modifications: &Vec, expected_modifications: &HashSet) -> String { 16 | 17 | assert_eq!(sequence.len(), modifications.len(), "Sequence and modifications must be the same length"); 18 | 19 | // go over each char and check if modification is present (not 0.0) and possibly convert to unimod 20 | let mut unimod_sequence = String::new(); 21 | let unimod_modifications_qunatized = quantized_mass_to_unimod(); 22 | let empty_vec = Vec::new(); 23 | 24 | for (idx, aa) in sequence.chars().enumerate() { 25 | 26 | // add amino acid to the unimod sequence 27 | unimod_sequence.push(aa); 28 | 29 | // check if the modification is nonzero, need to translate to unimod 30 | if modifications[idx] != 0.0 { 31 | 32 | // quantize the mass from nonzero modification 33 | let quantized_mass = quanzie_mass(modifications[idx]); 34 | 35 | // find the candidate modifications for the quantized mass 36 | let modifications = unimod_modifications_qunatized.get(&quantized_mass).unwrap_or(&empty_vec); 37 | 38 | let mut found = false; 39 | 40 | // check if the expected modification is in the candidate modifications 41 | for modification in modifications { 42 | if expected_modifications.contains(&modification.to_string()) { 43 | unimod_sequence.push_str(modification); 44 | found = true; 45 | } 46 | } 47 | 48 | // if the expected modification is not found, add a placeholder 49 | if !found { 50 | unimod_sequence.push_str("[UNIMOD:?]"); 51 | } 52 | } 53 | } 54 | unimod_sequence 55 | } -------------------------------------------------------------------------------- /sagepy/README.md: -------------------------------------------------------------------------------- 1 | # SAGEpy 2 | A python interface to the core [SAGE](https://github.com/lazear/sage) 
search engine for mass spectrometry proteomics 3 | 4 |

5 | logo 6 |

7 | 8 | ## Installation 9 | `sagepy` is now available via pip: 10 | ``` 11 | pip install sagepy 12 | ``` 13 | 14 | 15 | ### Build from source 16 | 17 | 1. Clone our fork of the SAGE repository: 18 | ``` 19 | git clone git@github.com:theGreatHerrLebert/sage.git 20 | ``` 21 | 22 | 2. Install the sage-core bindings using maturin, optionally in a virtual environment: 23 | ``` 24 | cd sage/crates/sagepy-connector 25 | 26 | # Install maturin 27 | pip install maturin 28 | 29 | # Build and install the bindings 30 | maturin build --release 31 | 32 | # Install the bindings 33 | pip install target/wheels/sagepy_connector-0.1.0-cp38-cp38-manylinux2014_x86_64.whl [--force-reinstall] 34 | ``` 35 | This will provide you with a python exposed version of the core SAGE library. 36 | 37 | 3. Install the sagepy python package with poetry: 38 | ``` 39 | git clone git@github.com:theGreatHerrLebert/sagepy.git 40 | 41 | cd sagepy 42 | 43 | # Install poetry 44 | pip install poetry 45 | 46 | # Install sagepy 47 | poetry install 48 | ``` 49 | 50 | ## Usage 51 | `sagepy` is a python interface to the core SAGE search engine. It exposes 52 | the core functionality of SAGE in a pythonic way, allowing you to use it for a direct integration 53 | into your python-based proteomics workflow. So far, it mainly mirrors structs that are available 54 | in the core SAGE library. 
55 | 56 | ### Example generation of a sage database 57 | ```python 58 | import numpy as np 59 | from sagepy.core import EnzymeBuilder, SageSearchConfiguration 60 | 61 | # configure a trypsin-like digestor of fasta files 62 | enzyme_builder = EnzymeBuilder( 63 | missed_cleavages=2, 64 | min_len=5, 65 | max_len=50, 66 | cleave_at='KR', 67 | restrict='P', 68 | c_terminal=True, 69 | ) 70 | 71 | # UPDATE: Modification handling is simplified, using canonical UNIMOD notation 72 | static_mods = {"C": "[UNIMOD:4]"} # static cysteine modification 73 | variable_mods = {"M": ["[UNIMOD:35]"]} 74 | 75 | with open('path/to/reference.fasta', 'r') as infile: 76 | fasta = infile.read() 77 | 78 | # set-up a config for a sage-database 79 | sage_config = SageSearchConfiguration( 80 | fasta=fasta, 81 | static_mods=static_mods, 82 | variable_mods=variable_mods, 83 | enzyme_builder=enzyme_builder, 84 | generate_decoys=True, 85 | bucket_size=int(np.power(2, 14)) 86 | ) 87 | 88 | # generate the database for searching against 89 | indexed_db = sage_config.generate_indexed_database() 90 | ``` 91 | 92 | ### Generate a query 93 | ```python 94 | import numpy as np 95 | from sagepy.core import Precursor, RawSpectrum, ProcessedSpectrum, SpectrumProcessor, Tolerance, Scorer, Representation 96 | 97 | ### Example search of a sage database 98 | precursor = Precursor( 99 | charge=2, 100 | mz=506.77, 101 | ) 102 | 103 | intensity = np.array([ 202., 170., 205., 152., 1069., 595., 198., 805., 187., 104 | 194., 197., 169., 196., 209., 638., 372., 235., 399., 105 | 194., 185., 181., 170., 407., 150., 157., 175., 273., 106 | 1135., 881., 337., 311., 243., 310., 153., 162., 210., 107 | 277., 206., 189., 259., 658., 383., 166., 169., 219., 108 | 186., 221., 193., 367., 283., 237., 157., 372., 1276., 109 | 1618., 1102., 404., 232., 456., 765., 507., 223., 258., 110 | 402., 187., 158., 153., 304., 218., 223., 156., 1605., 111 | 1165., 1062., 434., 208., 155., 197., 221., 697., 397., 112 | 180., 195., 512., 252., 
367., 305., 335., 175., 174., 113 | 296., 212.], dtype=np.float32) 114 | 115 | mz = np.array([272.16873692, 356.16844797, 406.71079396, 406.71396814, 116 | 406.71714233, 406.72031653, 407.21246768, 407.21564382, 117 | 407.21881996, 407.22199612, 407.7144506 , 407.71762869, 118 | 488.27537883, 488.28581266, 499.29228981, 499.29580676, 119 | 499.29932372, 499.30284069, 506.75478369, 507.26157767, 120 | 541.26272227, 553.29188809, 577.30432041, 577.30810217, 121 | 595.32672633, 597.2907525 , 603.27568881, 614.32036769, 122 | 614.32426881, 614.32816995, 615.3272682 , 615.33117252, 123 | 616.33108578, 617.33572156, 636.30924838, 637.30619081, 124 | 637.31016425, 665.36284673, 666.36197292, 674.35335834, 125 | 674.35744565, 674.36153297, 675.35511968, 675.36330039, 126 | 679.3531909 , 680.35044702, 680.35455247, 687.36822726, 127 | 687.37648041, 688.37547678, 697.3616813 , 700.3617026 , 128 | 715.36157366, 715.36578342, 715.36999319, 715.37420297, 129 | 715.37841277, 715.38262258, 716.36384605, 716.37227148, 130 | 716.38069696, 717.37103577, 725.35228543, 749.39291293, 131 | 749.39722166, 750.38424802, 786.44692356, 786.45575152, 132 | 787.4492132 , 787.45804678, 795.39284711, 812.41777208, 133 | 812.42225834, 812.42674462, 812.4312309 , 812.44020351, 134 | 813.40504794, 813.41851494, 813.42300396, 813.427493 , 135 | 813.43198205, 813.44544927, 814.43784098, 828.42202737, 136 | 828.4265576 , 851.43464868, 899.45327427, 899.46271517, 137 | 912.45278821, 913.44673363, 915.45053417, 915.46482091], dtype=np.float32) 138 | 139 | raw_spectrum = RawSpectrum( 140 | file_id=1, 141 | spec_id='DEMO-SPEC', 142 | total_ion_current=12667.0, 143 | precursors=[precursor], 144 | mz=mz, 145 | intensity=intensity 146 | ) 147 | 148 | spec_processor = SpectrumProcessor(take_top_n=75) 149 | query = spec_processor.process(raw_spectrum) 150 | ``` 151 | 152 | ### Search a database 153 | ```python 154 | from sagepy.core import Scorer 155 | 156 | # UPDATE: pass modifications to the scorer, 
necessary for PTM handling 157 | scorer = Scorer(report_psms=2, min_matched_peaks=5, variable_mods=variable_mods, static_mods=static_mods) 158 | results = scorer.score(db=indexed_db, spectrum=query) 159 | ``` 160 | 161 | potential output: 162 | ``` 163 | [Feature(idx: PeptideIx(1009105), peptide_len: 9, spec_id: DEMO-SPEC, file_id: 1, rank: 1, label: 1, exp. mass: 1011.5254516601562, cal. mass: 1011.5347900390625, charge: 2, retention time: 0.0, aligned rt: 0.0, predicted rt: 0.0, delta rt model: 0.9990000128746033, delta mass: 2989.41943359375, isotope error: 3.010050058364868, average ppm: 5.889466285705566, hyperscore: 15.020833459653923, delta_next: 0.0, delta_best: 0.0, matched peaks: 5, longest b: 0,longest y: 4, longest y pct: 0.4444444477558136, missed cleavages: 0, matched intensity pct: 14.81151294708252, scored candidates: 9340, poisson: -2.177020383746938, discriminant score: 0.0, posterior error: 1.0, spectrum q: 1.0, peptide q: 1.0, protein q: 1.0, ms2 intensity: 4652.0, ms1 intensity: 0.0), Feature(idx: PeptideIx(1009105), peptide_len: 9, spec_id: DEMO-SPEC, file_id: 1, rank: 2, label: 1, exp. mass: 1011.5254516601562, cal. 
import os
from typing import List, Union, Dict, Any
import pandas as pd

from imspy.timstof.dda import TimsDatasetDDA

from imspy.timstof.dbsearch.utility import (
    get_ms1_ims_spectrum,
    sanitize_mz,
    sanitize_charge,
    get_searchable_spec
)

from sagepy.core import (
    SpectrumProcessor,
    Precursor,
    Tolerance
)

def process_timstof_datasets(
        dataset_dirs: Union[str, List[str]],
        use_bruker_sdk: bool = False,
        max_peaks: int = 10_000,
        num_threads: int = 16,
        ms1_take_top_n: int = 10_000,
        ms1_deisotope: bool = True,
        fragment_take_top_n: int = 150
) -> Dict[str, Dict[str, Any]]:
    """
    Process one or more Bruker TIMS .d folders to extract summarized fragment DataFrames
    and MS1 spectra suitable for downstream search.

    Parameters
    ----------
    dataset_dirs : str or list of str
        A path or list of paths to the .d folders to process.
    use_bruker_sdk : bool, default False
        Whether to use the Bruker SDK when reading data.
    max_peaks : int, default 10000
        Maximum number of peaks to collect per precursor frame.
    num_threads : int, default 16
        Number of threads for parallel data extraction.
    ms1_take_top_n : int, default 10000
        Number of top peaks to keep when processing MS1 spectra.
    ms1_deisotope : bool, default True
        Whether to deisotope MS1 spectra.
        NOTE(review): this parameter is currently not used anywhere in the
        function body — confirm whether it should be forwarded to
        get_ms1_ims_spectrum or the SpectrumProcessor.
    fragment_take_top_n : int, default 150
        Number of top peaks to keep when processing fragment spectra.

    Returns
    -------
    results : dict
        A mapping from each dataset directory to a dict with keys:
        'fragments' : pd.DataFrame of summarized fragment ions,
        'ms1_spectra' : List of processed MS1 spectra objects.
    """
    # accept a single directory as a convenience; normalize to a list
    if isinstance(dataset_dirs, str):
        dataset_dirs = [dataset_dirs]

    results: Dict[str, Dict[str, Any]] = {}
    # file_id (the enumeration index) is baked into every spectrum so spectra
    # from different datasets stay distinguishable downstream
    for file_id, dataset_dir in enumerate(dataset_dirs):
        ds_name = os.path.basename(dataset_dir.rstrip(os.sep))
        handle = TimsDatasetDDA(dataset_dir, use_bruker_sdk=use_bruker_sdk)

        # Extract precursor frames and process MS1 spectra
        precursor_frames = handle.get_precursor_frames(
            max_peaks=max_peaks,
            num_threads=num_threads
        )
        ms1_processor = SpectrumProcessor(
            take_top_n=ms1_take_top_n,
        )
        # retention_time is divided by 60 — presumably seconds to minutes;
        # TODO(review): confirm the unit expected by get_ms1_ims_spectrum
        ms1_spectra = [
            get_ms1_ims_spectrum(
                raw_spectrum=spec,
                spec_id=f"{spec.frame_id}-{ds_name}",
                time=spec.retention_time / 60,
                spec_processor=ms1_processor,
                file_id=file_id
            ) for spec in precursor_frames
        ]

        # free the raw frames as early as possible; they can be large
        del precursor_frames

        # Extract and summarize PASEF fragments: collapse the per-scan rows of
        # each precursor into one row, concatenating raw data ('sum') and
        # taking the first value of every scalar column
        fragments = handle.get_pasef_fragments(num_threads=num_threads)
        fragments = fragments.groupby('precursor_id').agg({
            'frame_id': 'first',
            'time': 'first',
            'precursor_id': 'first',
            'raw_data': 'sum',
            'scan_begin': 'first',
            'scan_end': 'first',
            'isolation_mz': 'first',
            'isolation_width': 'first',
            'collision_energy': 'first',
            'largest_peak_mz': 'first',
            'average_mz': 'first',
            'monoisotopic_mz': 'first',
            'charge': 'first',
            'average_scan': 'first',
            'intensity': 'first',
            'parent_id': 'first',
        })

        # Compute marginal ion mobility and build spec IDs
        fragments['mobility'] = fragments['raw_data'].apply(
            lambda rd: rd.get_inverse_mobility_along_scan_marginal()
        )
        fragments['spec_id'] = fragments.apply(
            lambda r: f"{r.frame_id}-{r.precursor_id}-{ds_name}",
            axis=1
        )

        # Build Precursor objects (monoisotopic m/z falls back to the largest
        # peak via sanitize_mz; charge is sanitized against missing values)
        fragments['sage_precursor'] = fragments.apply(
            lambda r: Precursor(
                mz=sanitize_mz(r['monoisotopic_mz'], r['largest_peak_mz']),
                intensity=r['intensity'],
                charge=sanitize_charge(r['charge']),
                isolation_window=Tolerance(da=(-3, 3)),
                collision_energy=r['collision_energy'],
                inverse_ion_mobility=r['mobility'],
                spectrum_ref=r['spec_id']
            ),
            axis=1
        )

        # Process fragment spectra for searching
        fragments['processed_spec'] = fragments.apply(
            lambda r: get_searchable_spec(
                precursor=r['sage_precursor'],
                raw_fragment_data=r['raw_data'],
                spec_processor=SpectrumProcessor(take_top_n=fragment_take_top_n),
                spec_id=r['spec_id'],
                time=r['time'],
                file_id=file_id
            ),
            axis=1
        )

        results[dataset_dir] = {
            'fragments': fragments,
            'ms1_spectra': ms1_spectra
        }

    return results
SageSearchConfiguration\n", 12 | "\n", 13 | "# configure a trypsin-like digestor of fasta files\n", 14 | "enzyme_builder = EnzymeBuilder(\n", 15 | " missed_cleavages=2, \n", 16 | " min_len=5, \n", 17 | " max_len=50, \n", 18 | " cleave_at='KR', \n", 19 | " restrict='P', \n", 20 | " c_terminal=True,\n", 21 | ")\n", 22 | "\n", 23 | "# UPDATE: Modification handling is simplified, using canonical UNIMOD notation\n", 24 | "static_mods = {\"C\": \"[UNIMOD:4]\"} # static cysteine modification\n", 25 | "variable_mods = {\"M\": [\"[UNIMOD:35]\"]}\n", 26 | "\n", 27 | "with open('/media/hd02/data/fasta/hela/plain/uniprotkb_proteome_UP000005640_AND_revi_2024_05_21.fasta', 'r') as infile:\n", 28 | " fasta = infile.read()\n", 29 | "\n", 30 | "# set-up a config for a sage-database\n", 31 | "sage_config = SageSearchConfiguration(\n", 32 | " fasta=fasta,\n", 33 | " static_mods=static_mods,\n", 34 | " variable_mods=variable_mods,\n", 35 | " enzyme_builder=enzyme_builder,\n", 36 | " generate_decoys=True,\n", 37 | " bucket_size=int(np.power(2, 14))\n", 38 | ")\n", 39 | "\n", 40 | "# generate the database for searching against\n", 41 | "indexed_db = sage_config.generate_indexed_database()" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "id": "7003b8b7-1342-44be-ab0a-4a5e9131388f", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "import numpy as np\n", 52 | "from sagepy.core import Precursor, RawSpectrum, ProcessedSpectrum, SpectrumProcessor, Tolerance, Scorer, Representation\n", 53 | "\n", 54 | "### Example search of a sage database\n", 55 | "precursor = Precursor(\n", 56 | " charge=2,\n", 57 | " mz=506.77,\n", 58 | ")\n", 59 | "\n", 60 | "intensity = np.array([ 202., 170., 205., 152., 1069., 595., 198., 805., 187.,\n", 61 | " 194., 197., 169., 196., 209., 638., 372., 235., 399.,\n", 62 | " 194., 185., 181., 170., 407., 150., 157., 175., 273.,\n", 63 | " 1135., 881., 337., 311., 243., 310., 153., 162., 210.,\n", 64 | " 277., 206., 189., 
259., 658., 383., 166., 169., 219.,\n", 65 | " 186., 221., 193., 367., 283., 237., 157., 372., 1276.,\n", 66 | " 1618., 1102., 404., 232., 456., 765., 507., 223., 258.,\n", 67 | " 402., 187., 158., 153., 304., 218., 223., 156., 1605.,\n", 68 | " 1165., 1062., 434., 208., 155., 197., 221., 697., 397.,\n", 69 | " 180., 195., 512., 252., 367., 305., 335., 175., 174.,\n", 70 | " 296., 212.], dtype=np.float32)\n", 71 | "\n", 72 | "mz = np.array([272.16873692, 356.16844797, 406.71079396, 406.71396814,\n", 73 | " 406.71714233, 406.72031653, 407.21246768, 407.21564382,\n", 74 | " 407.21881996, 407.22199612, 407.7144506 , 407.71762869,\n", 75 | " 488.27537883, 488.28581266, 499.29228981, 499.29580676,\n", 76 | " 499.29932372, 499.30284069, 506.75478369, 507.26157767,\n", 77 | " 541.26272227, 553.29188809, 577.30432041, 577.30810217,\n", 78 | " 595.32672633, 597.2907525 , 603.27568881, 614.32036769,\n", 79 | " 614.32426881, 614.32816995, 615.3272682 , 615.33117252,\n", 80 | " 616.33108578, 617.33572156, 636.30924838, 637.30619081,\n", 81 | " 637.31016425, 665.36284673, 666.36197292, 674.35335834,\n", 82 | " 674.35744565, 674.36153297, 675.35511968, 675.36330039,\n", 83 | " 679.3531909 , 680.35044702, 680.35455247, 687.36822726,\n", 84 | " 687.37648041, 688.37547678, 697.3616813 , 700.3617026 ,\n", 85 | " 715.36157366, 715.36578342, 715.36999319, 715.37420297,\n", 86 | " 715.37841277, 715.38262258, 716.36384605, 716.37227148,\n", 87 | " 716.38069696, 717.37103577, 725.35228543, 749.39291293,\n", 88 | " 749.39722166, 750.38424802, 786.44692356, 786.45575152,\n", 89 | " 787.4492132 , 787.45804678, 795.39284711, 812.41777208,\n", 90 | " 812.42225834, 812.42674462, 812.4312309 , 812.44020351,\n", 91 | " 813.40504794, 813.41851494, 813.42300396, 813.427493 ,\n", 92 | " 813.43198205, 813.44544927, 814.43784098, 828.42202737,\n", 93 | " 828.4265576 , 851.43464868, 899.45327427, 899.46271517,\n", 94 | " 912.45278821, 913.44673363, 915.45053417, 915.46482091], dtype=np.float32)\n", 95 
| "\n", 96 | "raw_spectrum = RawSpectrum(\n", 97 | " file_id=1,\n", 98 | " spec_id='DEMO-SPEC',\n", 99 | " total_ion_current=12667.0,\n", 100 | " precursors=[precursor],\n", 101 | " mz=mz,\n", 102 | " intensity=intensity,\n", 103 | ")\n", 104 | "\n", 105 | "spec_processor = SpectrumProcessor(take_top_n=75)\n", 106 | "query = spec_processor.process(raw_spectrum)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 3, 112 | "id": "f340808b-6126-4f91-a39a-654b63acfb21", 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "from sagepy.core import Scorer\n", 117 | "\n", 118 | "scorer = Scorer(report_psms=2, min_matched_peaks=5, variable_mods=variable_mods, static_mods=static_mods)\n", 119 | "results = scorer.score(db=indexed_db, spectrum=query)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 5, 125 | "id": "8b468c19-c1e7-43a1-9797-a336646d3acd", 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "[Feature(idx: PeptideIx(1014528), psm_id: 3, peptide_len: 9, spec_id: DEMO-SPEC, file_id: 1, rank: 1, label: 1, exp. mass: 1011.5254516601562, cal. 
mass: 1011.5347900390625, charge: 2, retention time: 0.0, aligned rt: 0.0, predicted rt: 0.0, delta rt model: 0.9990000128746033, delta mass: -1995.037109375, isotope error: 2.006700038909912, average ppm: 5.889466285705566, hyperscore: 15.020833685116404, delta_next: 0.0, delta_best: 0.0, matched peaks: 5, longest b: 0,longest y: 4, longest y pct: 0.4444444477558136, missed cleavages: 0, matched intensity pct: 14.81151294708252, scored candidates: 9359, poisson: -2.1735888459925277, discriminant score: 0.0, posterior error: 1.0, spectrum q: 1.0, peptide q: 1.0, protein q: 1.0, ms2 intensity: 4652.0), fragments: Fragments(charges: [1, 1, 1, 1, 1], ion_types: [IonType(Y), IonType(Y), IonType(Y), IonType(Y), IonType(Y)], fragment_ordinals: [8, 7, 6, 5, 4], intensities: [335.0, 1165.0, 1618.0, 1135.0, 399.0], mz_calculated: [899.4580078125, 812.4259643554688, 715.3732299804688, 614.3255615234375, 499.2986145019531], mz_experimental: [899.4627075195312, 812.4222412109375, 715.3699951171875, 614.3203735351562, 499.3028259277344])),\n", 132 | " Feature(idx: PeptideIx(1014528), psm_id: 4, peptide_len: 9, spec_id: DEMO-SPEC, file_id: 1, rank: 2, label: 1, exp. mass: 1011.5254516601562, cal. 
mass: 1011.5347900390625, charge: 2, retention time: 0.0, aligned rt: 0.0, predicted rt: 0.0, delta rt model: 0.9990000128746033, delta mass: -1001.641845703125, isotope error: 1.003350019454956, average ppm: 5.889466285705566, hyperscore: 15.020833685116404, delta_next: 0.0, delta_best: 0.0, matched peaks: 5, longest b: 0,longest y: 4, longest y pct: 0.4444444477558136, missed cleavages: 0, matched intensity pct: 14.81151294708252, scored candidates: 9359, poisson: -2.1735888459925277, discriminant score: 0.0, posterior error: 1.0, spectrum q: 1.0, peptide q: 1.0, protein q: 1.0, ms2 intensity: 4652.0), fragments: Fragments(charges: [1, 1, 1, 1, 1], ion_types: [IonType(Y), IonType(Y), IonType(Y), IonType(Y), IonType(Y)], fragment_ordinals: [8, 7, 6, 5, 4], intensities: [335.0, 1165.0, 1618.0, 1135.0, 399.0], mz_calculated: [899.4580078125, 812.4259643554688, 715.3732299804688, 614.3255615234375, 499.2986145019531], mz_experimental: [899.4627075195312, 812.4222412109375, 715.3699951171875, 614.3203735351562, 499.3028259277344]))]" 133 | ] 134 | }, 135 | "execution_count": 5, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "result_psm = scorer.score(db=indexed_db,spectrum=query)\n", 142 | "result_psm" 143 | ] 144 | } 145 | ], 146 | "metadata": { 147 | "kernelspec": { 148 | "display_name": "Python 3 (ipykernel)", 149 | "language": "python", 150 | "name": "python3" 151 | }, 152 | "language_info": { 153 | "codemirror_mode": { 154 | "name": "ipython", 155 | "version": 3 156 | }, 157 | "file_extension": ".py", 158 | "mimetype": "text/x-python", 159 | "name": "python", 160 | "nbconvert_exporter": "python", 161 | "pygments_lexer": "ipython3", 162 | "version": "3.11.11" 163 | } 164 | }, 165 | "nbformat": 4, 166 | "nbformat_minor": 5 167 | } 168 | -------------------------------------------------------------------------------- /sagepy/examples/rescoring/data/psm_data.csv: 
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1f9d469ae7fa55a195fde5ca5dbb5c270f6b1f2d58841c0d88778ea8e7b128d1 3 | size 98739151 4 | -------------------------------------------------------------------------------- /sagepy/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sagepy" 3 | version = "0.3.12" 4 | description = "" 5 | authors = ["theGreatHerrLebert "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = ">=3.10" 10 | pyteomics = ">=4.7.3" 11 | mokapot = ">=0.10.0" 12 | sagepy-connector = ">=0.3.12" 13 | numpy = "==1.26.4" 14 | pandas = ">=1.3.3" 15 | numba = ">=0.59.1" 16 | scikit-learn = ">=1.6.1" 17 | tqdm = ">=4.62.3" 18 | lxml = ">=5.3.0" 19 | 20 | [build-system] 21 | requires = ["poetry-core"] 22 | build-backend = "poetry.core.masonry.api" 23 | -------------------------------------------------------------------------------- /sagepy/sagepy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theGreatHerrLebert/sagepy/4382f7b24dd4e35bfa256e8c8a32ead610782e99/sagepy/sagepy/__init__.py -------------------------------------------------------------------------------- /sagepy/sagepy/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .scoring import Scorer, Fragments, IonType, Psm, Feature 2 | from .database import IndexedDatabase, EnzymeBuilder, SageSearchConfiguration 3 | from .spectrum import RawSpectrum, ProcessedSpectrum, ProcessedIMSpectrum, Precursor, SpectrumProcessor, Representation 4 | from .mass import Tolerance 5 | from .ion_series import IonSeries 6 | from .peptide import Peptide 7 | from .modification import SAGE_KNOWN_MODS, validate_mods, validate_var_mods 8 | 
-------------------------------------------------------------------------------- /sagepy/sagepy/core/enzyme.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import numpy as np 4 | import sagepy_connector 5 | psc = sagepy_connector.py_enzyme 6 | 7 | 8 | class Position: 9 | def __init__(self, position: Union[str, None] = None): 10 | """Position class representing location of peptide in protein 11 | 12 | Args: 13 | position (Union[str, None], optional): The position of the peptide in the protein. Defaults to None. 14 | """ 15 | if position is not None: 16 | try: 17 | self.__position_ptr = psc.PyPosition.from_string(position) 18 | except ValueError: 19 | raise ValueError("Invalid position string, allowed values are: c_term, n_term, full, internal") 20 | else: 21 | self.__position_ptr = psc.PyPosition.from_string('internal') 22 | 23 | @classmethod 24 | def from_py_position(cls, position: psc.PyPosition): 25 | instance = cls.__new__(cls) 26 | instance.__position_ptr = position 27 | return instance 28 | 29 | @property 30 | def position(self): 31 | return self.__position_ptr.to_string 32 | 33 | def __repr__(self): 34 | return f"Position({self.position})" 35 | 36 | def get_py_ptr(self): 37 | return self.__position_ptr 38 | 39 | 40 | class Enzyme: 41 | def __init__(self, cleave_pattern: str = 'KR', c_terminal: bool = True, skip_suffix: str = 'P', semi_enzymatic: bool = False): 42 | """Enzyme class, default enzyme is Trypsin 43 | 44 | Args: 45 | cleave_pattern (str, optional): The cleavage pattern of the enzyme. Defaults to 'KR'. 46 | c_terminal (bool, optional): Cleave from the C-terminal. Defaults to True. 47 | skip_suffix (str, optional): The skip suffix of the enzyme. Defaults to 'P'. 48 | semi_enzymatic (bool, optional): Is the enzyme semi enzymatic. Defaults to False. 
class Digest:
    """A single peptide produced by an in-silico enzymatic digest."""

    def __init__(self, decoy: bool, sequence: str, protein: str, missed_cleavages: int, position: Position):
        """Create a Digest.

        Args:
            decoy (bool): Is the digest peptide a decoy
            sequence (str): The sequence of the digest peptide
            protein (str): The protein that the digest peptide is found in
            missed_cleavages (int): The number of missed cleavages
            position (Position): The position of the digest peptide in the protein
        """
        self.__digest_ptr = psc.PyDigest(decoy, sequence, protein, missed_cleavages, position.get_py_ptr())

    @classmethod
    def from_py_digest(cls, digest: psc.PyDigest):
        """Wrap an existing PyDigest without copying."""
        instance = cls.__new__(cls)
        instance.__digest_ptr = digest
        return instance

    @property
    def decoy(self):
        return self.__digest_ptr.decoy

    @property
    def sequence(self):
        return self.__digest_ptr.sequence

    @property
    def protein(self):
        return self.__digest_ptr.protein

    @property
    def missed_cleavages(self):
        return self.__digest_ptr.missed_cleavages

    @property
    def position(self):
        # returns the raw Rust-side position object (not wrapped in Position)
        return self.__digest_ptr.position

    def reverse(self):
        # delegates to PyDigest.reverse(); the result is returned unwrapped
        return self.__digest_ptr.reverse()

    def __eq__(self, other):
        # NOTE(review): assumes `other` is also a Digest; any other type
        # raises AttributeError (original behavior, kept as-is)
        return self.__digest_ptr == other.__digest_ptr

    def __hash__(self):
        return hash(self.__digest_ptr)

    def __repr__(self):
        return f"Digest(decoy: {self.decoy}, protein: {self.protein}, " \
               f"missed_cleavages: {self.missed_cleavages}, position: {self.position}, sequence: {self.sequence})"

    def get_py_ptr(self):
        """Return the underlying PyDigest pointer object."""
        return self.__digest_ptr
class Fasta:
    """Wrapper around a parsed FASTA protein database."""

    def __init__(self, fasta_str: str, decoy_tag: str = 'decoy_', generate_decoys: bool = False):
        """Parse a FASTA string.

        Args:
            fasta_str (str): the raw FASTA file content
            decoy_tag (str, optional): prefix used to mark decoy proteins. Defaults to 'decoy_'.
            generate_decoys (bool, optional): whether decoys should be generated. Defaults to False.
        """
        self.__fasta_ptr = psc.PyFasta.parse(fasta_str, decoy_tag, generate_decoys)

    @classmethod
    def from_py_fasta(cls, fasta: psc.PyFasta):
        """Wrap an existing PyFasta without re-parsing."""
        instance = cls.__new__(cls)
        instance.__fasta_ptr = fasta
        return instance

    def _digest(self, enzyme_parameters: EnzymeParameters):
        """Digest every protein with the given enzyme parameters, wrapping each result."""
        digests = self.__fasta_ptr.digest(enzyme_parameters.get_py_ptr())
        return [Digest.from_py_digest(d) for d in digests]
def sage_fdr(feature_collection: List[Feature], indexed_db: IndexedDatabase, use_hyper_score: bool = True):
    """ Perform SAGE FDR on all levels (spectrum, peptide, protein), calculates q-values and PEPs for a given feature collection.
    Args:
        feature_collection: a list of features
        indexed_db: an indexed database
        use_hyper_score: whether to use hyper score or discriminant score for q-value calculation
    """
    psc.py_sage_fdr(
        [feature.get_py_ptr() for feature in feature_collection],
        indexed_db.get_py_ptr(),
        use_hyper_score
    )

def sage_fdr_psm(psm_collection: Union[List[Psm], Dict[str, List[Psm]]], indexed_db: IndexedDatabase, use_hyper_score: bool = True):
    """ Perform SAGE FDR on all levels (spectrum, peptide, protein), calculates q-values and PEPs for a given PSM collection.
    Args:
        psm_collection: a list of PSMs, or a dict mapping keys (e.g. file names) to lists of PSMs
        indexed_db: an indexed database
        use_hyper_score: whether to use hyper score or discriminant score for q-value calculation
    """
    # flatten a dict of PSM lists into a single list
    if isinstance(psm_collection, dict):
        psms = [psm for values in psm_collection.values() for psm in values]
    else:
        psms = psm_collection

    psc.py_sage_fdr_psm(
        [psm.get_py_ptr() for psm in psms],
        indexed_db.get_py_ptr(),
        use_hyper_score
    )
class IonSeries:
    """Series of fragment ions (e.g. the b- or y-series) for one peptide.

    Args:
        peptide (Peptide): The peptide
        ion_type (IonType): The ion type, e.g. b, y
    """
    def __init__(self, peptide: Peptide, ion_type: IonType):
        self.__ion_series_ptr = psc.PyIonSeries(peptide.get_py_ptr(), ion_type.get_py_ptr())

    @classmethod
    def from_py_ion_series(cls, ion_series: psc.PyIonSeries):
        """Wrap an existing PyIonSeries without rebuilding it."""
        instance = cls.__new__(cls)
        instance.__ion_series_ptr = ion_series
        return instance

    @property
    def ion_type(self):
        return IonType.from_py_kind(self.__ion_series_ptr.kind)

    @property
    def cumulative_mass(self):
        return self.__ion_series_ptr.cumulative_mass

    @property
    def peptide(self):
        return Peptide.from_py_peptide(self.__ion_series_ptr.peptide)

    def __repr__(self):
        return f"IonSeries({self.ion_type}, {self.cumulative_mass}, {self.peptide})"

    def get_py_ptr(self):
        """Return the underlying PyIonSeries pointer object."""
        return self.__ion_series_ptr

    def get_ion_series(self):
        """Return the series as a list of wrapped Ion objects."""
        series = self.__ion_series_ptr.get_ion_series()
        return [Ion.from_py_ion(ion) for ion in series]
class Tolerance:
    """Mass tolerance window, specified either in Dalton or in ppm (exactly one)."""

    def __init__(self, da: (float, float) = None, ppm: (float, float) = None):
        """Create a tolerance window.

        Args:
            da ((float, float), optional): tolerance bounds in Da. Defaults to None.
            ppm ((float, float), optional): tolerance bounds in ppm. Defaults to None.

        Raises:
            ValueError: if both or neither of da and ppm are given.
        """
        if da is not None and ppm is not None:
            raise ValueError("Only one of da or ppm can be set")
        if da is None and ppm is None:
            raise ValueError("One of da or ppm must be set")
        self.__tolerance_ptr = psc.PyTolerance(da, ppm)

    def get_py_ptr(self):
        """Return the underlying PyTolerance pointer object."""
        return self.__tolerance_ptr

    @classmethod
    def from_py_tolerance(cls, tolerance: psc.PyTolerance) -> 'Tolerance':
        """Wrap an existing PyTolerance without re-validating."""
        instance = cls.__new__(cls)
        instance.__tolerance_ptr = tolerance
        return instance

    @property
    def da(self) -> (float, float):
        return self.__tolerance_ptr.da

    @property
    def ppm(self) -> (float, float):
        return self.__tolerance_ptr.ppm

    def bounds(self, center: float) -> (float, float):
        """Absolute bounds of the window around `center`."""
        return self.__tolerance_ptr.bounds(center)

    def contains(self, center: float, target: float) -> bool:
        """Whether `target` falls inside the window centered at `center`."""
        return self.__tolerance_ptr.contains(center, target)

    def __repr__(self) -> str:
        return f"Tolerance(da={self.da})" if self.da is not None else f"Tolerance(ppm={self.ppm})"

    def __mul__(self, other) -> 'Tolerance':
        # scaling is delegated to the Rust-side PyTolerance.__mul__
        if not isinstance(other, (float, int)):
            raise ValueError("Tolerance can only be multiplied by a float or an int")
        return Tolerance.from_py_tolerance(self.__tolerance_ptr * float(other))
def predict_sage_im(
        psm_collection: Union[List[Psm], Dict[str, List[Psm]]],
        indexed_db: IndexedDatabase) -> None:
    """ Predict ion mobility using SAGE IM model.

    Returns None; the Rust side receives the PSM pointers, so predictions are
    presumably written back onto the PSMs in place — confirm against py_predict_im.

    Args:
        psm_collection: a list of PSMs, or a dict mapping keys (e.g. file names)
            to lists of PSMs
        indexed_db: an indexed database
    """
    # flatten a dict of PSM lists into a single list
    if isinstance(psm_collection, dict):
        psms = [psm for values in psm_collection.values() for psm in values]
    else:
        psms = psm_collection

    psc.py_predict_im(
        [psm.get_py_ptr() for psm in psms],
        indexed_db.get_py_ptr(),
    )
@njit
def calculate_pep_single(
    scores: NDArray,
    decoys: NDArray,
    bins: int = 1000,
    bw_adjust: float = 1.0,
    monotonic: bool = True
) -> Tuple[NDArray, float, float]:
    """Calculate the PEP using KDE and binning with linear interpolation.

    Args:
        scores: numpy array of scores
        decoys: numpy array of boolean values indicating decoys
        bins: number of bins for the PEP calculation
        bw_adjust: bandwidth adjustment factor
        monotonic: whether to enforce monotonicity

    Returns:
        Tuple[NDArray, float, float]: PEP values, minimum score, score step
    """
    # split scores into decoy and target populations
    d = scores[decoys]
    t = scores[~decoys]

    # pi: fraction of decoys among all scores (decoy prior)
    pi = len(d) / len(scores)
    sigma_d = std(d)
    sigma_t = std(t)

    # Silverman's rule-of-thumb bandwidth, sigma * (4 / (3 n))^(1/5), per population
    bandwidth_d = bw_adjust * sigma_d * (4. / 3. / len(d)) ** 0.2
    bandwidth_t = bw_adjust * sigma_t * (4. / 3. / len(t)) ** 0.2

    # bins - 1 steps span [min_score, max_score] so the last bin sits exactly at max_score
    min_score = np.min(scores)
    max_score = np.max(scores)
    score_step = (max_score - min_score) / (bins - 1)

    pep_bins = np.zeros(bins)

    # PEP(s) = pi * f_decoy(s) / (pi * f_decoy(s) + (1 - pi) * f_target(s))
    for bin_idx in range(bins):
        score = min_score + bin_idx * score_step
        decoy_pdf = kde_pdf(d, bandwidth_d, score) * pi
        target_pdf = kde_pdf(t, bandwidth_t, score) * (1.0 - pi)
        pep_bins[bin_idx] = decoy_pdf / (decoy_pdf + target_pdf)

    # enforce that PEP never increases with score by sweeping from the top bin down
    if monotonic:
        for i in range(len(pep_bins) - 2, -1, -1):
            pep_bins[i] = max(pep_bins[i], pep_bins[i + 1])

    return pep_bins, min_score, score_step
@njit
def posterior_error(pep_bins: NDArray,
                    min_score: float,
                    score_step: float,
                    score: float) -> float:
    """Interpolate the PEP for a single score from precomputed bins.

    Args:
        pep_bins: numpy array of binned PEP values
        min_score: score corresponding to the first bin
        score_step: score width of one bin
        score: score for which to calculate the PEP

    Returns:
        float: linearly interpolated PEP value
    """
    bin_lo = int((score - min_score) / score_step)
    # Clamp the index: a score below min_score previously produced a negative
    # index (silent wrap-around to the top of the array), and a score above
    # max_score could index past the end.
    if bin_lo < 0:
        bin_lo = 0
    elif bin_lo > len(pep_bins) - 1:
        bin_lo = len(pep_bins) - 1
    bin_hi = min(bin_lo + 1, len(pep_bins) - 1)

    lower = pep_bins[bin_lo]
    upper = pep_bins[bin_hi]

    # linear interpolation between the two neighboring bins
    bin_lo_score = bin_lo * score_step + min_score
    linear = (score - bin_lo_score) / score_step

    delta = upper - lower
    return lower + (delta * linear)
if __name__ == "__main__":
    # smoke test / demo: 50000 random scores between 0 and 50
    scores = np.random.uniform(0, 50, 50000)
    decoys = np.random.choice([True, False], 50000)

    # sort scores ascending
    scores = np.sort(scores)

    # sort decoys so that True (decoy) entries come first
    decoys = np.sort(decoys)[::-1]

    pep_bins, min_score, score_step = calculate_pep_single(scores, decoys)
    pep = posterior_error(pep_bins, min_score, score_step, scores[0])

    peps = calculate_pep(scores, decoys)

    from matplotlib import pyplot as plt
    plt.plot(scores, peps)
    plt.show()
def global_alignment_psm(psms: Union[Dict[str, List[Psm]], List[Psm]]) -> List[Alignment]:
    """ Perform global alignment on PSMs.
    Args:
        psms: a list of PSMs, or a dict mapping keys (e.g. file names) to lists of PSMs
    Returns:
        List[Alignment]: List of Alignment objects
    """
    # flatten a dict of PSM lists into a single list
    if isinstance(psms, dict):
        flat = []
        for psm_list in psms.values():
            flat.extend(psm_list)
        psms = flat

    # one alignment per distinct file id
    n_files = len({p.sage_feature.file_id for p in psms})

    py_alignments = psc.py_global_alignment_psm([p.get_py_ptr() for p in psms], n_files)
    return [Alignment.from_py_ptr(a) for a in py_alignments]
def process_variable_start_end_mods(variable_modifications):
    """Helper function to process variable modifications for start and end of peptides/proteins
    For some reason, the variable modification wildcards are not processed correctly when passed to SAGE
    This function expands each terminal wildcard into explicit per-amino-acid entries
    Args:
        variable_modifications: The variable modifications

    Returns:
        Dict: The processed variable modifications
    """
    # peptide N-term (^), peptide C-term ($), protein N-term ([), protein C-term (])
    terminal_wildcards = ["^", "$", "[", "]"]
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"

    # expand e.g. {"^": mods} into {"^A": mods, "^C": mods, ...} for every amino acid
    expanded = {
        wildcard + aa: mods
        for wildcard, mods in variable_modifications.items()
        if wildcard in terminal_wildcards
        for aa in amino_acids
    }

    # keep the original entries and add the expanded ones
    return {**variable_modifications, **expanded}
phospho_serine_static(): 50 | return "S", 79.9663 51 | 52 | @staticmethod 53 | def phospho_threonine_static(): 54 | return "T", 79.9663 55 | 56 | @staticmethod 57 | def phospho_tyrosine_static(): 58 | return "Y", 79.9663 59 | 60 | @staticmethod 61 | def phospho_serine_variable(): 62 | return "S", [79.9663] 63 | 64 | @staticmethod 65 | def phospho_threonine_variable(): 66 | return "T", [79.9663] 67 | 68 | @staticmethod 69 | def phospho_tyrosine_variable(): 70 | return "Y", [79.9663] 71 | 72 | @staticmethod 73 | def methionine_variable(): 74 | return "M", [15.9949] 75 | 76 | @staticmethod 77 | def q_variable(): 78 | return "^Q", [-17.026549] 79 | 80 | @staticmethod 81 | def glutamic_acid_n_terminal_variable(): 82 | return "^E", [-18.010565] 83 | 84 | @staticmethod 85 | def peptide_c_terminal_variable(): 86 | return "$", [49.2, 22.9] 87 | 88 | @staticmethod 89 | def protein_n_terminus_variable(): 90 | return "[", [42.0] 91 | 92 | @staticmethod 93 | def protein_c_terminal_variable(): 94 | return "]", [111.0] 95 | 96 | def __repr__(self): 97 | return (f"SAGE_KNOWN_MODS({self.n_terminal_static()}, " 98 | f"{self.lysine_static()}, " 99 | f"{self.cysteine_static()}, " 100 | f"{self.methionine_variable()}, " 101 | f"{self.q_variable()}, " 102 | f"{self.glutamic_acid_n_terminal_variable()}, " 103 | f"{self.peptide_c_terminal_variable()}, " 104 | f"{self.protein_n_terminus_variable()}, " 105 | f"{self.protein_c_terminal_variable()})") 106 | 107 | 108 | # TODO: need to re-implement based on constant modification list 109 | class ModificationSpecificity: 110 | def __init__(self, s: str): 111 | self.__modification_specificity_ptr = psc.PyModificationSpecificity(s) 112 | 113 | @classmethod 114 | def from_py_modification_specificity(cls, specificity: psc.PyModificationSpecificity): 115 | instance = cls.__new__(cls) 116 | instance.__modification_specificity_ptr = specificity 117 | return instance 118 | 119 | def __repr__(self): 120 | return 
def mass_to_mod(mass: float) -> str:
    """ Map a modification mass in Da to its UNIMOD annotation string.

    Args:
        mass: a mass in Da

    Returns:
        a UNIMOD modification annotation, e.g. '[UNIMOD:4]'

    Raises:
        KeyError: if the rounded mass has no known UNIMOD mapping
    """
    rounded_mass = int(np.round(mass))
    # TODO: find a better way to do the map-back
    known_mods = {
        42: '[UNIMOD:1]',
        57: '[UNIMOD:4]',
        80: '[UNIMOD:21]',
        16: '[UNIMOD:35]',
        119: '[UNIMOD:312]',
    }
    if rounded_mass not in known_mods:
        raise KeyError(f"Rounded mass not in dict: {rounded_mass}")
    return known_mods[rounded_mass]
60 | """ 61 | self.__peptide_ptr = psc.PyPeptide(decoy, sequence, modifications, 62 | mono_isotopic, missed_cleavages, position.get_py_ptr(), 63 | proteins, semi_enzymatic, n_term, c_term) 64 | 65 | @classmethod 66 | def from_digest(cls, digest: Digest) -> 'Peptide': 67 | instance = cls.__new__(cls) 68 | instance.__peptide_ptr = psc.PyPeptide.try_new_from_digest(digest.get_py_ptr()) 69 | return instance 70 | 71 | @classmethod 72 | def from_py_peptide(cls, peptide: psc.PyPeptide): 73 | instance = cls.__new__(cls) 74 | instance.__peptide_ptr = peptide 75 | return instance 76 | 77 | @property 78 | def decoy(self): 79 | return self.__peptide_ptr.decoy 80 | 81 | @property 82 | def sequence(self): 83 | return self.__peptide_ptr.sequence 84 | 85 | @property 86 | def modifications(self): 87 | return self.__peptide_ptr.modifications 88 | 89 | @property 90 | def mono_isotopic(self): 91 | return self.__peptide_ptr.monoisotopic 92 | 93 | @property 94 | def missed_cleavages(self): 95 | return self.__peptide_ptr.missed_cleavages 96 | 97 | @property 98 | def position(self): 99 | return Position.from_py_position(self.__peptide_ptr.position) 100 | 101 | @property 102 | def proteins(self): 103 | return self.__peptide_ptr.proteins 104 | 105 | @property 106 | def n_term(self): 107 | return self.__peptide_ptr.n_term 108 | 109 | @property 110 | def c_term(self): 111 | return self.__peptide_ptr.c_term 112 | 113 | @property 114 | def semi_enzymatic(self): 115 | return self.__peptide_ptr.semi_enzymatic 116 | 117 | def reverse(self, keep_ends: Union[bool, None]) -> 'Peptide': 118 | """Reverse the peptide sequence. 119 | 120 | Args: 121 | keep_ends (Union[bool, None]): Whether to keep the N- and C-terminal amino acids in place. 122 | 123 | Returns: 124 | Peptide: The reversed peptide. 125 | """ 126 | return Peptide.from_py_peptide(self.__peptide_ptr.reverse(keep_ends)) 127 | 128 | def shuffle(self, keep_ends: Union[bool, None]) -> 'Peptide': 129 | """Shuffle the peptide sequence. 
130 | 131 | Args: 132 | keep_ends (Union[bool, None]): Whether to keep the N- and C-terminal amino acids in place. 133 | 134 | Returns: 135 | Peptide: The shuffled peptide. 136 | """ 137 | return Peptide.from_py_peptide(self.__peptide_ptr.shuffle(keep_ends)) 138 | 139 | def get_py_ptr(self): 140 | return self.__peptide_ptr 141 | 142 | def __repr__(self): 143 | return f"Peptide(decoy: {self.decoy}, sequence: {self.sequence}, " \ 144 | f"modifications: {self.modifications}, mono_isotopic: {self.mono_isotopic}, " \ 145 | f"missed_cleavages: {self.missed_cleavages}, position: {self.position}, " \ 146 | f"proteins: {self.proteins}, semi_enzymatic: {self.semi_enzymatic}, n_term: {self.n_term}, " \ 147 | f"c_term: {self.c_term})" 148 | 149 | def to_unimod_sequence(self) -> str: 150 | """ Get Peptide sequence with UNIMOD modification annotations. 151 | 152 | Returns: 153 | str: Peptide sequence with UNIMOD modification annotations. 154 | """ 155 | 156 | mods = self.modifications 157 | sequence = self.sequence 158 | 159 | seq = '' 160 | 161 | for i, (s, m) in enumerate(zip(sequence, mods)): 162 | if m != 0: 163 | # TODO: check if this is the correct way to handle N- and C-terminal mods 164 | if i == 0: 165 | if mass_to_mod(m) == '[UNIMOD:1]': 166 | seq += f'{mass_to_mod(m)}{s}' 167 | else: 168 | seq += f'{s}{mass_to_mod(m)}' 169 | else: 170 | seq += f'{s}{mass_to_mod(m)}' 171 | else: 172 | seq += s 173 | 174 | return seq 175 | -------------------------------------------------------------------------------- /sagepy/sagepy/core/tmt.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List 2 | 3 | import sagepy_connector 4 | 5 | from sagepy.core import ProcessedSpectrum 6 | from sagepy.core.scoring import Feature 7 | from sagepy.core.spectrum import Peak 8 | 9 | psc = sagepy_connector.py_tmt 10 | 11 | 12 | class Isobaric: 13 | def __init__(self, type_name: str): 14 | types = ["tmt6", "tmt10", "tmt11", "tmt16", 
class Purity:
    """Wrapper around a sage ``PyPurity`` precursor-purity result."""

    def __init__(self, ratio: float, correct_precursors: int, incorrect_precursors: int):
        self.__purity_ptr = psc.PyPurity(ratio, correct_precursors, incorrect_precursors)

    @classmethod
    def from_py_purity(cls, purity: psc.PyPurity):
        """Wrap an existing PyPurity pointer without copying."""
        obj = cls.__new__(cls)
        obj.__purity_ptr = purity
        return obj

    @property
    def ratio(self):
        # fraction of precursor intensity attributed to the identified peptide
        return self.__purity_ptr.ratio

    @property
    def correct_precursors(self):
        return self.__purity_ptr.correct_precursors

    @property
    def incorrect_precursors(self):
        return self.__purity_ptr.incorrect_precursors

    def __repr__(self):
        return (f"Purity(ratio={self.ratio}, correct_precursors={self.correct_precursors}, "
                f"incorrect_precursors={self.incorrect_precursors})")

    def get_py_ptr(self):
        """Return the underlying PyPurity pointer."""
        return self.__purity_ptr


class Quant:
    """Wrapper around a sage ``PyQuant`` isobaric quantification result."""

    def __init__(self, hit: Feature, hit_purity: Purity, spectrum: ProcessedSpectrum,
                 chimera: Optional[Feature] = None, chimera_purity: Optional[Purity] = None,
                 intensities: Optional[List[Peak]] = None):
        # unwrap optional wrapper objects down to their raw pointers
        chimera = None if chimera is None else chimera.get_py_ptr()
        chimera_purity = None if chimera_purity is None else chimera_purity.get_py_ptr()
        intensities = None if intensities is None else [peak.get_py_ptr() for peak in intensities]

        self.__quant_ptr = psc.PyQuant(hit.get_py_ptr(), hit_purity.get_py_ptr(),
                                       spectrum.get_py_ptr(), chimera, chimera_purity, intensities)

    @classmethod
    def from_py_quant(cls, quant: psc.PyQuant):
        """Wrap an existing PyQuant pointer without copying."""
        obj = cls.__new__(cls)
        obj.__quant_ptr = quant
        return obj

    @property
    def hit(self):
        return Feature.from_py_feature(self.__quant_ptr.hit())

    @property
    def hit_purity(self):
        return Purity.from_py_purity(self.__quant_ptr.hit_purity())

    @property
    def spectrum(self):
        return ProcessedSpectrum.from_py_processed_spectrum(self.__quant_ptr.spectrum())

    @property
    def chimera(self):
        # None when no chimeric match was recorded
        ptr = self.__quant_ptr.chimera()
        return None if ptr is None else Feature.from_py_feature(ptr)

    @property
    def chimera_purity(self):
        ptr = self.__quant_ptr.chimera_purity()
        return None if ptr is None else Purity.from_py_purity(ptr)

    @property
    def intensities(self):
        raw = self.__quant_ptr.intensities()
        return None if raw is None else [Peak.from_py_peak(peak) for peak in raw]

    def __repr__(self):
        return (f"Quant(hit={self.hit}, hit_purity={self.hit_purity}, spectrum={self.spectrum}, "
                f"chimera={self.chimera}, chimera_purity={self.chimera_purity}, intensities={self.intensities})")

    def get_py_ptr(self):
        """Return the underlying PyQuant pointer."""
        return self.__quant_ptr
def modification_title_to_unimod_id() -> Dict[str, str]:
    """ Get a dict that maps modification names to Unimod IDs.

    Returns:
        A dict that maps modification names to Unimod IDs.
    """
    return unimod.title_to_unimod_ids()


def modification_atomic_composition() -> Dict[str, Dict[str, int]]:
    """ Get a dict that maps modification names to atomic compositions.

    Returns:
        A dict that maps modification names to atomic compositions.
    """
    return unimod.modification_atomic_compositions()


def unimod_to_mass() -> Dict[str, float]:
    """ Get a dict that maps Unimod IDs (string annotations) to mass values.

    Returns:
        A dict that maps Unimod IDs to mass values.
    """
    return unimod.unimod_modification_to_mass()


def unimod_to_mass_numerical() -> Dict[int, float]:
    """ Get a dict that maps Unimod IDs given as integer to mass values.

    Returns:
        A dict that maps Unimod IDs to mass values.
    """
    return unimod.unimod_modification_to_mass_numerical()


def unimod_static_mods_to_sage_static_mods(
        unimod_static_mods: Union[Dict[str, str], Dict[str, int]]
) -> Dict["ModificationSpecificity", float]:
    """ Translate a dict that maps modification specificities to Unimod IDs
    to a dict that maps ModificationSpecificity objects to mass values.

    Args:
        unimod_static_mods: A dict that maps modification specificities to Unimod IDs,
            given either as string annotations (e.g. '[UNIMOD:4]') or numeric IDs (e.g. 4).

    Returns:
        A dict that maps ModificationSpecificity objects to mass values.
    """

    if len(unimod_static_mods) == 0:
        return {}

    # values are either numeric Unimod IDs (int) or string annotations
    mods_numeric = isinstance(next(iter(unimod_static_mods.values())), int)
    if mods_numeric:
        mod_to_mass = unimod.unimod_modification_to_mass_numerical()
    else:
        mod_to_mass = unimod.unimod_modification_to_mass()

    sage_raw_dict = {key: mod_to_mass[value] for key, value in unimod_static_mods.items()}

    return validate_mods(sage_raw_dict)


def unimod_variable_mods_to_sage_variable_mods(
        unimod_variable_mods: Union[Dict[str, List[str]], Dict[str, List[int]]]
) -> Dict["ModificationSpecificity", List[float]]:
    """ Translate a dict that maps modification specificities to lists of Unimod IDs
    to a dict that maps ModificationSpecificity objects to lists of mass values.

    Args:
        unimod_variable_mods: A dict that maps modification specificities to lists of
            Unimod IDs, given either as string annotations or numeric IDs.

    Returns:
        A dict that maps ModificationSpecificity objects to lists of mass values.
    """

    if len(unimod_variable_mods) == 0:
        return {}

    # BUG FIX: the numeric check must inspect the *elements* of the value lists,
    # not the lists themselves — a list is never an int, so the numeric mass
    # table was previously unreachable for Dict[str, List[int]] inputs.
    first_values = next(iter(unimod_variable_mods.values()))
    mods_numeric = len(first_values) > 0 and isinstance(first_values[0], int)

    if mods_numeric:
        mod_to_mass = unimod.unimod_modification_to_mass_numerical()
    else:
        mod_to_mass = unimod.unimod_modification_to_mass()

    sage_raw_dict: Dict[str, List[float]] = {}

    for key, values in unimod_variable_mods.items():
        for value in values:
            sage_raw_dict.setdefault(key, []).append(mod_to_mass[value])

    return validate_var_mods(sage_raw_dict)


def static_unimod_mods_to_set(
        unimod_mods: Union[Dict[str, str], Dict[str, int]]
) -> set:
    """ Translate a dict that maps modification specificities to Unimod IDs to a set of modification annotations.

    Args:
        unimod_mods: A dict that maps modification specificities to Unimod IDs.

    Returns:
        A set of UNIMOD annotation strings.
    """

    if len(unimod_mods) == 0:
        return set()

    if isinstance(next(iter(unimod_mods.values())), int):
        # numeric IDs are rendered as UNIMOD annotation strings
        return {f"[UNIMOD:{value}]" for value in unimod_mods.values()}
    else:
        return set(unimod_mods.values())


def variable_unimod_mods_to_set(
        unimod_mods: Union[Dict[str, List[str]], Dict[str, List[int]]]
) -> set:
    """ Translate a dict that maps modification specificities to lists of Unimod IDs
    to a set of modification annotations.

    Args:
        unimod_mods: A dict that maps modification specificities to lists of Unimod IDs.

    Returns:
        A set of UNIMOD annotation strings.
    """

    # guard against empty input (previously raised StopIteration)
    if len(unimod_mods) == 0:
        return set()

    # BUG FIX: inspect the elements of the value lists, not the lists themselves;
    # the numeric branch was previously unreachable so integer IDs leaked
    # through as raw ints instead of UNIMOD annotation strings.
    first_values = next(iter(unimod_mods.values()))
    if len(first_values) > 0 and isinstance(first_values[0], int):
        return {f"[UNIMOD:{value}]" for values in unimod_mods.values() for value in values}
    else:
        return {value for values in unimod_mods.values() for value in values}
def target_decoy_competition_pandas(
        df: pd.DataFrame,
        method: str = "peptide_psm_peptide",
        score: Optional[str] = None,
) -> pd.DataFrame:
    """ Perform target-decoy competition on a pandas DataFrame.

    Args:
        df: a pandas DataFrame holding the PSMs
        method: the method to use, allowed values are: psm, peptide_psm_only, peptide_peptide_only, peptide_psm_peptide
        score: name of the score column to compete on (defaults to 'hyperscore')

    Returns:
        a pandas DataFrame with q-values, sorted by q-value ascending
    """

    # all mandatory columns must be present
    for column in ('spec_idx', 'match_idx', 'match_identity_candidates', 'decoy'):
        assert column in df.columns, f"{column} column not found"

    # fall back to hyperscore when no score column was given
    score_col = score if score else 'hyperscore'
    assert score_col in df.columns, f"{score_col} column not found"

    # run the competition on plain python lists
    spec_idx, match_idx, match_identity_candidates, decoy_flags, scores, q_values = target_decoy_competition(
        df['spec_idx'].tolist(),
        df['match_idx'].tolist(),
        df['match_identity_candidates'].tolist(),
        df['decoy'].tolist(),
        df[score_col].tolist(),
        method,
    )

    # assemble the TDC result table, best (lowest) q-values first
    result = pd.DataFrame({
        'spec_idx': spec_idx,
        'match_idx': match_idx,
        'match_identity_candidates': match_identity_candidates,
        'decoy': decoy_flags,
        f'{score_col}': scores,
        'q_value': q_values
    })

    return result.sort_values(by=['q_value'], ascending=True)
def assign_sage_peptide_q(psm_list: List[Psm], use_hyper_score: bool = True):
    """ Assign SAGE peptide q-values to PSMs.
    Args:
        psm_list: a list of PeptideSpectrumMatch objects
        use_hyper_score: whether to use hyper score or discriminant score for q-value calculation
    """
    # collect the raw pointers and run SAGE peptide-level FDR in place
    py_ptrs = [match.get_py_ptr() for match in psm_list]
    psc.assign_peptide_q(py_ptrs, use_hyper_score)

def assign_sage_protein_q(psm_list: List[Psm], use_hyper_score: bool = True):
    """ Assign SAGE protein q-values to PSMs.
    Args:
        psm_list: a list of PeptideSpectrumMatch objects
        use_hyper_score: whether to use hyper score or discriminant score for q-value calculation
    """
    # collect the raw pointers and run SAGE protein-level FDR in place
    py_ptrs = [match.get_py_ptr() for match in psm_list]
    psc.assign_protein_q(py_ptrs, use_hyper_score)
def rescore_lda(
        psm_collection: Union[List[Psm], Dict[str, List[Psm]]],
        num_splits: int = 5,
        verbose: bool = True,
        balance: bool = True,
        replace_nan: bool = True,
        score: str = "hyperscore",
        num_threads: int = 16,
) -> List[Psm]:
    """ Re-score PSMs using Linear Discriminant Analysis (LDA).
    Args:
        psm_collection: A collection of PSMs
        num_splits: Number of splits (folds) to use for cross-validation
        verbose: Whether to print progress
        balance: Whether to balance the dataset (equal number of target and decoy examples)
        replace_nan: Whether to replace NaN values with 0
        score: Score to use for rescoring
        num_threads: Number of threads to use for feature extraction

    Returns:
        List[Psm]: the re-scored PSMs (in cross-validation split order)
    """

    # flatten a dict of spectrum-id -> candidates into a single list
    psm_list = []
    if isinstance(psm_collection, dict):
        for spec_id, psm_candidates in psm_collection.items():
            psm_list.extend(psm_candidates)
    else:
        psm_list = psm_collection

    # fit the scaler on the feature matrix of ALL PSMs
    X_all, _ = get_features(psm_collection_to_pandas(psm_list, num_threads=num_threads), score=score, replace_nan=replace_nan)
    scaler = StandardScaler()
    scaler.fit(X_all)

    splits = split_psm_list(psm_list=psm_list, num_splits=num_splits)

    predictions = []
    # BUG FIX: predictions are produced fold by fold in split order, so the
    # PSMs must be collected in that same order. Zipping against the original
    # psm_list assigned re-scores to the wrong PSMs (splits shuffle the order);
    # this mirrors the final_psms pattern used in rescore_psms.
    rescored_psms = []

    for i in tqdm(range(num_splits), disable=not verbose, desc='Re-scoring PSMs', ncols=100):

        target = splits[i]
        rescored_psms.extend(target)
        features = []

        for j in range(num_splits):
            if j != i:
                features.extend(splits[j])

        # generate training data
        X_train, Y_train = generate_training_data(features, balance=balance, replace_nan=replace_nan, num_threads=num_threads)

        # get features for the fold that we want to re-score; use the same
        # score column as the training features (previously silently fell
        # back to the default hyperscore column)
        X, _ = get_features(psm_collection_to_pandas(target, num_threads=num_threads), score=score, replace_nan=replace_nan)

        # experimenting with different settings for LDA showed that shrinkage should be used, which tries to
        # keep model weights small and helps to prevent overfitting
        lda = LinearDiscriminantAnalysis(solver="eigen", shrinkage="auto")
        lda.fit(scaler.transform(X_train), Y_train)

        try:
            # check for flip sign of LDA classification return to be compatible with good score ascending
            score_flip = 1.0 if Y_train[np.argmax(np.squeeze(lda.transform(scaler.transform(X_train))))] == 1.0 else -1.0
        except Exception:
            # fall back to no flip when the transform/argmax fails (e.g. empty fold)
            score_flip = 1.0

        Y_pred = np.squeeze(lda.transform(scaler.transform(X))) * score_flip
        predictions.extend(Y_pred)

    # assign the re-scored values to the PSMs in matching (split) order;
    # avoid shadowing the `score` parameter with the loop variable
    for re_score_value, match in zip(predictions, rescored_psms):
        match.re_score = re_score_value

    return rescored_psms
24 | Args: 25 | psm_collection: A collection of PSMs 26 | model: A model to use for re-scoring, needs to comply to the sklearn API 27 | use_min_max_scaler: Whether to use MinMaxScaler instead of StandardScaler 28 | num_splits: Number of splits (folds) to use for cross-validation 29 | verbose: Whether to print progress 30 | balance: Whether to balance the dataset (equal number of target and decoy examples) 31 | replace_nan: Whether to replace NaN values with 0 32 | score: Score to use for re-scoring 33 | num_threads: Number of threads to use for feature extraction 34 | 35 | Returns: 36 | List[PeptideSpectrumMatch]: List of PeptideSpectrumMatch objects 37 | """ 38 | 39 | psm_list = [] 40 | 41 | if isinstance(psm_collection, dict): 42 | for spec_id, psm_candidates in psm_collection.items(): 43 | psm_list.extend(psm_candidates) 44 | else: 45 | psm_list = psm_collection 46 | 47 | # get features for all PSMs, which will be a matrix of shape (n_samples, n_features) 48 | X_all, _ = get_features(psm_collection_to_pandas(psm_list, num_threads=num_threads), score=score, replace_nan=replace_nan) 49 | 50 | # use a scaler to scale the features 51 | if use_min_max_scaler: 52 | scaler = MinMaxScaler() 53 | else: 54 | scaler = StandardScaler() 55 | scaler.fit(X_all) 56 | 57 | # split the PSMs into num_splits folds to perform cross-validation 58 | splits = split_psm_list(psm_list=psm_list, num_splits=num_splits, **kwargs) 59 | 60 | predictions = [] 61 | final_psms = [] 62 | for i in tqdm(range(num_splits), disable=not verbose, desc='Re-scoring PSMs', ncols=100): 63 | 64 | target = splits[i] 65 | final_psms.extend(target) 66 | features = [] 67 | 68 | for j in range(num_splits): 69 | if j != i: 70 | features.extend(splits[j]) 71 | 72 | # generate training data 73 | X_train, Y_train = generate_training_data(features, balance=balance, replace_nan=replace_nan, num_threads=num_threads, **kwargs) 74 | 75 | # get features for target that we want to re-score 76 | X, _ = 
def tokenize_peptide(sequence: str) -> List[str]:
    """
    Tokenize a peptide sequence into amino acid tokens.
    Args:
        sequence: A peptide sequence string

    Returns:
        A list of amino acid tokens, including modifications
    """
    # one capital letter, optionally followed by a UNIMOD annotation
    tokens = re.findall(r'[A-Z](?:\[UNIMOD:\d+\])?', sequence)
    return tokens


def sequence_to_vector(sequence: str, token_alphabet: List[str]) -> np.ndarray:
    """
    Convert a peptide sequence into a vector representation based on a token alphabet
    Args:
        sequence: A peptide sequence string
        token_alphabet: A list of amino acid tokens

    Returns:
        A count-vector representation of the peptide sequence
        (tokens not in the alphabet are ignored)
    """
    # build a token -> index lookup once instead of a linear list.index scan
    # per token; keep the FIRST occurrence to match list.index semantics
    token_index: dict = {}
    for idx, token in enumerate(token_alphabet):
        token_index.setdefault(token, idx)

    vector = np.zeros(len(token_alphabet))
    for token in tokenize_peptide(sequence):
        idx = token_index.get(token)
        if idx is not None:
            vector[idx] += 1
    return vector


def create_token_alphabet(sequences):
    """
    Create a token alphabet from a set of peptide sequences.
    Args:
        sequences: A list of peptide sequences

    Returns:
        A sorted list of unique amino acid tokens
    """
    unique_tokens = set()
    for seq in sequences:
        unique_tokens.update(tokenize_peptide(seq))
    token_alphabet = sorted(unique_tokens)  # Sort to maintain consistent ordering
    return token_alphabet


def prepare_data(sequences, retention_times, token_alphabet):
    """
    Prepare the dataset for training a retention time predictor
    Args:
        sequences: A list of peptide sequences
        retention_times: A list of retention times
        token_alphabet: A list of amino acid tokens

    Returns:
        X: A matrix of input features (token counts per sequence)
        y: A vector of target values (retention times)
    """
    # Convert sequences to vectors based on the provided token_alphabet
    X = np.array([sequence_to_vector(seq, token_alphabet) for seq in sequences])
    y = np.array(retention_times)

    return X, y
def train_lasso_regression_model(X, y, alpha=1.0, verbose=False):
    """
    Train a Lasso regression model with L1 regularization.
    Args:
        X: Feature matrix (token counts per peptide sequence)
        y: Retention times
        alpha: Regularization strength
        verbose: Whether to print the training MSE

    Returns:
        The fitted Lasso model
    """
    model = Lasso(alpha=alpha)
    model.fit(X, y)

    if verbose:
        # report the in-sample mean squared error
        mse = mean_squared_error(y, model.predict(X))
        print(f"Test MSE (Lasso, alpha={alpha}): {mse}")

    return model


def transform_sequences(sequences, token_alphabet):
    """
    Transform a list of peptide sequences into a matrix of token counts based on a token alphabet.
    Args:
        sequences: A list of peptide sequences
        token_alphabet: A list of amino acid tokens

    Returns:
        A matrix of token counts
    """
    vectors = [sequence_to_vector(seq, token_alphabet) for seq in sequences]
    return np.array(vectors)
# NOTE(dump): the fragment "psm.retention_time_predicted = rt_pred" that ended the
# previous chunk belongs to the tail of rescore/rt_predictor.py in this repository dump.
# ----------------------------------------------------------------------------
# sagepy/sagepy/rescore/utility.py
# ----------------------------------------------------------------------------
import random
from collections import defaultdict
from typing import Callable, List, Optional, Tuple

import numpy as np
import pandas as pd
from numpy.typing import NDArray

from sagepy.core.scoring import Psm
from sagepy.utility import psm_collection_to_pandas

# Key makers group PSMs before splitting so that all PSMs sharing a key land in
# the same split (prevents leakage of a peptide/ion across train/test splits).
peptide_key_maker = lambda psm: (psm.sequence,)
ion_key_maker = lambda psm: (psm.sequence, psm.charge)


def assign_random_groups(group_count, number_of_assignments, seed=None):
    """Draw a random group index for each assignment.

    Args:
        group_count: int, number of groups to assign to
        number_of_assignments: int, number of assignments to make
        seed: None | int, random seed for reproducibility (re-seeds the global
            `random` module RNG as a side effect)

    Returns:
        list: one group index in the range [0, group_count - 1] per assignment
    """
    if seed is not None:
        random.seed(seed)
    return [random.randint(0, group_count - 1) for _ in range(number_of_assignments)]


def split_into_chunks(
    psms: list,
    splits_count: int = 3,
    key_maker: Callable = peptide_key_maker,
    seed: None | int = None,
) -> list[list]:
    """
    Split a list of PSMs into multiple chunks based on a key maker function.
    Args:
        psms: list of Psm objects
        splits_count: int, number of splits to create
        key_maker: Callable, a function that takes a Psm object and returns a key for grouping
        seed: None | int, random seed for reproducibility

    Returns:
        list[list]: a list of lists; PSMs that share the same key are never
        separated into different chunks
    """
    grouped = defaultdict(list)
    for psm in psms:
        grouped[key_maker(psm)].append(psm)

    split_assignments = assign_random_groups(splits_count, len(grouped), seed=seed)

    splits = [[] for _ in range(splits_count)]
    # Fixed: the original shadowed the dict `grouped_psms` with its own loop
    # variable, which only worked because the items view was created before the
    # rebinding; use distinct names to avoid the trap.
    for split_assignment, members in zip(split_assignments, grouped.values()):
        splits[split_assignment].extend(members)

    return splits


def dict_to_dense_array(peak_dict, array_length=174, block_size=29):
    """
    Convert a dictionary of peaks to a fixed-length array.

    Layout: the first half of the array holds b ions (ion_type 0), the second
    half y ions (ion_type 1); within each half, each run of `block_size` slots
    holds the ordinals of one charge state (1, 2, ...).

    Args:
        peak_dict: a dictionary of peaks (ion_type, charge, ordinal) -> intensity
        array_length: the length of the fixed-length array
            (default 174 = 2 ion types * 3 charge states * 29 ordinals)
        block_size: number of ordinals per charge state (default 29);
            generalized from the former hard-coded constant

    Returns:
        A fixed-length array of intensities.
    """
    intensities = np.zeros(array_length)
    half_length = array_length // 2  # first half for b ions, second half for y ions

    for (ion_type, charge, ordinal), intensity in peak_dict.items():
        # map (b=0, y=1) ions to the correct index
        index = ion_type * half_length + (charge - 1) * block_size + (ordinal - 1)
        intensities[index] = intensity

    return intensities


def get_features(
    ds: pd.DataFrame,
    score: Optional[str] = None,
    replace_nan: bool = True,
) -> Tuple[NDArray, NDArray]:
    """
    Get features and labels from a dataset.
    Args:
        ds: a pandas DataFrame containing the dataset.
        score: the name of the target score column (defaults to "hyperscore").
        replace_nan: if True, replace NaN values with 0.

    Returns:
        A tuple containing the features (float32 matrix) and labels
        (1.0 = target, 0.0 = decoy).
    """
    score = score if score is not None else "hyperscore"

    # The currently used features for the model fit.
    # Fixed: "cosine_similarity" was listed twice, silently double-weighting it.
    # TODO: extend this list with additional features
    features = [
        f"{score}",
        "delta_rt",
        "delta_ims",
        "cosine_similarity",
        "delta_mass",
        "rank",
        "isotope_error",
        "average_ppm",
        "delta_next",
        "delta_best",
        "matched_peaks",
        "longest_b",
        "longest_y",
        "longest_y_pct",
        "missed_cleavages",
        "matched_intensity_pct",
        "poisson",
        "charge",
        "intensity_ms1",
        "intensity_ms2",
        "collision_energy",
        "spectral_angle_similarity",
        "pearson_correlation",
        "spearman_correlation",
        "spectral_entropy_similarity",
    ]
    ds = ds.copy()

    # Log-transform the heavy-tailed intensity columns
    ds["intensity_ms1"] = ds["intensity_ms1"].apply(np.log1p)
    ds["intensity_ms2"] = ds["intensity_ms2"].apply(np.log1p)

    # avoid None values for cosine similarity
    ds["cosine_similarity"] = ds["cosine_similarity"].apply(lambda x: 0.0 if x is None else x)

    X = ds[features].to_numpy().astype(np.float32)

    if replace_nan:
        X = np.nan_to_num(X)

    # decoy == True -> label 0, target -> label 1
    Y = ds["decoy"].to_numpy()
    Y = np.array([0 if x else 1 for x in Y]).astype(np.float32)

    return X, Y


def generate_training_data(
    psm_list: List[Psm],
    method: str = "peptide_q",
    q_max: float = 0.01,
    balance: bool = True,
    replace_nan: bool = True,
    num_threads: int = 16,
    **kwargs
) -> Tuple[NDArray, NDArray]:
    """ Generate training data.
    Args:
        psm_list: List of PeptideSpectrumMatch objects
        method: Method to use for training data generation; one of
            'spectrum_q', 'peptide_q', 'decoy_quantile'
        q_max: Maximum q-value (or decoy-score quantile) allowed for positive examples
        balance: Whether to balance the dataset
        replace_nan: Whether to replace NaN values with 0
        num_threads: Number of threads to use for feature extraction

    Returns:
        Tuple[NDArray, NDArray]: X_train and Y_train

    Raises:
        ValueError: if `method` is not one of the supported strategies.
    """
    # create pandas table from psms
    PSM_pandas = psm_collection_to_pandas(psm_list, num_threads=num_threads)

    is_target = PSM_pandas.decoy == False  # noqa: E712 — pandas elementwise comparison
    first_rank = PSM_pandas["rank"] == 1

    if method == "spectrum_q":
        TARGET = PSM_pandas[is_target & (PSM_pandas.spectrum_q <= q_max) & first_rank]
    elif method == "peptide_q":
        TARGET = PSM_pandas[is_target & (PSM_pandas.peptide_q <= q_max) & first_rank]
    elif method == "decoy_quantile":
        # positives are targets scoring above the (1 - q_max) quantile of decoy scores
        cutoff = PSM_pandas[PSM_pandas.decoy].hyperscore.quantile(1 - q_max)
        TARGET = PSM_pandas[is_target & first_rank & (PSM_pandas.hyperscore >= cutoff)]
    else:
        # Fixed: the previous message omitted the supported 'decoy_quantile' option.
        raise ValueError(
            f"Unknown method: {method}. Use 'spectrum_q', 'peptide_q' or 'decoy_quantile'."
        )

    X_target, Y_target = get_features(TARGET, replace_nan=replace_nan)

    # select all decoys
    DECOY = PSM_pandas[PSM_pandas.decoy & first_rank]
    X_decoy, Y_decoy = get_features(DECOY, replace_nan=replace_nan)

    # balance the dataset such that the number of target and decoy examples are equal
    if balance:
        # Fixed: sample WITHOUT replacement — the previous default (replace=True)
        # duplicated some targets and silently dropped others; size <= len(X_target)
        # always holds here, so replace=False is valid.
        # NOTE(review): when decoys outnumber targets the decoy side is left
        # untouched (original behavior), so the result is not fully balanced then.
        num_target = min(len(DECOY), len(TARGET))
        target_indices = np.random.choice(np.arange(len(X_target)), size=num_target, replace=False)
        X_target = X_target[target_indices, :]
        Y_target = Y_target[target_indices]

    # combine target and decoy examples
    X_train = np.vstack((X_target, X_decoy))
    Y_train = np.hstack((Y_target, Y_decoy))

    return X_train, Y_train


def get_list_index_by_sequence(psms, num_splits: int = 5, seed: int = 35):
    """Map each unique PSM sequence to a split index in [0, num_splits - 1].

    Args:
        psms: iterable of Psm objects
        num_splits: number of splits
        seed: seed for numpy's global RNG

    Returns:
        dict: sequence -> split index
    """
    # Seed numpy's global RNG for reproducibility.
    # NOTE(review): iteration order of the set below depends on Python's string
    # hash randomization, so runs are only reproducible within one interpreter
    # session (or with PYTHONHASHSEED fixed) — confirm whether that matters here.
    np.random.seed(seed)

    # Extract unique sequences
    unique_sequences = list({psm.sequence for psm in psms})

    # Shuffle and split into near-equal groups
    shuffled = np.random.permutation(unique_sequences)
    split = np.array_split(shuffled, num_splits)

    # Create mapping from sequence -> split index
    index_dict = {seq: i for i, group in enumerate(split) for seq in group}
    return index_dict


def split_psm_list_broken(psm_list: List[Psm], num_splits: int = 5) -> List[List]:
    """Deprecated sequence-based splitter (kept for reference; see split_psm_list)."""
    # Get sequence-to-split mapping
    seq_to_split = get_list_index_by_sequence(psm_list, num_splits)

    # Preallocate split containers
    splits = [[] for _ in range(num_splits)]

    # Assign PSMs to their respective splits
    for psm in psm_list:
        split_idx = seq_to_split[psm.sequence]
        splits[split_idx].append(psm)

    return splits


def split_psm_list(
    psm_list: List[Psm],
    num_splits: int = 5,
    seed: None | int = None,
    key_maker: Callable = peptide_key_maker,
    **kwargs
) -> List[List[Psm]]:
    """
    Split PSMs into multiple splits.

    Args:
        psm_list: List of PeptideSpectrumMatch objects
        num_splits: Number of splits
        seed: Optional seed for reproducibility
        key_maker: Callable function to create keys for grouping PSMs

    Returns:
        List[List[PeptideSpectrumMatch]]: List of splits
    """
    # Delegate to the group-aware splitter so PSMs sharing a key stay together.
    return split_into_chunks(psm_list, num_splits, seed=seed, key_maker=key_maker)


def transform_psm_to_mokapot_pin(psm_df, seq_modified: bool = False):
    """ Transform a PSM DataFrame to a mokapot PIN DataFrame.
    Args:
        psm_df: a DataFrame containing PSMs
        seq_modified: whether the sequences are modified

    Returns:
        A DataFrame containing the PSMs in mokapot PIN format.
    """
    columns_map = {
        # target columns mapping for mokapot
        'spec_idx': 'SpecId',
        'decoy': 'Label',
        'charge': 'Charge',
        'sequence_modified': 'Peptide',
        'proteins': 'Proteins',

        # feature mapping for re-scoring
        'hyperscore': 'Feature1',
        'isotope_error': 'Feature2',
        'delta_mass': 'Feature3',
        'delta_rt': 'Feature4',
        'delta_ims': 'Feature5',
        'matched_peaks': 'Feature6',
        'matched_intensity_pct': 'Feature7',
        'intensity_ms1': 'Feature8',
        'intensity_ms2': 'Feature9',
        'average_ppm': 'Feature10',
        'poisson': 'Feature11',
        'spectral_entropy_similarity': 'Feature12',
        'pearson_correlation': 'Feature13',
        'spearman_correlation': 'Feature14',
        'spectral_angle_similarity': 'Feature15',
        'collision_energy': 'Feature16',
        'delta_next': 'Feature17',
        'delta_best': 'Feature18',
        'longest_b': 'Feature19',
        'longest_y': 'Feature20',
        'longest_y_pct': 'Feature21',
        'cosine_similarity': 'Feature22',
        'rank': 'Feature23',
        'missed_cleavages': 'Feature24',
    }

    # Fall back to the plain sequence column when modifications are absent.
    if not seq_modified:
        columns_map['sequence'] = 'Peptide'
        columns_map.pop('sequence_modified')

    psm_df = psm_df[list(columns_map.keys())]
    df_pin = psm_df.rename(columns=columns_map)

    # Drop all-NaN feature columns first, then any row with a remaining NaN.
    df_pin_clean = df_pin.dropna(axis=1, how='all')
    # .copy() avoids pandas SettingWithCopy warnings on the assignments below.
    df_pin_clean = df_pin_clean.dropna().copy()

    # mokapot/percolator convention: -1 = decoy, 1 = target
    df_pin_clean['Label'] = df_pin_clean['Label'].apply(lambda x: -1 if x else 1)
    df_pin_clean['ScanNr'] = range(1, len(df_pin_clean) + 1)

    return df_pin_clean

# ----------------------------------------------------------------------------
# (dump continues: sagepy/sagepy_logo.png ->
#  https://raw.githubusercontent.com/theGreatHerrLebert/sagepy/4382f7b24dd4e35bfa256e8c8a32ead610782e99/sagepy/sagepy_logo.png)
# ----------------------------------------------------------------------------
-------------------------------------------------------------------------------- /unimod/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "unimod" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [dependencies] 7 | -------------------------------------------------------------------------------- /unimod/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod unimod { 2 | pub mod modification_atomic_composition; 3 | pub mod title_to_unimod_id; 4 | pub mod unimod_quantized; 5 | pub mod unimod_to_mass; 6 | 7 | // Re-exporting functions to the parent module for easier access. NOTE(review): `quanzie_mass` looks like a typo for `quantize_mass`, but it must match the item name defined in unimod_quantized.rs (not visible here) — confirm and rename both sides together, not just this re-export. 8 | pub use modification_atomic_composition::modification_atomic_composition; 9 | pub use title_to_unimod_id::title_to_unimod_id; 10 | pub use unimod_quantized::{quanzie_mass, quantized_mass_to_unimod}; 11 | pub use unimod_to_mass::{unimod_modifications_mass, unimod_modifications_mass_numerical}; 12 | } --------------------------------------------------------------------------------