├── .gitattributes ├── .github └── workflows │ ├── sagepy-connector-publish.yml │ └── sagepy-publish.yml ├── .gitignore ├── LICENSE ├── README.md ├── qfdrust ├── Cargo.toml └── src │ ├── dataset.rs │ ├── intensity.rs │ ├── lib.rs │ ├── main.rs │ ├── picked.rs │ ├── psm.rs │ └── utility.rs ├── sagepy-connector ├── .idea │ ├── .gitignore │ ├── modules.xml │ ├── sagepy-connector.iml │ └── vcs.xml ├── Cargo.lock ├── Cargo.toml └── src │ ├── lib.rs │ ├── py_database.rs │ ├── py_enzyme.rs │ ├── py_fasta.rs │ ├── py_fdr.rs │ ├── py_intensity.rs │ ├── py_ion_series.rs │ ├── py_lfq.rs │ ├── py_mass.rs │ ├── py_mobility_model.rs │ ├── py_modification.rs │ ├── py_peptide.rs │ ├── py_qfdr.rs │ ├── py_retention_alignment.rs │ ├── py_retention_model.rs │ ├── py_scoring.rs │ ├── py_spectrum.rs │ ├── py_tmt.rs │ ├── py_unimod.rs │ ├── py_utility.rs │ └── utilities.rs ├── sagepy ├── README.md ├── examples │ ├── lfq │ │ ├── LFQ.ipynb │ │ └── helpers.py │ ├── property-prediction │ │ └── property_prediction.ipynb │ ├── readme │ │ └── readme_example.ipynb │ ├── rescoring │ │ ├── bayesian_and_frequentist_rescoring.ipynb │ │ ├── data │ │ │ └── psm_data.csv │ │ └── rescoring.ipynb │ ├── sage-fdr │ │ └── FDRControl.ipynb │ └── scoring │ │ └── scoring.ipynb ├── pyproject.toml ├── sagepy │ ├── __init__.py │ ├── core │ │ ├── __init__.py │ │ ├── database.py │ │ ├── enzyme.py │ │ ├── fasta.py │ │ ├── fdr.py │ │ ├── ion_series.py │ │ ├── lfq.py │ │ ├── mass.py │ │ ├── ml │ │ │ ├── __init__.py │ │ │ ├── mobility_model.py │ │ │ ├── pep.py │ │ │ ├── retention_alignment.py │ │ │ └── retention_model.py │ │ ├── modification.py │ │ ├── peptide.py │ │ ├── scoring.py │ │ ├── spectrum.py │ │ ├── tmt.py │ │ └── unimod.py │ ├── qfdr │ │ ├── __init__.py │ │ └── tdc.py │ ├── rescore │ │ ├── __init__.py │ │ ├── lda.py │ │ ├── rescore.py │ │ ├── rt_predictor.py │ │ └── utility.py │ └── utility.py └── sagepy_logo.png └── unimod ├── Cargo.toml └── src ├── lib.rs └── unimod ├── modification_atomic_composition.rs 
├── title_to_unimod_id.rs ├── unimod_quantized.rs └── unimod_to_mass.rs /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-vendored 2 | -------------------------------------------------------------------------------- /.github/workflows/sagepy-connector-publish.yml: -------------------------------------------------------------------------------- 1 | name: Build and Publish Rust Binding 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | build-and-publish: 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | matrix: 15 | os: [ubuntu-latest, windows-latest, macos-13, macos-14] 16 | python-version: ['3.11', '3.12'] 17 | include: 18 | - os: ubuntu-latest 19 | python-version: '3.11' 20 | publish: true 21 | - os: windows-latest 22 | python-version: '3.11' 23 | publish: true 24 | - os: macos-13 25 | python-version: '3.11' 26 | publish: true 27 | - os: macos-14 28 | python-version: '3.11' 29 | publish: true 30 | - os: ubuntu-latest 31 | python-version: '3.12' 32 | publish: true 33 | - os: windows-latest 34 | python-version: '3.12' 35 | publish: true 36 | - os: macos-13 37 | python-version: '3.12' 38 | publish: true 39 | - os: macos-14 40 | python-version: '3.12' 41 | publish: true 42 | 43 | steps: 44 | - uses: actions/checkout@v3 45 | 46 | - name: Set up Python 47 | uses: actions/setup-python@v3 48 | with: 49 | python-version: ${{ matrix.python-version }} 50 | 51 | - name: Install Maturin 52 | run: | 53 | python -m pip install --upgrade pip 54 | pip install maturin 55 | 56 | - name: Set up Rust 57 | uses: actions-rs/toolchain@v1 58 | with: 59 | profile: minimal 60 | toolchain: stable 61 | override: true 62 | 63 | - name: Change to sagepy-connector directory 64 | run: cd sagepy-connector 65 | 66 | - name: Clean Cargo Artifacts 67 | run: | 68 | cd sagepy-connector 69 | cargo clean 70 | 71 | - name: Build with Maturin 72 | run: | 73 | cd sagepy-connector 74 
| maturin build --release 75 | 76 | - name: Publish 77 | if: matrix.publish 78 | env: 79 | MATURIN_PYPI_TOKEN: ${{ secrets.SAGEPY_CONNECTOR_PYPI_API_TOKEN }} 80 | run: | 81 | cd sagepy-connector 82 | maturin publish --no-sdist 83 | -------------------------------------------------------------------------------- /.github/workflows/sagepy-publish.yml: -------------------------------------------------------------------------------- 1 | name: Build and Publish Python Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v3 16 | 17 | - name: Set up Python 18 | uses: actions/setup-python@v3 19 | with: 20 | python-version: '3.11' 21 | 22 | - name: Install Poetry 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install poetry 26 | 27 | - name: Change to sagepy directory 28 | run: cd sagepy 29 | 30 | - name: Build package 31 | run: | 32 | cd sagepy 33 | poetry build 34 | 35 | - name: Publish package 36 | env: 37 | POETRY_PYPI_TOKEN_PYPI: ${{ secrets.SAGEPY_PYPI_API_TOKEN }} 38 | run: | 39 | cd sagepy 40 | poetry config http-basic.pypi __token__ $POETRY_PYPI_TOKEN_PYPI 41 | poetry publish 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject 
date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 David Teschner 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sagepy 2 | A python interface to the [SAGE](https://github.com/lazear/sage) search engine for mass spectrometry proteomics. 3 | 4 | This repository hosts the main codebase for the sagepy project, which is dedicated to creating a fully functional Python interface for the powerful Sage search engine, originally written in Rust. 
5 | 6 | The project is structured as follows: 7 | 8 | * `sagepy-connector`: This crate creates a Python interface using [PyO3](https://github.com/PyO3) to bind Rust to Python. 9 | * `sagepy`: A pure Python, fully Pythonic wrapper around the exposed Rust code. 10 | * `qfdrust`: This crate implements basic false discovery rate (FDR) estimation using TDC, following the methods proposed by [Crema](https://github.com/Noble-Lab/crema). 11 | * `unimod`: A work-in-progress crate that bridges Sage-style PSM annotation with the UNIMOD standard. 12 | 13 | ## Quickstart 14 | Get started quickly by installing sagepy via pip: 15 | ``` 16 | pip install sagepy 17 | ``` 18 | Check out the tutorial notebooks to dive into [DB generation, searching, and FDR estimation](https://github.com/theGreatHerrLebert/sagepy/blob/main/sagepy/examples/scoring/scoring.ipynb), [peptide property prediction](https://github.com/theGreatHerrLebert/sagepy/blob/main/sagepy/examples/property-prediction/property_prediction.ipynb), and [re-scoring of results](https://github.com/theGreatHerrLebert/sagepy/blob/main/sagepy/examples/rescoring/rescoring.ipynb). 19 | 20 | ## Get involved 21 | Do you have any questions or want to contribute? Feel free to reach out at any time! 22 | 23 | 24 | ## Cite 25 | 26 | If you find sagepy useful, please cite the original SAGE publication and consider citing our paper on sagepy: 27 | 28 | Lazear, M. “Sage: An Open-Source Tool for Fast Proteomics Searching and Quantification at Scale.” [Journal of Proteome Research (2023)](https://pubs.acs.org/doi/10.1021/acs.jproteome.3c00486). 29 | 30 | Teschner, D et al. “Rustims: An Open-Source Framework for Rapid Development and Processing of timsTOF Data-Dependent Acquisition Data.” [Journal of Proteome Research (2025)]( https://pubs.acs.org/doi/full/10.1021/acs.jproteome.4c00966). 31 | 32 | Thanks for supporting free and open-source software and science! 
33 | -------------------------------------------------------------------------------- /qfdrust/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "qfdrust" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | rand = "0.9.0-beta.0" 10 | rustms = { git = "https://github.com/theGreatHerrLebert/rustims.git" } 11 | # rustms = { path = "../../rustims/rustms" } 12 | # sage-core = {path = "../../sage/crates/sage" } 13 | sage-core = { git = "https://github.com/theGreatHerrLebert/sage.git" } 14 | itertools = "0.13.0" 15 | serde = { version = "1.0.217", features = ["derive"] } 16 | ndarray = "0.16.1" 17 | rayon = "1.10.0" 18 | bincode = "2.0.0-rc.3" 19 | zstd = "0.13.2" 20 | -------------------------------------------------------------------------------- /qfdrust/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod utility; 2 | pub mod dataset; 3 | pub mod intensity; 4 | pub mod psm; 5 | pub mod picked; -------------------------------------------------------------------------------- /qfdrust/src/main.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | println!("Hello, world!"); 3 | } 4 | -------------------------------------------------------------------------------- /qfdrust/src/picked.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use itertools::Itertools; 3 | use crate::psm::Psm; 4 | use rayon::prelude::*; 5 | 6 | pub fn protein_id_from_psm(psm: &Psm, decoy_tag: &str, generate_decoys: bool) -> String { 7 | if psm.sage_feature.label == -1 { 8 | psm.proteins 9 | .iter() 10 | .map(|s| { 11 | if generate_decoys { 12 | format!("{}{}", decoy_tag, s) 13 | } else { 14 | s.to_string() 15 | } 16 | }) 17 | .join(";") 18 
| } else { 19 | psm.proteins.iter().join(";") 20 | } 21 | } 22 | 23 | #[derive(Default)] 24 | struct Row { 25 | ix: String, 26 | decoy: bool, 27 | score: f32, 28 | q: f32, 29 | } 30 | 31 | #[derive(Clone, Debug)] 32 | struct Competition { 33 | forward_ix: Option, 34 | forward: f32, 35 | reverse_ix: Option, 36 | reverse: f32, 37 | } 38 | 39 | impl Default for Competition { 40 | fn default() -> Self { 41 | Competition { 42 | forward_ix: None, 43 | forward: f32::MIN, 44 | reverse_ix: None, 45 | reverse: f32::MIN, 46 | } 47 | } 48 | } 49 | 50 | fn assign_q_value( 51 | scores: HashMap, 52 | ) -> HashMap { 53 | 54 | let mut q_values: HashMap = HashMap::new(); 55 | 56 | let mut scores = scores 57 | .into_par_iter() 58 | .flat_map(|(_, comp)| { 59 | [ 60 | (comp.forward_ix.clone(), false, comp.forward), 61 | (comp.reverse_ix.clone(), true, comp.reverse), 62 | ] 63 | }) 64 | .filter_map(|(ix, decoy, score)| { 65 | ix.map(|ix| Row { 66 | ix, 67 | decoy, 68 | score, 69 | q: 1.0, 70 | }) 71 | }) 72 | .collect::>(); 73 | 74 | scores.par_sort_by(|a, b| b.score.total_cmp(&a.score)); 75 | 76 | let mut decoy_count: f64 = 1.0; 77 | let mut target_count: f64 = 0.0; 78 | let mut q_values_list: Vec = Vec::new(); 79 | 80 | // First pass: Calculate the raw q-values 81 | for row in scores.iter() { 82 | if row.decoy { 83 | decoy_count += 1.0; 84 | } else { 85 | target_count += 1.0; 86 | } 87 | 88 | // Avoid division by zero 89 | if target_count == 0.0 { 90 | q_values_list.push(1.0); 91 | continue; 92 | } 93 | 94 | let q = decoy_count / target_count; 95 | q_values_list.push(q); 96 | } 97 | 98 | // Second pass: Compute the cumulative minimum from the end 99 | let mut q_min = 1.0; 100 | for (i, row) in scores.iter_mut().enumerate().rev() { 101 | let q = q_values_list[i]; 102 | if q < q_min { 103 | q_min = q; 104 | } 105 | row.q = q_min as f32; 106 | q_values.insert(row.ix.clone(), row.q as f64); 107 | } 108 | 109 | q_values 110 | } 111 | 112 | pub fn spectrum_q_value(scores: &Vec, 
use_hyper_score: bool) -> Vec { 113 | 114 | // create a collection of PSMs sorted by score and keep the index 115 | let mut indexed_inner_collection: Vec<(usize, Psm)> = scores.iter() 116 | .enumerate() 117 | .map(|(index, item)| (index, item.clone())) 118 | .collect(); 119 | 120 | // sort either by hyperscore or PSM re_score 121 | match use_hyper_score { 122 | // Sort by hyperscore 123 | true => { 124 | indexed_inner_collection.par_sort_unstable_by(|(_, a), (_, b)| b.sage_feature.hyperscore.total_cmp(&a.sage_feature.hyperscore)); 125 | } 126 | // Sort by PSM re_score 127 | false => { 128 | indexed_inner_collection.par_sort_unstable_by(|(_, a), (_, b)| b.re_score.unwrap().total_cmp(&a.re_score.unwrap())); 129 | } 130 | } 131 | 132 | // Calculate the spectrum q-value 133 | let mut decoy = 1; 134 | let mut target = 0; 135 | 136 | for (_, psm) in indexed_inner_collection.iter_mut() { 137 | match psm.sage_feature.label == -1 { 138 | true => decoy += 1, 139 | false => target += 1, 140 | } 141 | psm.sage_feature.spectrum_q = decoy as f32 / target as f32; 142 | } 143 | 144 | // Reverse slice, and calculate the cumulative minimum 145 | let mut q_min = 1.0f32; 146 | for (_, psm) in indexed_inner_collection.iter_mut().rev() { 147 | q_min = q_min.min(psm.sage_feature.spectrum_q); 148 | psm.sage_feature.spectrum_q = q_min; 149 | } 150 | 151 | // sort the q_values by the original index 152 | let mut q_values = vec![0.0; scores.len()]; 153 | for (sorted_index, psm) in indexed_inner_collection.iter() { 154 | q_values[*sorted_index] = psm.sage_feature.spectrum_q; 155 | } 156 | 157 | q_values 158 | } 159 | 160 | pub fn picked_peptide(features: &mut Vec, use_hyper_score: bool) -> HashMap { 161 | 162 | let mut map: HashMap = HashMap::default(); 163 | 164 | for feat in features.iter() { 165 | 166 | let peptide_sequence_key = match feat.sage_feature.label == -1 { 167 | true => feat.sequence_decoy.clone().unwrap().sequence, 168 | false => feat.sequence.clone().unwrap().sequence, 169 | 
}; 170 | 171 | let entry = map.entry(peptide_sequence_key).or_default(); 172 | 173 | match feat.sage_feature.label == -1 { 174 | true => { 175 | match use_hyper_score { 176 | true => { 177 | entry.reverse_ix = Some(feat.sequence_decoy.clone().unwrap().sequence); 178 | entry.reverse = entry.reverse.max(feat.sage_feature.hyperscore as f32); 179 | } 180 | false => { 181 | entry.reverse_ix = Some(feat.sequence_decoy.clone().unwrap().sequence); 182 | entry.reverse = entry.reverse.max(feat.re_score.unwrap() as f32); 183 | } 184 | } 185 | } 186 | false => { 187 | match use_hyper_score { 188 | true => { 189 | entry.forward_ix = Some(feat.sequence.clone().unwrap().sequence); 190 | entry.forward = entry.forward.max(feat.sage_feature.hyperscore as f32); 191 | } 192 | false => { 193 | entry.forward_ix = Some(feat.sequence.clone().unwrap().sequence); 194 | entry.forward = entry.forward.max(feat.re_score.unwrap() as f32); 195 | } 196 | } 197 | } 198 | } 199 | } 200 | 201 | let q_value_map = assign_q_value(map); 202 | 203 | q_value_map 204 | } 205 | 206 | pub fn picked_protein(features: &mut Vec, use_hyper_score: bool) -> HashMap { 207 | 208 | let mut map: HashMap = HashMap::default(); 209 | 210 | for feat in features.iter() { 211 | 212 | let protein_key = protein_id_from_psm(feat, "rev_", true); 213 | 214 | let entry = map.entry(protein_key).or_default(); 215 | 216 | match feat.sage_feature.label == -1 { 217 | true => { 218 | match use_hyper_score { 219 | true => { 220 | entry.reverse_ix = Some(protein_id_from_psm(feat, "rev_", true)); 221 | entry.reverse = entry.reverse.max(feat.sage_feature.hyperscore as f32); 222 | } 223 | false => { 224 | entry.reverse_ix = Some(protein_id_from_psm(feat, "rev_", true)); 225 | entry.reverse = entry.reverse.max(feat.re_score.unwrap() as f32); 226 | } 227 | } 228 | } 229 | false => { 230 | match use_hyper_score { 231 | true => { 232 | entry.forward_ix = Some(protein_id_from_psm(feat, "rev_", true)); 233 | entry.forward = 
entry.forward.max(feat.sage_feature.hyperscore as f32); 234 | } 235 | false => { 236 | entry.forward_ix = Some(protein_id_from_psm(feat, "rev_", true)); 237 | entry.forward = entry.forward.max(feat.re_score.unwrap() as f32); 238 | } 239 | } 240 | } 241 | } 242 | } 243 | 244 | let q_value_map = assign_q_value(map); 245 | 246 | q_value_map 247 | } -------------------------------------------------------------------------------- /qfdrust/src/psm.rs: -------------------------------------------------------------------------------- 1 | use rustms::chemistry::formula::calculate_mz; 2 | use rustms::proteomics::peptide::{PeptideSequence}; 3 | use sage_core::scoring::{Feature, Fragments}; 4 | use serde::{Deserialize, Serialize}; 5 | use bincode; 6 | use std::io; 7 | use bincode::config::standard; 8 | use crate::intensity::{prosit_intensities_to_fragments, FragmentIntensityPrediction}; 9 | use zstd::stream::encode_all; // For compression 10 | use bincode::{Encode, Decode}; 11 | use zstd::decode_all; 12 | 13 | #[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)] 14 | pub struct Psm { 15 | pub spec_idx: String, 16 | pub peptide_idx: u32, 17 | pub proteins: Vec, 18 | pub sage_feature: Feature, 19 | pub sequence: Option, 20 | pub sequence_modified: Option, 21 | pub sequence_decoy: Option, 22 | pub sequence_decoy_modified: Option, 23 | pub mono_mz_calculated: Option, 24 | pub intensity_ms1: Option, 25 | pub intensity_ms2: Option, 26 | pub collision_energy: Option, 27 | pub collision_energy_calibrated: Option, 28 | pub retention_time_projected: Option, 29 | pub prosit_predicted_intensities: Option>, 30 | pub re_score: Option, 31 | pub fragment_intensity_prediction: Option, 32 | } 33 | 34 | impl Psm { 35 | pub fn new( 36 | spec_idx: String, 37 | peptide_idx: u32, 38 | proteins: Vec, 39 | sage_feature: Feature, 40 | sequence: Option, 41 | sequence_modified: Option, 42 | sequence_decoy: Option, 43 | sequence_decoy_modified: Option, 44 | intensity_ms1: Option, 45 | 
intensity_ms2: Option, 46 | collision_energy: Option, 47 | collision_energy_calibrated: Option, 48 | retention_time_projected: Option, 49 | prosit_predicted_intensities: Option>, 50 | re_score: Option, 51 | ) -> Self { 52 | 53 | let peptide_sequence = match &sequence { 54 | Some(seq) => Some(PeptideSequence::new(seq.clone(), Some(peptide_idx as i32))), 55 | None => None, 56 | }; 57 | 58 | let sequence_decoy = match &sequence_decoy { 59 | Some(seq) => Some(PeptideSequence::new(seq.clone(), Some(peptide_idx as i32))), 60 | None => None, 61 | }; 62 | 63 | let sequence_modified = match &sequence_modified { 64 | Some(seq) => Some(PeptideSequence::new(seq.clone(), Some(peptide_idx as i32))), 65 | None => None, 66 | }; 67 | 68 | let sequence_decoy_modified = match &sequence_decoy_modified { 69 | Some(seq) => Some(PeptideSequence::new(seq.clone(), Some(peptide_idx as i32))), 70 | None => None, 71 | }; 72 | 73 | let mono_mz_calculated = match (peptide_sequence.clone(), sage_feature.charge as i32) { 74 | (Some(seq), charge) => Some(calculate_mz(seq.mono_isotopic_mass(), charge) as f32), 75 | (_, _) => None, 76 | }; 77 | 78 | Psm { 79 | spec_idx, 80 | peptide_idx, 81 | proteins, 82 | sage_feature, 83 | sequence: peptide_sequence, 84 | sequence_modified, 85 | sequence_decoy, 86 | sequence_decoy_modified, 87 | mono_mz_calculated, 88 | intensity_ms1, 89 | intensity_ms2, 90 | collision_energy, 91 | collision_energy_calibrated, 92 | retention_time_projected, 93 | prosit_predicted_intensities, 94 | re_score, 95 | fragment_intensity_prediction: None, 96 | } 97 | } 98 | 99 | pub fn get_fragment_intensity_prediction(&self) -> FragmentIntensityPrediction { 100 | FragmentIntensityPrediction::new( 101 | self.sage_feature.fragments.clone().unwrap(), 102 | self.prosit_predicted_intensities.clone().unwrap(), 103 | ) 104 | } 105 | 106 | pub fn calculate_fragment_intensity_prediction(&mut self) { 107 | self.fragment_intensity_prediction = Some(self.get_fragment_intensity_prediction()); 108 | 
} 109 | 110 | pub fn prosit_intensity_to_fragments(&self) -> Option { 111 | match &self.prosit_predicted_intensities { 112 | Some(intensities) => Some(prosit_intensities_to_fragments(intensities.clone())), 113 | None => None, 114 | } 115 | } 116 | 117 | pub fn get_feature_vector(&self) -> Vec { 118 | 119 | let sage_feature = &self.sage_feature; 120 | let mut feature_vector = Vec::new(); 121 | feature_vector.push(sage_feature.expmass as f64); 122 | feature_vector.push(sage_feature.calcmass as f64); 123 | feature_vector.push(sage_feature.charge as f64); 124 | feature_vector.push(sage_feature.rt as f64); 125 | feature_vector.push(sage_feature.aligned_rt as f64); 126 | feature_vector.push(sage_feature.predicted_rt as f64); 127 | feature_vector.push(sage_feature.delta_rt_model as f64); 128 | feature_vector.push(sage_feature.ims as f64); 129 | feature_vector.push(sage_feature.predicted_ims as f64); 130 | feature_vector.push(sage_feature.delta_ims_model as f64); 131 | feature_vector.push(sage_feature.delta_mass as f64); 132 | feature_vector.push(sage_feature.isotope_error as f64); 133 | feature_vector.push(sage_feature.average_ppm as f64); 134 | feature_vector.push(sage_feature.hyperscore); 135 | feature_vector.push(self.re_score.unwrap_or(0.0)); 136 | feature_vector.push(sage_feature.delta_next); 137 | feature_vector.push(sage_feature.delta_best); 138 | feature_vector.push(sage_feature.matched_peaks as f64); 139 | feature_vector.push(sage_feature.longest_b as f64); 140 | feature_vector.push(sage_feature.longest_y as f64); 141 | feature_vector.push(sage_feature.longest_y_pct as f64); 142 | feature_vector.push(sage_feature.missed_cleavages as f64); 143 | feature_vector.push(sage_feature.matched_intensity_pct as f64); 144 | feature_vector.push(sage_feature.scored_candidates as f64); 145 | feature_vector.push(sage_feature.poisson); 146 | feature_vector.push(sage_feature.discriminant_score as f64); 147 | feature_vector.push(sage_feature.posterior_error as f64); 148 | 
feature_vector.push(sage_feature.ms2_intensity as f64); 149 | feature_vector.push(sage_feature.rank as f64); 150 | 151 | feature_vector.push(self.intensity_ms1.unwrap_or(0.0) as f64); 152 | feature_vector.push(self.intensity_ms2.unwrap_or(0.0) as f64); 153 | feature_vector.push(self.collision_energy.unwrap_or(0.0) as f64); 154 | feature_vector.push(self.collision_energy_calibrated.unwrap_or(0.0) as f64); 155 | feature_vector.push(self.retention_time_projected.unwrap_or(0.0) as f64); 156 | 157 | let intensity_features = self.fragment_intensity_prediction.clone(); 158 | 159 | match intensity_features { 160 | Some(intensity_features) => { 161 | let features = intensity_features.get_feature_vector(0.00001, false); 162 | for feature in features { 163 | feature_vector.push(feature as f64); 164 | } 165 | }, 166 | 167 | None => { 168 | for _ in 0..5 { 169 | feature_vector.push(0.0); 170 | } 171 | } 172 | } 173 | 174 | feature_vector.push(sage_feature.delta_rt_model as f64); 175 | feature_vector.push(sage_feature.delta_ims_model as f64); 176 | 177 | feature_vector.push(sage_feature.label as f64); 178 | 179 | feature_vector.push(sage_feature.spectrum_q as f64); 180 | feature_vector.push(sage_feature.peptide_q as f64); 181 | feature_vector.push(sage_feature.protein_q as f64); 182 | 183 | feature_vector 184 | } 185 | 186 | pub fn get_feature_names(&self) -> Vec<&str> { 187 | vec![ 188 | "expmass", 189 | "calcmass", 190 | "charge", 191 | "rt", 192 | "aligned_rt", 193 | "predicted_rt", 194 | "delta_rt_model", 195 | "ims", 196 | "predicted_ims", 197 | "delta_ims_model", 198 | "delta_mass", 199 | "isotope_error", 200 | "average_ppm", 201 | "hyperscore", 202 | "re_score", 203 | "delta_next", 204 | "delta_best", 205 | "matched_peaks", 206 | "longest_b", 207 | "longest_y", 208 | "longest_y_pct", 209 | "missed_cleavages", 210 | "matched_intensity_pct", 211 | "scored_candidates", 212 | "poisson", 213 | "discriminant_score", 214 | "posterior_error", 215 | "ms2_intensity", 216 | "rank", 
217 | "intensity_ms1", 218 | "intensity_ms2", 219 | "collision_energy", 220 | "collision_energy_calibrated", 221 | "retention_time_projected", 222 | "cosine_similarity", 223 | "spectral_angle_similarity", 224 | "pearson_correlation", 225 | "spearman_correlation", 226 | "spectral_entropy_similarity", 227 | "delta_rt", 228 | "delta_ims", 229 | "decoy", 230 | "spectrum_q", 231 | "peptide_q", 232 | "protein_q", 233 | ] 234 | } 235 | } 236 | 237 | pub fn compress_psms(psms: &[Psm]) -> io::Result> { 238 | // Step 1: Configure bincode 239 | let config = standard(); 240 | // Step 2: Serialize with the configured bincode 241 | let serialized = bincode::encode_to_vec(psms, config).expect("Serialization failed"); 242 | // Step 3: Compress the serialized data using ZSTD 243 | let compressed = encode_all(serialized.as_slice(), 0).expect("Compression failed"); 244 | // Step 4: Return compressed binary data 245 | Ok(compressed) 246 | } 247 | 248 | pub fn decompress_psms(compressed_data: &[u8]) -> io::Result> { 249 | // Step 1: Decompress the data using ZSTD 250 | let decompressed = decode_all(compressed_data).expect("Decompression failed"); 251 | // Step 2: Configure bincode 252 | let config = standard(); 253 | // Step 3: Deserialize the decompressed data back into Psm structs 254 | let psms: Vec = bincode::decode_from_slice(&decompressed, config) 255 | .expect("Deserialization failed") 256 | .0; 257 | // Step 4: Return the deserialized data 258 | Ok(psms) 259 | } -------------------------------------------------------------------------------- /qfdrust/src/utility.rs: -------------------------------------------------------------------------------- 1 | use rand::prelude::*; 2 | 3 | /// Use target-decoy competition to calculate q-values 4 | /// 5 | /// # Arguments 6 | /// 7 | /// * `scores` - A vector of floats representing the scores 8 | /// * `target` - A vector of booleans representing the target/decoy status 9 | /// * `desc` - A boolean representing the sort order of the scores 
10 | /// 11 | /// # Returns 12 | /// 13 | /// * `Vec` - A vector of floats representing the q-values 14 | /// 15 | pub fn target_decoy_competition(scores: &Vec, target: &Vec, desc: bool) -> Vec { 16 | assert_eq!(scores.len(), target.len(), "Scores and target must be the same length"); 17 | 18 | // Create a vector of indices and sort by scores 19 | let mut indices: Vec = (0..scores.len()).collect(); 20 | if desc { 21 | indices.sort_by(|&i, &j| scores[j].partial_cmp(&scores[i]).unwrap()); 22 | } else { 23 | indices.sort_by(|&i, &j| scores[i].partial_cmp(&scores[j]).unwrap()); 24 | } 25 | 26 | // Apply sorted indices to scores and targets 27 | let sorted_scores: Vec = indices.iter().map(|&i| scores[i]).collect(); 28 | let sorted_target: Vec = indices.iter().map(|&i| target[i]).collect(); 29 | 30 | // Calculate cumulative sums for targets and decoys 31 | let mut cum_targets = 0; 32 | let mut cum_decoys = 0; 33 | let mut cum_targets_vec = Vec::new(); 34 | let mut cum_decoys_vec = Vec::new(); 35 | 36 | for &t in &sorted_target { 37 | if t { 38 | cum_targets += 1; 39 | } else { 40 | cum_decoys += 1; 41 | } 42 | cum_targets_vec.push(cum_targets); 43 | cum_decoys_vec.push(cum_decoys); 44 | } 45 | 46 | // Calculate FDR 47 | let mut fdr: Vec = cum_decoys_vec.iter() 48 | .zip(cum_targets_vec.iter()) 49 | .map(|(&d, &t)| if t > 0 { (d as f64 + 1.0) / t as f64 } else { 1.0 }) 50 | .collect(); 51 | 52 | // Calculate q-values 53 | fdr.reverse(); 54 | let reversed_scores: Vec = sorted_scores.iter().rev().cloned().collect(); 55 | let mut q_vals = fdr_to_q_value(&reversed_scores, &fdr); 56 | q_vals.reverse(); 57 | 58 | // Reorder q_vals to original order 59 | let mut final_q_vals = vec![0.0; scores.len()]; 60 | for (original_pos, &sorted_pos) in indices.iter().enumerate() { 61 | final_q_vals[sorted_pos] = q_vals[original_pos]; 62 | } 63 | 64 | final_q_vals 65 | } 66 | 67 | /// Convert FDR to q-values 68 | /// 69 | /// # Arguments 70 | /// 71 | /// * `scores` - A vector of floats 
representing the scores 72 | /// * `fdr` - A vector of floats representing the FDR 73 | /// 74 | /// # Returns 75 | /// 76 | /// * `Vec` - A vector of floats representing the q-values 77 | /// 78 | fn fdr_to_q_value(scores: &[f64], fdr: &[f64]) -> Vec { 79 | assert_eq!(scores.len(), fdr.len(), "Scores and FDR must be of the same length"); 80 | 81 | let mut min_q = 1.0; 82 | let mut qvals = vec![1.0; fdr.len()]; 83 | let mut start = 0; 84 | 85 | for (idx, &score) in scores.iter().enumerate() { 86 | // check if the next score is the same 87 | if idx < scores.len() - 1 && scores[idx + 1] == score { 88 | continue; 89 | } 90 | 91 | // update the minimum q-value 92 | if fdr[start] < min_q { 93 | min_q = fdr[start]; 94 | } 95 | 96 | for qval in &mut qvals[start..=idx] { 97 | *qval = min_q; 98 | } 99 | start = idx + 1; 100 | } 101 | 102 | qvals 103 | } 104 | 105 | fn _estimate_pi0(pval_list: &Vec) -> f64 { 106 | let num_lambda = 100; 107 | let max_lambda = 0.5; 108 | let num_boot = 100; 109 | let max_size = 1000; 110 | let mut rng = rand::rng(); 111 | 112 | let n_pval = pval_list.len(); 113 | let mut pi0s_list = Vec::new(); 114 | let mut lambda_list = Vec::new(); 115 | 116 | for idx in 0..num_lambda { 117 | let cur_lambda = ((idx + 1) as f64 / num_lambda as f64) * max_lambda; 118 | let start = pval_list.binary_search_by(|p| p.partial_cmp(&cur_lambda).unwrap()).unwrap_or_else(|pos| pos); 119 | let w1 = n_pval - start; 120 | let pi0 = w1 as f64 / n_pval as f64 / (1.0 - cur_lambda); 121 | 122 | if pi0 > 0.0 { 123 | lambda_list.push(cur_lambda); 124 | pi0s_list.push(pi0); 125 | } 126 | } 127 | 128 | assert!(!pi0s_list.is_empty(), "Error in the input data: too good separation between target and decoy PSMs."); 129 | 130 | let min_pi0 = *pi0s_list.iter().min_by(|a, b| a.partial_cmp(b).unwrap()).unwrap(); 131 | let mut mse_list = vec![0.0; pi0s_list.len()]; 132 | 133 | let dist = rand::distr::Uniform::new(0, n_pval).unwrap(); 134 | 135 | for _ in 0..num_boot { 136 | let num_draw = 
std::cmp::min(n_pval, max_size); 137 | let mut p_boot_list: Vec = (0..num_draw).map(|_| pval_list[dist.sample(&mut rng)]).collect(); 138 | p_boot_list.sort_by(|a, b| a.partial_cmp(b).unwrap()); 139 | 140 | for (idx, &lambda) in lambda_list.iter().enumerate() { 141 | let start = p_boot_list.binary_search_by(|p| p.partial_cmp(&lambda).unwrap()).unwrap_or_else(|pos| pos); 142 | let w1 = num_draw - start; 143 | let pi0_boot = w1 as f64 / num_draw as f64 / (1.0 - lambda); 144 | mse_list[idx] += (pi0_boot - min_pi0).powi(2); 145 | } 146 | } 147 | 148 | let min_idx = mse_list.iter().enumerate().min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap()).map(|(idx, _)| idx).unwrap(); 149 | 150 | pi0s_list[min_idx].clamp(0.0, 1.0) 151 | } 152 | 153 | #[cfg(test)] 154 | mod tests { 155 | use super::*; 156 | 157 | fn setup_desc_scores() -> (Vec, Vec, Vec) { 158 | let scores = vec![10.0, 10.0, 9.0, 8.0, 7.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0]; 159 | let target = vec![true, true, true, true, false, true, true, false, true, false, true, false, false, false, false, false]; 160 | let q_vals = vec![0.25, 0.25, 0.25, 0.25, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.42857142857142855, 0.42857142857142855, 0.5714285714285714, 0.625, 0.625, 1.0, 1.0, 1.0, 1.0]; 161 | (scores, target, q_vals) 162 | } 163 | 164 | #[test] 165 | fn test_tdc_descending() { 166 | let (scores, target, true_q_vals) = setup_desc_scores(); 167 | let q_vals = target_decoy_competition(&scores, &target, true); 168 | assert_eq!(q_vals, true_q_vals, "Q-values for descending scores are incorrect."); 169 | } 170 | 171 | #[test] 172 | fn test_tdc_ascending() { 173 | let (mut scores, target, true_q_vals) = setup_desc_scores(); 174 | scores = scores.into_iter().map(|x| -x).collect(); // Negate scores for ascending test 175 | let q_vals = target_decoy_competition(&scores, &target, false); 176 | assert_eq!(q_vals, true_q_vals, "Q-values for ascending scores are incorrect."); 177 | } 178 | } 
-------------------------------------------------------------------------------- /sagepy-connector/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | # GitHub Copilot persisted chat sessions 10 | /copilot/chatSessions 11 | -------------------------------------------------------------------------------- /sagepy-connector/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /sagepy-connector/.idea/sagepy-connector.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /sagepy-connector/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /sagepy-connector/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sagepy-connector" 3 | version = "0.3.12" 4 | edition = "2021" 5 | 6 | [lib] 7 | name = "sagepy_connector" 8 | crate-type = ["cdylib"] 9 | 10 | [dependencies] 11 | # sage-core = {path = "../../sage/crates/sage" } 12 | sage-core = { git = "https://github.com/theGreatHerrLebert/sage.git" } 13 | qfdrust = { path = "../qfdrust" } 14 | unimod = { path = "../unimod" } 15 | pyo3 = { version = "0.23.4", features = ["extension-module"] } 16 | numpy = "0.23.0" 17 | rayon = "1.10.0" 18 | 19 | serde = { version = "1.0.217", features = ["derive"] } 20 | bincode = "1.3.3" 21 | log = "0.4.22" 22 | 
itertools = "0.14.0" 23 | serde_json = "1.0.138" 24 | -------------------------------------------------------------------------------- /sagepy-connector/src/lib.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3::wrap_pymodule; 3 | 4 | pub mod py_database; 5 | pub mod py_enzyme; 6 | pub mod py_fasta; 7 | pub mod py_ion_series; 8 | pub mod py_mass; 9 | pub mod py_modification; 10 | pub mod py_peptide; 11 | pub mod py_scoring; 12 | pub mod py_spectrum; 13 | pub mod py_fdr; 14 | pub mod py_lfq; 15 | pub mod py_tmt; 16 | pub mod py_qfdr; 17 | pub mod py_utility; 18 | pub mod py_unimod; 19 | pub mod utilities; 20 | pub mod py_intensity; 21 | pub mod py_retention_model; 22 | pub mod py_retention_alignment; 23 | pub mod py_mobility_model; 24 | #[pymodule] 25 | fn sagepy_connector(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 26 | 27 | m.add_wrapped(wrap_pymodule!(py_mass::py_mass))?; 28 | m.add_wrapped(wrap_pymodule!(py_enzyme::py_enzyme))?; 29 | m.add_wrapped(wrap_pymodule!(py_fasta::py_fasta))?; 30 | m.add_wrapped(wrap_pymodule!(py_peptide::py_peptide))?; 31 | m.add_wrapped(wrap_pymodule!(py_ion_series::py_ion_series))?; 32 | m.add_wrapped(wrap_pymodule!(py_modification::py_modification))?; 33 | m.add_wrapped(wrap_pymodule!(py_database::py_database))?; 34 | m.add_wrapped(wrap_pymodule!(py_spectrum::py_spectrum))?; 35 | m.add_wrapped(wrap_pymodule!(py_scoring::py_scoring))?; 36 | m.add_wrapped(wrap_pymodule!(py_fdr::py_fdr))?; 37 | m.add_wrapped(wrap_pymodule!(py_lfq::py_lfq))?; 38 | m.add_wrapped(wrap_pymodule!(py_tmt::py_tmt))?; 39 | m.add_wrapped(wrap_pymodule!(py_qfdr::py_qfdr))?; 40 | m.add_wrapped(wrap_pymodule!(py_unimod::py_unimod))?; 41 | m.add_wrapped(wrap_pymodule!(py_utility::py_utility))?; 42 | m.add_wrapped(wrap_pymodule!(py_intensity::py_intensity))?; 43 | m.add_wrapped(wrap_pymodule!(py_retention_alignment::py_retention_alignment))?; 44 | 
m.add_wrapped(wrap_pymodule!(py_retention_model::py_retention_model))?; 45 | m.add_wrapped(wrap_pymodule!(py_mobility_model::py_mobility_model))?; 46 | 47 | Ok(()) 48 | } 49 | -------------------------------------------------------------------------------- /sagepy-connector/src/py_enzyme.rs: -------------------------------------------------------------------------------- 1 | use numpy::{IntoPyArray, PyArray2, PyArrayMethods}; 2 | use pyo3::prelude::*; 3 | use std::sync::Arc; 4 | 5 | use std::hash::Hash; 6 | 7 | use pyo3::exceptions::PyValueError; 8 | use pyo3::types::PyList; 9 | use sage_core::enzyme::{Digest, Enzyme, EnzymeParameters, Position}; 10 | use std::collections::hash_map::DefaultHasher; 11 | use std::hash::Hasher; 12 | 13 | #[pyclass] 14 | #[derive(Clone)] 15 | pub struct PyPosition { 16 | pub inner: Position, 17 | } 18 | 19 | #[pymethods] 20 | impl PyPosition { 21 | #[staticmethod] 22 | fn nterm() -> Self { 23 | PyPosition { 24 | inner: Position::Nterm, 25 | } 26 | } 27 | 28 | #[staticmethod] 29 | fn cterm() -> Self { 30 | PyPosition { 31 | inner: Position::Cterm, 32 | } 33 | } 34 | 35 | #[staticmethod] 36 | fn full() -> Self { 37 | PyPosition { 38 | inner: Position::Full, 39 | } 40 | } 41 | 42 | #[staticmethod] 43 | fn internal() -> Self { 44 | PyPosition { 45 | inner: Position::Internal, 46 | } 47 | } 48 | 49 | #[staticmethod] 50 | fn from_string(position_string: &str) -> PyResult { 51 | match position_string { 52 | "n_term" => Ok(PyPosition::nterm()), 53 | "c_term" => Ok(PyPosition::cterm()), 54 | "full" => Ok(PyPosition::full()), 55 | "internal" => Ok(PyPosition::internal()), 56 | _ => Err(PyValueError::new_err("Invalid position string")), 57 | } 58 | } 59 | 60 | #[getter] 61 | fn to_string(&self) -> String { 62 | format!("{:?}", self.inner) 63 | } 64 | } 65 | 66 | #[pyclass] 67 | #[derive(Clone)] 68 | pub struct PyDigest { 69 | pub inner: Digest, 70 | } 71 | 72 | #[pymethods] 73 | impl PyDigest { 74 | #[new] 75 | fn new( 76 | decoy: bool, 77 | 
sequence: &str, 78 | protein: &str, 79 | missed_cleavages: u8, 80 | position: PyPosition, 81 | semi_enzymatic: bool, 82 | ) -> Self { 83 | PyDigest { 84 | inner: Digest { 85 | decoy, 86 | sequence: sequence.to_string(), 87 | protein: Arc::from(protein.to_string()), 88 | missed_cleavages, 89 | position: position.inner, 90 | semi_enzymatic, 91 | }, 92 | } 93 | } 94 | 95 | #[getter] 96 | fn decoy(&self) -> bool { 97 | self.inner.decoy 98 | } 99 | 100 | #[getter] 101 | fn sequence(&self) -> &str { 102 | &self.inner.sequence 103 | } 104 | 105 | #[getter] 106 | fn protein(&self) -> &str { 107 | &self.inner.protein 108 | } 109 | 110 | #[getter] 111 | fn missed_cleavages(&self) -> u8 { 112 | self.inner.missed_cleavages 113 | } 114 | 115 | #[getter] 116 | fn position(&self) -> String { 117 | format!("{:?}", self.inner.position) 118 | } 119 | 120 | #[getter] 121 | fn semi_enzymatic(&self) -> bool { 122 | self.inner.semi_enzymatic 123 | } 124 | 125 | fn reverse(&self) -> PyResult { 126 | Ok(PyDigest { 127 | inner: self.inner.reverse(), 128 | }) 129 | } 130 | 131 | fn __eq__(&self, other: &PyDigest) -> bool { 132 | self.inner == other.inner 133 | } 134 | 135 | fn __hash__(&self) -> isize { 136 | let mut hasher = DefaultHasher::new(); 137 | self.inner.hash(&mut hasher); 138 | hasher.finish() as isize 139 | } 140 | } 141 | 142 | #[pyclass] 143 | #[derive(Clone)] 144 | pub struct PyEnzyme { 145 | pub inner: Enzyme, 146 | } 147 | 148 | #[pymethods] 149 | impl PyEnzyme { 150 | #[new] 151 | #[pyo3(signature = (cleave, c_terminal, semi_enzymatic, skip_suffix=None))] 152 | fn new( 153 | cleave: &str, 154 | c_terminal: bool, 155 | semi_enzymatic: bool, 156 | skip_suffix: Option, 157 | ) -> PyResult { 158 | match Enzyme::new(cleave, skip_suffix, c_terminal, semi_enzymatic) { 159 | Some(enzyme) => Ok(PyEnzyme { inner: enzyme }), 160 | None => Err(PyValueError::new_err("Failed to create Enzyme")), 161 | } 162 | } 163 | 164 | #[getter] 165 | fn c_terminal(&self) -> bool { 166 | 
self.inner.c_terminal 167 | } 168 | 169 | #[getter] 170 | fn skip_suffix(&self) -> Option { 171 | self.inner.skip_suffix 172 | } 173 | 174 | #[getter] 175 | fn semi_enzymatic(&self) -> bool { 176 | self.inner.semi_enzymatic 177 | } 178 | 179 | fn cleavage_sites(&self, py: Python, sequence: &str) -> PyResult>> { 180 | // Call the original cleavage_sites method 181 | let sites = self.inner.cleavage_sites(sequence); 182 | 183 | // Convert the Vec> to Vec while flattening 184 | let sites_flat: Vec = sites 185 | .into_iter() 186 | .flat_map(|s| vec![s.site.start, s.site.end]) 187 | .collect(); 188 | 189 | let rows = sites_flat.len() / 2; 190 | let np_array: Py> = sites_flat 191 | .into_pyarray(py) 192 | .reshape([rows, 2])? 193 | .unbind(); 194 | 195 | Ok(np_array) 196 | } 197 | } 198 | 199 | #[pyclass] 200 | pub struct PyEnzymeParameters { 201 | pub inner: EnzymeParameters, 202 | } 203 | 204 | #[pymethods] 205 | impl PyEnzymeParameters { 206 | #[new] 207 | #[pyo3(signature = (missed_cleavages, min_len, max_len, enzyme=None))] 208 | fn new(missed_cleavages: u8, min_len: usize, max_len: usize, enzyme: Option) -> Self { 209 | PyEnzymeParameters { 210 | inner: EnzymeParameters { 211 | missed_cleavages, 212 | min_len, 213 | max_len, 214 | enyzme: enzyme.map(|e| e.inner), 215 | }, 216 | } 217 | } 218 | 219 | #[getter] 220 | fn missed_cleavages(&self) -> u8 { 221 | self.inner.missed_cleavages 222 | } 223 | 224 | #[getter] 225 | fn min_len(&self) -> usize { 226 | self.inner.min_len 227 | } 228 | 229 | #[getter] 230 | fn max_len(&self) -> usize { 231 | self.inner.max_len 232 | } 233 | 234 | #[getter] 235 | fn enzyme(&self, _py: Python) -> PyResult> { 236 | match &self.inner.enyzme { 237 | Some(enzyme) => Ok(Some(PyEnzyme { 238 | inner: enzyme.clone(), 239 | })), 240 | None => Ok(None), 241 | } 242 | } 243 | fn cleavage_sites(&self, py: Python, sequence: &str) -> PyResult>> { 244 | // Call the original cleavage_sites method 245 | let sites = self.inner.cleavage_sites(sequence); 
246 | 247 | // Convert the Vec> to Vec while flattening 248 | let sites_flat: Vec = sites 249 | .into_iter() 250 | .flat_map(|s| vec![s.site.start, s.site.end]) 251 | .collect(); 252 | 253 | let rows = sites_flat.len() / 2; 254 | let np_array: Py> = 255 | sites_flat.into_pyarray(py).reshape([rows, 2])?.unbind(); 256 | 257 | Ok(np_array) 258 | } 259 | 260 | pub fn digest(&self, py: Python, sequence: &str, protein: &str) -> PyResult> { 261 | let digests = self.inner.digest(sequence, Arc::from(protein.to_string())); 262 | 263 | // Create an empty Python list 264 | let list: Py = PyList::empty(py).into(); 265 | 266 | // Iterate over the digests and append them to the list 267 | for digest in digests { 268 | let py_digest = Py::new(py, PyDigest { inner: digest })?; 269 | list.bind(py).append(py_digest)?; 270 | } 271 | 272 | Ok(list.into()) 273 | } 274 | } 275 | 276 | #[pymodule] 277 | pub fn py_enzyme(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 278 | m.add_class::()?; 279 | m.add_class::()?; 280 | m.add_class::()?; 281 | m.add_class::()?; 282 | Ok(()) 283 | } 284 | -------------------------------------------------------------------------------- /sagepy-connector/src/py_fasta.rs: -------------------------------------------------------------------------------- 1 | use pyo3::IntoPyObjectExt; 2 | use sage_core::fasta::Fasta; 3 | 4 | use crate::py_enzyme::{PyDigest, PyEnzymeParameters}; 5 | use pyo3::prelude::*; 6 | 7 | #[pyclass] 8 | #[derive(Clone)] 9 | pub struct PyFasta { 10 | pub inner: Fasta, 11 | } 12 | 13 | #[pymethods] 14 | impl PyFasta { 15 | #[staticmethod] 16 | fn parse(contents: String, decoy_tag: String, generate_decoys: bool) -> PyResult { 17 | Ok(PyFasta { 18 | inner: Fasta::parse(contents, decoy_tag, generate_decoys), 19 | }) 20 | } 21 | 22 | fn digest(&self, py: Python, enzyme_params: &PyEnzymeParameters) -> PyResult { 23 | let digests = self.inner.digest(&enzyme_params.inner); 24 | let py_digests: Vec = 25 | digests.into_iter().map(|d| PyDigest 
{ inner: d }).collect(); 26 | Ok(py_digests.into_pyobject_or_pyerr(py)?.unbind()) 27 | } 28 | } 29 | 30 | #[pymodule] 31 | pub fn py_fasta(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 32 | m.add_class::()?; 33 | Ok(()) 34 | } 35 | -------------------------------------------------------------------------------- /sagepy-connector/src/py_fdr.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3::types::PyList; 3 | use qfdrust::psm::Psm; 4 | use sage_core::fdr::{Competition, picked_peptide, picked_protein}; 5 | 6 | use sage_core::database::{PeptideIx}; 7 | use sage_core::scoring::Feature; 8 | use crate::py_database::{PyIndexedDatabase, PyPeptideIx}; 9 | use crate::py_scoring::{PyFeature, PyPsm}; 10 | use rayon::prelude::*; 11 | 12 | #[pyclass] 13 | // TODO: Check if it makes sense to tie this to PeptideIx 14 | struct PyCompetitionPeptideIx { 15 | inner: Competition, 16 | } 17 | 18 | #[pymethods] 19 | impl PyCompetitionPeptideIx { 20 | #[new] 21 | #[pyo3(signature = (forward, reverse, forward_ix=None, reverse_ix=None))] 22 | fn new(forward: f32, reverse: f32, forward_ix: Option, reverse_ix: Option) -> Self { 23 | PyCompetitionPeptideIx { 24 | inner: Competition { 25 | forward, 26 | foward_ix: forward_ix.map(|ix| ix.inner), 27 | reverse, 28 | reverse_ix: reverse_ix.map(|ix| ix.inner), 29 | }, 30 | } 31 | } 32 | #[getter] 33 | fn forward(&self) -> f32 { 34 | self.inner.forward 35 | } 36 | 37 | #[getter] 38 | fn reverse(&self) -> f32 { 39 | self.inner.reverse 40 | } 41 | 42 | #[getter] 43 | fn forward_ix(&self) -> Option { 44 | self.inner.foward_ix.map(|ix| PyPeptideIx { inner: ix }) 45 | } 46 | 47 | #[getter] 48 | fn reverse_ix(&self) -> Option { 49 | self.inner.reverse_ix.map(|ix| PyPeptideIx { inner: ix }) 50 | } 51 | } 52 | 53 | #[pyfunction] 54 | pub fn py_sage_fdr(_py: Python, feature_collection: &Bound<'_, PyList>, indexed_database: &PyIndexedDatabase, use_hyper_score: bool) -> 
PyResult<()> { 55 | 56 | // Extract the inner collection of Feature objects along with their original indices 57 | let mut indexed_inner_collection: Vec<(usize, Feature)> = feature_collection.iter() 58 | .enumerate() 59 | .map(|(index, item)| { 60 | // Extract each item as a Bound 61 | let feature: Bound<'_, PyFeature> = item.extract().expect("Failed to extract PyFeature"); 62 | // Clone the inner Feature and keep the original index 63 | (index, feature.borrow().inner.clone()) 64 | }) 65 | .collect(); 66 | 67 | // Set discriminant score to hyper score 68 | indexed_inner_collection.par_iter_mut().for_each(|(_, feat)| { 69 | match use_hyper_score { 70 | false => { 71 | feat.discriminant_score = (-feat.poisson as f32).ln_1p() + feat.longest_y_pct / 3.0 72 | } 73 | true => { 74 | feat.discriminant_score = feat.hyperscore as f32; 75 | } 76 | } 77 | }); 78 | 79 | // Sort indexed_inner_collection by discriminant_score 80 | indexed_inner_collection.par_sort_unstable_by(|(_, a), (_, b)| b.discriminant_score.total_cmp(&a.discriminant_score)); 81 | 82 | // Extract the sorted indices 83 | let sorted_indices: Vec = indexed_inner_collection.iter().map(|(index, _)| *index).collect(); 84 | 85 | // Perform additional operations on the sorted inner_collection 86 | let mut inner_collection: Vec = indexed_inner_collection.into_iter().map(|(_, feat)| feat).collect(); 87 | let _ = sage_core::ml::qvalue::spectrum_q_value(&mut inner_collection); 88 | let _ = picked_peptide(&indexed_database.inner, &mut inner_collection); 89 | let _ = picked_protein(&indexed_database.inner, &mut inner_collection); 90 | 91 | // Update the original feature_collection according to the sorted order 92 | for (sorted_index, sorted_feature) in sorted_indices.iter().zip(inner_collection.iter()) { 93 | let feature: Bound<'_, PyFeature> = feature_collection.get_item(*sorted_index).expect("Failed to get PyFeature").extract()?; 94 | let mut feature_borrow = feature.borrow_mut(); 95 | // Update the feature's fields 96 
| feature_borrow.inner.discriminant_score = sorted_feature.discriminant_score; 97 | feature_borrow.inner.spectrum_q = sorted_feature.spectrum_q; 98 | feature_borrow.inner.peptide_q = sorted_feature.peptide_q; 99 | feature_borrow.inner.protein_q = sorted_feature.protein_q; 100 | } 101 | 102 | Ok(()) 103 | } 104 | 105 | #[pyfunction] 106 | pub fn py_sage_fdr_psm(_py: Python, psm_collection: &Bound<'_, PyList>, indexed_database: &PyIndexedDatabase, use_hyper_score: bool) -> PyResult<()> { 107 | 108 | // Extract the inner collection of Feature objects along with their original indices 109 | let mut indexed_inner_collection: Vec<(usize, Psm)> = psm_collection.iter() 110 | .enumerate() 111 | .map(|(index, item)| { 112 | // Extract each item as a Bound 113 | let feature: Bound<'_, PyPsm> = item.extract().expect("Failed to extract PyFeature"); 114 | // Clone the inner Feature and keep the original index 115 | (index, feature.borrow().inner.clone()) 116 | }) 117 | .collect(); 118 | 119 | // Set discriminant score to hyper score 120 | indexed_inner_collection.par_iter_mut().for_each(|(_, feat)| { 121 | match use_hyper_score { 122 | false => { 123 | feat.sage_feature.discriminant_score = feat.re_score.unwrap_or(0.0) as f32; 124 | } 125 | true => { 126 | feat.sage_feature.discriminant_score = feat.sage_feature.hyperscore as f32; 127 | } 128 | } 129 | }); 130 | 131 | // Sort indexed_inner_collection by discriminant_score 132 | indexed_inner_collection.par_sort_unstable_by(|(_, a), (_, b)| b.sage_feature.discriminant_score.total_cmp(&a.sage_feature.discriminant_score)); 133 | 134 | // Extract the sorted indices 135 | let sorted_indices: Vec = indexed_inner_collection.iter().map(|(index, _)| *index).collect(); 136 | 137 | // Perform additional operations on the sorted inner_collection 138 | let mut inner_collection: Vec = indexed_inner_collection.into_iter().map(|(_, feat)| feat.sage_feature).collect(); 139 | let _ = sage_core::ml::qvalue::spectrum_q_value(&mut inner_collection); 
140 | let _ = picked_peptide(&indexed_database.inner, &mut inner_collection); 141 | let _ = picked_protein(&indexed_database.inner, &mut inner_collection); 142 | 143 | // Update the original psm_collection according to the sorted order 144 | for (sorted_index, sorted_feature) in sorted_indices.iter().zip(inner_collection.iter()) { 145 | let feature: Bound<'_, PyPsm> = psm_collection.get_item(*sorted_index).expect("Failed to get PyFeature").extract()?; 146 | let mut feature_borrow = feature.borrow_mut(); 147 | feature_borrow.inner.sage_feature.discriminant_score = sorted_feature.discriminant_score; 148 | feature_borrow.inner.sage_feature.spectrum_q = sorted_feature.spectrum_q; 149 | feature_borrow.inner.sage_feature.peptide_q = sorted_feature.peptide_q; 150 | feature_borrow.inner.sage_feature.protein_q = sorted_feature.protein_q; 151 | } 152 | 153 | Ok(()) 154 | } 155 | 156 | #[pymodule] 157 | pub fn py_fdr(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 158 | m.add_class::()?; 159 | m.add_function(wrap_pyfunction!(py_sage_fdr, m)?)?; 160 | m.add_function(wrap_pyfunction!(py_sage_fdr_psm, m)?)?; 161 | Ok(()) 162 | } -------------------------------------------------------------------------------- /sagepy-connector/src/py_intensity.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | use pyo3::prelude::*; 3 | use qfdrust::intensity::FragmentIntensityPrediction; 4 | use crate::py_scoring::PyFragments; 5 | 6 | #[pyclass] 7 | #[derive(Clone, Debug)] 8 | pub struct PyFragmentIntensityPrediction { 9 | pub inner: FragmentIntensityPrediction, 10 | } 11 | 12 | #[pymethods] 13 | impl PyFragmentIntensityPrediction { 14 | #[new] 15 | fn new( 16 | fragments: PyFragments, 17 | prosit_intensity_predicted: Vec, 18 | ) -> Self { 19 | PyFragmentIntensityPrediction { 20 | inner: FragmentIntensityPrediction { 21 | fragments: fragments.inner.clone(), 22 | prosit_intensity_predicted, 23 | }, 24 | } 25 | } 26 | 
27 | #[getter] 28 | fn prosit_intensity_predicted(&self) -> Vec { 29 | self.inner.prosit_intensity_predicted.clone() 30 | } 31 | 32 | #[setter] 33 | fn set_prosit_intensity_predicted(&mut self, prosit_intensity_predicted: Vec) { 34 | self.inner.prosit_intensity_predicted = prosit_intensity_predicted; 35 | } 36 | 37 | fn cosine_similarity(&self, epsilon: f32, reduce_matched: bool) -> f32 { 38 | self.inner.cosine_similarity(epsilon, reduce_matched).unwrap() 39 | } 40 | 41 | fn spectral_angle_similarity(&self, epsilon: f32, reduce_matched: bool) -> f32 { 42 | self.inner.spectral_angle_similarity(epsilon, reduce_matched) 43 | } 44 | 45 | fn pearson_correlation(&self, epsilon: f32, reduce_matched: bool) -> f32 { 46 | self.inner.pearson_correlation(epsilon, reduce_matched) 47 | } 48 | 49 | fn spearman_correlation(&self, epsilon: f32, reduce_matched: bool) -> f32 { 50 | self.inner.spearman_correlation(epsilon, reduce_matched) 51 | } 52 | 53 | fn spectral_entropy_similarity(&self, epsilon: f32, reduce_matched: bool) -> f32 { 54 | self.inner.spectral_entropy_similarity(epsilon, reduce_matched) 55 | } 56 | 57 | fn observed_intensity_map(&self) -> BTreeMap<(u32, i32, i32), f32> { 58 | self.inner.observed_intensity_to_fragments_map() 59 | } 60 | 61 | fn predicted_intensity_map(&self) -> BTreeMap<(u32, i32, i32), f32> { 62 | self.inner.prosit_intensity_to_fragments_map() 63 | } 64 | 65 | fn prosit_intensity_to_fragments(&self) -> PyFragments { 66 | PyFragments { 67 | inner: self.inner.prosit_intensity_to_fragments(), 68 | } 69 | } 70 | } 71 | 72 | #[pymodule] 73 | pub fn py_intensity(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 74 | m.add_class::()?; 75 | Ok(()) 76 | } -------------------------------------------------------------------------------- /sagepy-connector/src/py_ion_series.rs: -------------------------------------------------------------------------------- 1 | use crate::py_peptide::PyPeptide; 2 | use pyo3::prelude::*; 3 | use sage_core::ion_series::{Ion, 
Kind}; 4 | use sage_core::mass::monoisotopic; 5 | 6 | #[pyclass] 7 | #[derive(Clone)] 8 | pub struct PyKind { 9 | pub inner: Kind, 10 | } 11 | 12 | #[pymethods] 13 | impl PyKind { 14 | #[new] 15 | fn new(kind: String) -> PyResult { 16 | match kind.to_lowercase().as_str() { 17 | "a" => Ok(PyKind { inner: Kind::A }), 18 | "b" => Ok(PyKind { inner: Kind::B }), 19 | "c" => Ok(PyKind { inner: Kind::C }), 20 | "x" => Ok(PyKind { inner: Kind::X }), 21 | "y" => Ok(PyKind { inner: Kind::Y }), 22 | "z" => Ok(PyKind { inner: Kind::Z }), 23 | _ => Err(PyErr::new::(format!( 24 | "Invalid Kind value: {}", 25 | kind 26 | ))), 27 | } 28 | } 29 | pub fn kind_as_string(&self) -> String { 30 | format!("{:?}", self.inner) 31 | } 32 | } 33 | 34 | #[pyclass] 35 | pub struct PyIon { 36 | pub inner: Ion, 37 | } 38 | 39 | #[pymethods] 40 | impl PyIon { 41 | #[new] 42 | fn new(kind: PyKind, monoisotopic_mass: f32) -> PyResult { 43 | let inner_ion = Ion { 44 | kind: kind.inner, // Conversion from PyKind to Rust Kind 45 | monoisotopic_mass, 46 | }; 47 | Ok(PyIon { inner: inner_ion }) 48 | } 49 | 50 | // Getter methods for accessing Ion properties 51 | #[getter] 52 | fn kind(&self) -> PyResult { 53 | Ok(PyKind { 54 | inner: self.inner.kind, 55 | }) 56 | } 57 | 58 | #[getter] 59 | fn monoisotopic_mass(&self) -> PyResult { 60 | Ok(self.inner.monoisotopic_mass) 61 | } 62 | } 63 | 64 | #[pyclass] 65 | pub struct PyIonSeries { 66 | pub kind: PyKind, 67 | pub cumulative_mass: f32, 68 | pub peptide: PyPeptide, 69 | } 70 | 71 | #[pymethods] 72 | impl PyIonSeries { 73 | #[new] 74 | pub fn new(_py: Python, peptide: PyPeptide, kind: PyKind) -> PyResult { 75 | const C: f32 = 12.0; 76 | const O: f32 = 15.994914; 77 | const H: f32 = 1.007825; 78 | const PRO: f32 = 1.0072764; 79 | const N: f32 = 14.003074; 80 | const NH3: f32 = N + H * 3.0 + PRO; 81 | 82 | let cumulative_mass = match kind.inner { 83 | Kind::A => peptide.inner.nterm.unwrap_or_default() - (C + O), 84 | Kind::B => 
peptide.inner.nterm.unwrap_or_default(), 85 | Kind::C => peptide.inner.nterm.unwrap_or_default() + NH3, 86 | 87 | Kind::X => { 88 | peptide.inner.monoisotopic - peptide.inner.nterm.unwrap_or_default() 89 | + (C + O - NH3 + N + H) 90 | } 91 | Kind::Y => peptide.inner.monoisotopic - peptide.inner.nterm.unwrap_or_default(), 92 | Kind::Z => peptide.inner.monoisotopic - peptide.inner.nterm.unwrap_or_default() - NH3, 93 | }; 94 | 95 | Ok(Self { 96 | kind, 97 | cumulative_mass, 98 | peptide, 99 | }) 100 | } 101 | 102 | #[getter] 103 | fn kind(&self) -> PyResult { 104 | Ok(self.kind.clone()) 105 | } 106 | 107 | #[getter] 108 | fn cumulative_mass(&self) -> PyResult { 109 | Ok(self.cumulative_mass) 110 | } 111 | 112 | #[getter] 113 | fn peptide(&self) -> PyResult { 114 | Ok(self.peptide.clone()) 115 | } 116 | 117 | pub fn get_ion_series(&self) -> PyResult> { 118 | let mut ions = Vec::new(); 119 | let mut cm = self.cumulative_mass; 120 | 121 | for idx in 0..self.peptide.inner.sequence.len() - 1 { 122 | let r = self.peptide.inner.sequence[idx]; 123 | let m = self.peptide.inner.modifications.get(idx).unwrap_or(&0.0); 124 | 125 | cm += match self.kind.inner { 126 | Kind::A | Kind::B | Kind::C => monoisotopic(r) + m, 127 | Kind::X | Kind::Y | Kind::Z => -(monoisotopic(r) + m), 128 | }; 129 | 130 | ions.push(PyIon { 131 | inner: Ion { 132 | kind: self.kind.inner.clone(), 133 | monoisotopic_mass: cm, 134 | }, 135 | }); 136 | } 137 | Ok(ions) 138 | } 139 | } 140 | 141 | #[pymodule] 142 | pub fn py_ion_series(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 143 | m.add_class::()?; 144 | m.add_class::()?; 145 | m.add_class::()?; 146 | Ok(()) 147 | } 148 | -------------------------------------------------------------------------------- /sagepy-connector/src/py_mass.rs: -------------------------------------------------------------------------------- 1 | use pyo3::exceptions::PyValueError; 2 | use pyo3::prelude::*; 3 | use pyo3::types::PyList; 4 | 5 | use sage_core::mass::{ 6 | 
composition, monoisotopic, Composition, Tolerance, H2O, NEUTRON, NH3, PROTON, 7 | }; 8 | 9 | #[pyfunction] 10 | fn h2o() -> f32 { 11 | H2O 12 | } 13 | 14 | #[pyfunction] 15 | fn proton() -> f32 { 16 | PROTON 17 | } 18 | 19 | #[pyfunction] 20 | fn neutron() -> f32 { 21 | NEUTRON 22 | } 23 | 24 | #[pyfunction] 25 | fn nh3() -> f32 { 26 | NH3 27 | } 28 | 29 | #[pyfunction] 30 | fn py_monoisotopic(aa: &str) -> PyResult { 31 | if aa.len() == 1 && aa.chars().next().unwrap().is_ascii_uppercase() { 32 | let aa_u8 = aa.as_bytes()[0]; 33 | Ok(monoisotopic(aa_u8)) 34 | } else { 35 | Err(PyErr::new::( 36 | "Input must be a single uppercase ASCII character.", 37 | )) 38 | } 39 | } 40 | 41 | #[pyclass] 42 | #[derive(Clone)] 43 | pub struct PyComposition { 44 | inner: Composition, 45 | } 46 | 47 | #[pymethods] 48 | impl PyComposition { 49 | #[new] 50 | pub fn new(carbon: u16, sulfur: u16) -> Self { 51 | PyComposition { 52 | inner: Composition::new(carbon, 0, sulfur), 53 | } 54 | } 55 | 56 | // Exposing fields for Python access 57 | #[getter] 58 | pub fn carbon(&self) -> u16 { 59 | self.inner.carbon 60 | } 61 | 62 | #[getter] 63 | pub fn sulfur(&self) -> u16 { 64 | self.inner.sulfur 65 | } 66 | 67 | // Static method to sum compositions 68 | #[staticmethod] 69 | pub fn sum(compositions: &Bound<'_, PyList>) -> PyResult { 70 | let mut total_composition = Composition::new(0, 0, 0); 71 | 72 | for comp in compositions.iter() { 73 | let py_comp: PyComposition = comp.extract()?; 74 | total_composition.carbon += py_comp.inner.carbon; 75 | total_composition.sulfur += py_comp.inner.sulfur; 76 | } 77 | 78 | Ok(PyComposition { 79 | inner: total_composition, 80 | }) 81 | } 82 | 83 | #[staticmethod] 84 | fn py_composition(aa: &str) -> PyResult { 85 | // Ensure the string is exactly one character long 86 | if aa.chars().count() == 1 { 87 | // Extract the first character 88 | let aa_char = aa.chars().next().unwrap(); // Safe to use unwrap here as we know it has exactly one character 89 | 
Ok(PyComposition { 90 | inner: composition(aa_char as u8), 91 | }) 92 | } else { 93 | // Return an error if the string is not a single character 94 | Err(PyErr::new::( 95 | "Expected a single character string", 96 | )) 97 | } 98 | } 99 | } 100 | 101 | #[pyclass] 102 | #[derive(Clone)] 103 | pub struct PyTolerance { 104 | pub inner: Tolerance, 105 | } 106 | 107 | #[pymethods] 108 | impl PyTolerance { 109 | #[new] 110 | #[pyo3(signature = (da=None, ppm=None))] 111 | fn new(da: Option<(f32, f32)>, ppm: Option<(f32, f32)>) -> PyResult { 112 | let tolerance = match (da, ppm) { 113 | (Some((lo, hi)), None) => Tolerance::Da(lo, hi), 114 | (None, Some((lo, hi))) => Tolerance::Ppm(lo, hi), 115 | _ => { 116 | return Err(PyValueError::new_err( 117 | "Provide either da or ppm values, not both.", 118 | )) 119 | } 120 | }; 121 | 122 | Ok(PyTolerance { inner: tolerance }) 123 | } 124 | 125 | #[getter] 126 | fn da(&self) -> Option<(f32, f32)> { 127 | match self.inner { 128 | Tolerance::Da(lo, hi) => Some((lo, hi)), 129 | _ => None, 130 | } 131 | } 132 | 133 | #[getter] 134 | fn ppm(&self) -> Option<(f32, f32)> { 135 | match self.inner { 136 | Tolerance::Ppm(lo, hi) => Some((lo, hi)), 137 | _ => None, 138 | } 139 | } 140 | 141 | fn bounds(&self, center: f32) -> (f32, f32) { 142 | self.inner.bounds(center) 143 | } 144 | 145 | fn contains(&self, center: f32, target: f32) -> bool { 146 | self.inner.contains(center, target) 147 | } 148 | 149 | #[staticmethod] 150 | fn ppm_to_delta_mass(center: f32, ppm: f32) -> f32 { 151 | Tolerance::ppm_to_delta_mass(center, ppm) 152 | } 153 | 154 | fn __mul__(&self, rhs: f64) -> PyResult { 155 | let result = self.inner.clone() * rhs as f32; 156 | Ok(PyTolerance { inner: result }) 157 | } 158 | } 159 | 160 | #[pymodule] 161 | pub fn py_mass(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 162 | m.add_function(wrap_pyfunction!(h2o, m)?)?; 163 | m.add_function(wrap_pyfunction!(proton, m)?)?; 164 | m.add_function(wrap_pyfunction!(neutron, m)?)?; 
165 | m.add_function(wrap_pyfunction!(nh3, m)?)?; 166 | m.add_function(wrap_pyfunction!(py_monoisotopic, m)?)?; 167 | m.add_class::()?; 168 | m.add_class::()?; 169 | Ok(()) 170 | } 171 | -------------------------------------------------------------------------------- /sagepy-connector/src/py_mobility_model.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3::types::PyList; 3 | use pyo3::exceptions::PyRuntimeError; 4 | use sage_core::ml::mobility_model::predict; 5 | use sage_core::scoring::Feature; 6 | use crate::py_database::PyIndexedDatabase; 7 | use crate::py_scoring::{PyPsm}; 8 | 9 | #[pyfunction] 10 | pub fn py_predict_im( 11 | _py: Python, 12 | psm_collection: &Bound<'_, PyList>, 13 | indexed_database: &PyIndexedDatabase, 14 | ) -> PyResult<()> { 15 | 16 | let indexed_feats: Vec<(usize, Feature)> = psm_collection.iter() 17 | .enumerate() 18 | .map(|(idx, item)| { 19 | let psm: Bound<'_, PyPsm> = item 20 | .extract() 21 | .expect("Failed to extract PyPsm"); 22 | // clone just the inner Feature (sage_feature) 23 | (idx, psm.borrow().inner.sage_feature.clone()) 24 | }) 25 | .collect(); 26 | 27 | let mut feats: Vec = indexed_feats.iter() 28 | .map(|(_, feat)| feat.clone()) 29 | .collect(); 30 | 31 | if predict(&indexed_database.inner, &mut feats).is_none() { 32 | return Err(PyRuntimeError::new_err( 33 | "Retention model fit failed: not enough data or R² < 0.7" 34 | )); 35 | } 36 | 37 | // 3) write back the two mutated fields 38 | for ((orig_idx, _), updated) in indexed_feats.iter().zip(feats.iter()) { 39 | let psm: Bound<'_, PyPsm> = psm_collection 40 | .get_item(*orig_idx) 41 | .expect("Failed to get PyPsm") 42 | .extract()?; 43 | let mut psm_borrow = psm.borrow_mut(); 44 | psm_borrow.inner.sage_feature.predicted_ims = updated.predicted_ims; 45 | psm_borrow.inner.sage_feature.delta_ims_model = updated.delta_ims_model; 46 | } 47 | 48 | Ok(()) 49 | } 50 | 51 | 52 | #[pymodule] 53 | pub fn 
py_mobility_model(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 54 | m.add_function(wrap_pyfunction!(py_predict_im, m)?)?; 55 | Ok(()) 56 | } -------------------------------------------------------------------------------- /sagepy-connector/src/py_modification.rs: -------------------------------------------------------------------------------- 1 | use pyo3::exceptions::PyValueError; 2 | use pyo3::prelude::*; 3 | use pyo3::types::PyDict; 4 | use sage_core::modification::{validate_mods, InvalidModification, ModificationSpecificity}; 5 | use std::collections::HashMap; 6 | use std::str::FromStr; 7 | 8 | #[pyclass] 9 | #[derive(Clone, Debug, PartialEq, Hash)] 10 | pub struct PyModificationSpecificity { 11 | pub inner: ModificationSpecificity, 12 | } 13 | 14 | #[pymethods] 15 | impl PyModificationSpecificity { 16 | #[new] 17 | pub fn new(s: &str) -> PyResult { 18 | match ModificationSpecificity::from_str(s) { 19 | Ok(m) => Ok(PyModificationSpecificity { inner: m }), 20 | Err(InvalidModification::Empty) => { 21 | Err(PyValueError::new_err("Empty modification string")) 22 | } 23 | Err(InvalidModification::InvalidResidue(c)) => Err(PyValueError::new_err(format!( 24 | "Invalid modification string: unrecognized residue ({})", 25 | c 26 | ))), 27 | Err(InvalidModification::TooLong(s)) => Err(PyValueError::new_err(format!( 28 | "Invalid modification string: {} is too long", 29 | s 30 | ))), 31 | } 32 | } 33 | 34 | #[getter] 35 | pub fn as_string(&self) -> String { 36 | self.inner.to_string() 37 | } 38 | } 39 | 40 | impl Eq for PyModificationSpecificity {} 41 | 42 | #[pyfunction] 43 | #[pyo3(signature = (input=None))] 44 | pub fn py_validate_mods(input: Option<&Bound<'_, PyDict>>) -> HashMap { 45 | // unwrap the input 46 | let input = input.map(|d| d.extract::>().unwrap()); 47 | // validate the mods 48 | let output = validate_mods(input); 49 | // convert to a py dict 50 | let py_validated_mods = output 51 | .iter() 52 | .map(|(k, v)| (PyModificationSpecificity { inner: 
k.clone() }, *v)) 53 | .collect::>(); 54 | 55 | py_validated_mods 56 | } 57 | 58 | #[pyfunction] 59 | #[pyo3(signature = (input=None))] 60 | pub fn py_validate_var_mods( 61 | input: Option<&Bound<'_, PyDict>>, 62 | ) -> HashMap> { 63 | // unwrap the input 64 | let input = input.map(|d| d.extract::>>().unwrap()); 65 | let mut output: HashMap> = HashMap::new(); 66 | 67 | if let Some(input) = input { 68 | for (s, mass) in input { 69 | match ModificationSpecificity::from_str(&s) { 70 | Ok(m) => { 71 | output.insert(PyModificationSpecificity { inner: m }, mass); 72 | } 73 | Err(InvalidModification::Empty) => { 74 | log::error!("Skipping invalid modification string: empty") 75 | } 76 | Err(InvalidModification::InvalidResidue(c)) => { 77 | log::error!( 78 | "Skipping invalid modification string: unrecognized residue ({})", 79 | c 80 | ) 81 | } 82 | Err(InvalidModification::TooLong(s)) => { 83 | log::error!("Skipping invalid modification string: {} is too long", s) 84 | } 85 | } 86 | } 87 | } 88 | output 89 | } 90 | 91 | #[pymodule] 92 | pub fn py_modification(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 93 | m.add_class::()?; 94 | m.add_wrapped(wrap_pyfunction!(py_validate_mods))?; 95 | m.add_wrapped(wrap_pyfunction!(py_validate_var_mods))?; 96 | Ok(()) 97 | } 98 | -------------------------------------------------------------------------------- /sagepy-connector/src/py_peptide.rs: -------------------------------------------------------------------------------- 1 | use pyo3::exceptions::PyValueError; 2 | use pyo3::prelude::*; 3 | use std::sync::Arc; 4 | 5 | use crate::py_enzyme::{PyDigest, PyPosition}; 6 | use sage_core::peptide::Peptide; 7 | 8 | #[pyclass] 9 | #[derive(Clone)] 10 | pub struct PyPeptide { 11 | pub inner: Peptide, 12 | } 13 | 14 | #[pymethods] 15 | impl PyPeptide { 16 | #[new] 17 | #[pyo3(signature = (decoy, sequence, modifications, mono_isotopic, missed_cleavages, position, proteins, semi_enzymatic, n_term=None, c_term=None))] 18 | pub fn new( 
19 | decoy: bool, 20 | sequence: String, 21 | modifications: Vec, 22 | mono_isotopic: f32, 23 | missed_cleavages: u8, 24 | position: PyPosition, 25 | proteins: Vec, 26 | semi_enzymatic: bool, 27 | n_term: Option, 28 | c_term: Option, 29 | ) -> PyResult { 30 | let sequence_bytes = sequence.into_bytes(); // Convert the string to Vec 31 | let boxed_sequence = sequence_bytes.into_boxed_slice(); // Convert Vec to Box<[u8]> 32 | let arc_sequence = Arc::from(boxed_sequence); // Convert Box<[u8]> to Arc<[u8]> without dereferencing 33 | 34 | // Convert Python list of strings to Vec> 35 | let arc_proteins = proteins.into_iter().map(Arc::from).collect(); 36 | 37 | Ok(PyPeptide { 38 | inner: Peptide { 39 | decoy, 40 | sequence: arc_sequence, 41 | modifications: modifications, 42 | nterm: n_term, 43 | cterm: c_term, 44 | monoisotopic: mono_isotopic, 45 | missed_cleavages, 46 | position: position.inner, 47 | proteins: arc_proteins, 48 | semi_enzymatic, 49 | }, 50 | }) 51 | } 52 | 53 | #[staticmethod] 54 | fn try_new_from_digest(digest: &PyDigest) -> PyResult { 55 | let peptide = Peptide::try_from(digest.inner.clone()) 56 | .map_err(|_e| PyErr::new::(format!("Error creating peptide.")))?; 57 | Ok(PyPeptide { inner: peptide }) 58 | } 59 | 60 | #[getter] 61 | pub fn decoy(&self) -> bool { 62 | self.inner.decoy 63 | } 64 | 65 | #[getter] 66 | pub fn sequence(&self) -> &str { 67 | std::str::from_utf8(&self.inner.sequence).unwrap() 68 | } 69 | 70 | #[getter] 71 | pub fn modifications(&self) -> Vec { 72 | self.inner.modifications.clone() 73 | } 74 | 75 | #[getter] 76 | pub fn n_term(&self) -> Option { 77 | self.inner.nterm 78 | } 79 | 80 | #[getter] 81 | pub fn c_term(&self) -> Option { 82 | self.inner.cterm 83 | } 84 | 85 | #[getter] 86 | pub fn monoisotopic(&self) -> f32 { 87 | self.inner.monoisotopic 88 | } 89 | 90 | #[getter] 91 | pub fn missed_cleavages(&self) -> u8 { 92 | self.inner.missed_cleavages 93 | } 94 | 95 | #[getter] 96 | pub fn position(&self) -> PyPosition { 97 | 
PyPosition { 98 | inner: self.inner.position, 99 | } 100 | } 101 | 102 | #[getter] 103 | pub fn proteins(&self) -> Vec { 104 | self.inner.proteins.iter().map(|s| s.to_string()).collect() 105 | } 106 | 107 | #[getter] 108 | pub fn semi_enzymatic(&self) -> bool { 109 | self.inner.semi_enzymatic 110 | } 111 | 112 | #[pyo3(signature = (keep_ends=None))] 113 | pub fn reverse(&self, keep_ends: Option) -> PyPeptide { 114 | PyPeptide { inner: self.inner.reverse(keep_ends.unwrap_or(true)), } 115 | } 116 | 117 | #[pyo3(signature = (keep_ends=None))] 118 | pub fn shuffle(&self, keep_ends: Option) -> PyPeptide { 119 | PyPeptide { inner: self.inner.shuffle(keep_ends.unwrap_or(true)), } 120 | } 121 | } 122 | 123 | #[pymodule] 124 | pub fn py_peptide(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 125 | m.add_class::()?; 126 | Ok(()) 127 | } 128 | -------------------------------------------------------------------------------- /sagepy-connector/src/py_qfdr.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3::types::PyList; 3 | use qfdrust::dataset::TDCMethod; 4 | use qfdrust::picked::{protein_id_from_psm, spectrum_q_value, picked_peptide, picked_protein}; 5 | use qfdrust::psm::Psm; 6 | use crate::py_scoring::PyPsm; 7 | 8 | #[pyclass] 9 | #[derive(Clone)] 10 | pub struct PyTDCMethod { 11 | pub inner: TDCMethod, 12 | } 13 | 14 | #[pymethods] 15 | impl PyTDCMethod { 16 | #[new] 17 | fn new(method: &str) -> Self { 18 | PyTDCMethod { 19 | inner: TDCMethod::from_str(method), 20 | } 21 | } 22 | pub fn to_str(&self) -> &str { 23 | self.inner.to_str() 24 | } 25 | } 26 | 27 | #[pyfunction] 28 | pub fn target_decoy_competition( 29 | method: &PyTDCMethod, 30 | spectra_idx: Vec, 31 | match_idx: Vec, 32 | target: Vec, 33 | scores: Vec, 34 | match_identity_candidates: Vec>>, 35 | ) -> (Vec, Vec, Vec>, Vec, Vec, Vec) { 36 | let method = method.inner.clone(); 37 | 38 | let (spec_idx, match_idx, match_identity, decoy, 
scores, q_values) = 39 | qfdrust::dataset::target_decoy_competition( 40 | method, 41 | spectra_idx, 42 | match_idx, 43 | target, 44 | scores, 45 | match_identity_candidates, 46 | ); 47 | 48 | (spec_idx, match_idx, match_identity, decoy, scores, q_values) 49 | } 50 | 51 | #[pyfunction] 52 | pub fn assign_spectrum_q(_py: Python, psm_collection: &Bound<'_, PyList>, use_hyper_score: bool) -> PyResult<()> { 53 | let inner_collection: Vec = psm_collection 54 | .iter() 55 | .map(|item| { 56 | let feature: Bound<'_, PyPsm> = item.extract().expect("Failed to extract PyPsm"); 57 | feature.borrow().inner.clone() 58 | }) 59 | .collect(); 60 | 61 | let q_values = spectrum_q_value(&inner_collection, use_hyper_score); 62 | 63 | for (index, q_value) in q_values.iter().enumerate() { 64 | let feature: Bound<'_, PyPsm> = psm_collection.get_item(index).expect("Failed to get PyPsm").extract()?; 65 | let mut feature_borrow = feature.borrow_mut(); 66 | feature_borrow.inner.sage_feature.spectrum_q = *q_value as f32; 67 | } 68 | 69 | Ok(()) 70 | } 71 | 72 | #[pyfunction] 73 | pub fn assign_peptide_q(_py: Python, psm_collection: &Bound<'_, PyList>, use_hyper_score: bool) -> PyResult<()> { 74 | let mut inner_collection: Vec = psm_collection 75 | .iter() 76 | .map(|item| { 77 | let feature: Bound<'_, PyPsm> = item.extract().expect("Failed to extract PyPsm"); 78 | feature.borrow().inner.clone() 79 | }) 80 | .collect(); 81 | 82 | let q_values = picked_peptide(&mut inner_collection, use_hyper_score); 83 | 84 | for (index, _) in psm_collection.iter().enumerate() { 85 | let feature: Bound<'_, PyPsm> = psm_collection.get_item(index).expect("Failed to get PyPsm").extract()?; 86 | let mut feature_borrow = feature.borrow_mut(); 87 | 88 | let key = match feature_borrow.inner.sage_feature.label { 89 | -1 => feature_borrow.inner.sequence_decoy.clone().unwrap().sequence.clone(), 90 | _ => feature_borrow.inner.sequence.clone().unwrap().sequence.clone(), 91 | }; 92 | 93 | 
feature_borrow.inner.sage_feature.peptide_q = *q_values.get(&key).unwrap_or(&1.0) as f32; 94 | } 95 | 96 | Ok(()) 97 | } 98 | 99 | #[pyfunction] 100 | pub fn assign_protein_q(_py: Python, psm_collection: &Bound<'_, PyList>, use_hyper_score: bool) -> PyResult<()> { 101 | let mut inner_collection: Vec = psm_collection 102 | .iter() 103 | .map(|item| { 104 | let feature: Bound<'_, PyPsm> = item.extract().expect("Failed to extract PyPsm"); 105 | feature.borrow().inner.clone() 106 | }) 107 | .collect(); 108 | 109 | let q_values = picked_protein(&mut inner_collection, use_hyper_score); 110 | 111 | for (index, _) in psm_collection.iter().enumerate() { 112 | let feature: Bound<'_, PyPsm> = psm_collection.get_item(index).expect("Failed to get PyPsm").extract()?; 113 | let mut feature_borrow = feature.borrow_mut(); 114 | 115 | let key = protein_id_from_psm(&feature_borrow.inner, "rev_", true); 116 | 117 | feature_borrow.inner.sage_feature.protein_q = *q_values.get(&key).unwrap_or(&1.0) as f32; 118 | } 119 | 120 | Ok(()) 121 | } 122 | 123 | #[pymodule] 124 | pub fn py_qfdr(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 125 | m.add_class::()?; 126 | m.add_function(wrap_pyfunction!(target_decoy_competition, m)?)?; 127 | m.add_function(wrap_pyfunction!(assign_spectrum_q, m)?)?; 128 | m.add_function(wrap_pyfunction!(assign_peptide_q, m)?)?; 129 | m.add_function(wrap_pyfunction!(assign_protein_q, m)?)?; 130 | Ok(()) 131 | } -------------------------------------------------------------------------------- /sagepy-connector/src/py_retention_alignment.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3::types::PyList; 3 | use sage_core::ml::retention_alignment::{Alignment, global_alignment}; 4 | 5 | use sage_core::scoring::Feature; 6 | use crate::py_scoring::{PyFeature, PyPsm}; 7 | 8 | #[pyclass] 9 | #[derive(Clone)] 10 | pub struct PyAlignment { 11 | pub inner: Alignment, 12 | } 13 | 14 | #[pymethods] 15 
| impl PyAlignment { 16 | #[new] 17 | pub fn new( 18 | file_id: usize, 19 | max_rt: f32, 20 | slope: f32, 21 | intercept: f32, 22 | ) -> Self { 23 | PyAlignment { 24 | inner: Alignment { 25 | file_id, 26 | max_rt, 27 | slope, 28 | intercept, 29 | }, 30 | } 31 | } 32 | #[getter] 33 | pub fn file_id(&self) -> usize { 34 | self.inner.file_id 35 | } 36 | #[getter] 37 | pub fn max_rt(&self) -> f32 { 38 | self.inner.max_rt 39 | } 40 | #[getter] 41 | pub fn slope(&self) -> f32 { 42 | self.inner.slope 43 | } 44 | #[getter] 45 | pub fn intercept(&self) -> f32 { 46 | self.inner.intercept 47 | } 48 | } 49 | 50 | #[pyfunction] 51 | pub fn py_global_alignment( 52 | features: &Bound<'_, PyList>, 53 | n_files: usize, 54 | ) -> Vec { 55 | 56 | let mut inner_features: Vec = features.iter() 57 | .map(|item| { 58 | let feature: Bound<'_, PyFeature> = item.extract().expect("Failed to extract PyFeature"); 59 | feature.borrow().inner.clone() 60 | }) 61 | .collect(); 62 | 63 | global_alignment(&mut inner_features, n_files) 64 | .into_iter() 65 | .map(|alignment| PyAlignment { inner: alignment }) 66 | .collect() 67 | } 68 | 69 | #[pyfunction] 70 | pub fn py_global_alignment_psm( 71 | psms: &Bound<'_, PyList>, 72 | n_files: usize, 73 | ) -> Vec { 74 | // Step 1: clone out features + remember original index 75 | let indexed_psms: Vec<(usize, Feature)> = psms.iter() 76 | .enumerate() 77 | .map(|(i, item)| { 78 | let psm: Bound<'_, PyPsm> = item.extract().expect("Failed to extract PyPsm"); 79 | (i, psm.borrow().inner.sage_feature.clone()) 80 | }) 81 | .collect(); 82 | 83 | // Step 2: collect features to pass into alignment 84 | let mut features: Vec = indexed_psms.iter().map(|(_, feat)| feat.clone()).collect(); 85 | 86 | // Step 3: run global alignment on copied data 87 | let alignments = global_alignment(&mut features, n_files); 88 | 89 | // Step 4: write aligned_rt back into PyPsm 90 | for ((i, _), updated_feat) in indexed_psms.iter().zip(features.iter()) { 91 | let psm: Bound<'_, PyPsm> = 
psms.get_item(*i) 92 | .expect("Index out of range") 93 | .extract() 94 | .expect("Failed to extract PyPsm"); 95 | let mut psm_mut = psm.borrow_mut(); 96 | psm_mut.inner.sage_feature.aligned_rt = updated_feat.aligned_rt; 97 | } 98 | 99 | // Step 5: return the alignment parameters 100 | alignments 101 | .into_iter() 102 | .map(|alignment| PyAlignment { inner: alignment }) 103 | .collect() 104 | } 105 | 106 | #[pymodule] 107 | pub fn py_retention_alignment(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 108 | m.add_class::()?; 109 | m.add_function(wrap_pyfunction!(py_global_alignment, m)?)?; 110 | m.add_function(wrap_pyfunction!(py_global_alignment_psm, m)?)?; 111 | Ok(()) 112 | } -------------------------------------------------------------------------------- /sagepy-connector/src/py_retention_model.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3::types::PyList; 3 | use pyo3::exceptions::PyRuntimeError; 4 | use sage_core::ml::retention_model::predict; 5 | use sage_core::scoring::Feature; 6 | use crate::py_database::PyIndexedDatabase; 7 | use crate::py_scoring::{PyPsm}; 8 | 9 | #[pyfunction] 10 | pub fn py_predict_rt( 11 | _py: Python, 12 | psm_collection: &Bound<'_, PyList>, 13 | indexed_database: &PyIndexedDatabase, 14 | ) -> PyResult<()> { 15 | 16 | let indexed_feats: Vec<(usize, Feature)> = psm_collection.iter() 17 | .enumerate() 18 | .map(|(idx, item)| { 19 | let psm: Bound<'_, PyPsm> = item 20 | .extract() 21 | .expect("Failed to extract PyPsm"); 22 | // clone just the inner Feature (sage_feature) 23 | (idx, psm.borrow().inner.sage_feature.clone()) 24 | }) 25 | .collect(); 26 | 27 | let mut feats: Vec = indexed_feats.iter() 28 | .map(|(_, feat)| feat.clone()) 29 | .collect(); 30 | 31 | if predict(&indexed_database.inner, &mut feats).is_none() { 32 | return Err(PyRuntimeError::new_err( 33 | "Retention model fit failed: not enough data or R² < 0.7" 34 | )); 35 | } 36 | 37 | // 3) 
write back the two mutated fields 38 | for ((orig_idx, _), updated) in indexed_feats.iter().zip(feats.iter()) { 39 | let psm: Bound<'_, PyPsm> = psm_collection 40 | .get_item(*orig_idx) 41 | .expect("Failed to get PyPsm") 42 | .extract()?; 43 | let mut psm_borrow = psm.borrow_mut(); 44 | psm_borrow.inner.sage_feature.predicted_rt = updated.predicted_rt; 45 | psm_borrow.inner.sage_feature.delta_rt_model = updated.delta_rt_model; 46 | } 47 | 48 | Ok(()) 49 | } 50 | 51 | 52 | #[pymodule] 53 | pub fn py_retention_model(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 54 | m.add_function(wrap_pyfunction!(py_predict_rt, m)?)?; 55 | Ok(()) 56 | } -------------------------------------------------------------------------------- /sagepy-connector/src/py_tmt.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use sage_core::tmt::{Isobaric, Purity, TmtQuant}; 3 | use crate::py_scoring::PyFeature; 4 | use crate::py_spectrum::{PyPeak, PyProcessedSpectrum}; 5 | 6 | #[pyclass] 7 | pub struct PyIsobaric { 8 | pub inner: Isobaric, 9 | } 10 | 11 | #[pymethods] 12 | impl PyIsobaric { 13 | #[new] 14 | pub fn new( 15 | type_name: &str, 16 | ) -> Self { 17 | PyIsobaric { 18 | inner: match type_name { 19 | "tmt6" => Isobaric::Tmt6, 20 | "tmt10" => Isobaric::Tmt10, 21 | "tmt11" => Isobaric::Tmt11, 22 | "tmt16" => Isobaric::Tmt16, 23 | "tmt18" => Isobaric::Tmt18, 24 | _ => panic!("Invalid isobaric type"), 25 | }, 26 | } 27 | } 28 | #[getter] 29 | pub fn type_name(&self) -> String { 30 | match self.inner { 31 | Isobaric::Tmt6 => "tmt6".to_string(), 32 | Isobaric::Tmt10 => "tmt10".to_string(), 33 | Isobaric::Tmt11 => "tmt11".to_string(), 34 | Isobaric::Tmt16 => "tmt16".to_string(), 35 | Isobaric::Tmt18 => "tmt18".to_string(), 36 | _ => panic!("Invalid isobaric type"), 37 | } 38 | } 39 | 40 | pub fn modification_mass(&self) -> Option { 41 | self.inner.modification_mass() 42 | } 43 | } 44 | 45 | #[pyclass] 46 | 
#[derive(Clone)] 47 | pub struct PyPurity { 48 | pub inner: Purity, 49 | } 50 | 51 | #[pymethods] 52 | impl PyPurity { 53 | #[new] 54 | pub fn new(ratio: f32, correct_precursors: usize, incorrect_precursors: usize, ) -> Self { 55 | PyPurity { 56 | inner: Purity { 57 | ratio, 58 | correct_precursors, 59 | incorrect_precursors, 60 | }, 61 | } 62 | } 63 | 64 | #[getter] 65 | pub fn ratio(&self) -> f32 { 66 | self.inner.ratio 67 | } 68 | 69 | #[getter] 70 | pub fn correct_precursors(&self) -> usize { 71 | self.inner.correct_precursors 72 | } 73 | 74 | #[getter] 75 | pub fn incorrect_precursors(&self) -> usize { 76 | self.inner.incorrect_precursors 77 | } 78 | } 79 | 80 | #[pyclass] 81 | #[derive(Clone)] 82 | pub struct PyQuant { 83 | pub hit: PyFeature, 84 | pub hit_purity: PyPurity, 85 | pub spectrum: PyProcessedSpectrum, 86 | pub chimera: Option, 87 | pub chimera_purity: Option, 88 | pub intensities: Vec>, 89 | } 90 | 91 | #[pymethods] 92 | impl PyQuant { 93 | #[new] 94 | #[pyo3(signature = (hit, hit_purity, spectrum, intensities, chimera=None, chimera_purity=None))] 95 | pub fn new( 96 | hit: PyFeature, 97 | hit_purity: PyPurity, 98 | spectrum: PyProcessedSpectrum, 99 | intensities: Vec>, 100 | chimera: Option, 101 | chimera_purity: Option, 102 | ) -> Self { 103 | PyQuant { 104 | hit, 105 | hit_purity, 106 | spectrum, 107 | chimera, 108 | chimera_purity, 109 | intensities, 110 | } 111 | } 112 | 113 | #[getter] 114 | pub fn hit(&self) -> PyFeature { 115 | self.hit.clone() 116 | } 117 | 118 | #[getter] 119 | pub fn hit_purity(&self) -> PyPurity { 120 | self.hit_purity.clone() 121 | } 122 | 123 | #[getter] 124 | pub fn spectrum(&self) -> PyProcessedSpectrum { 125 | self.spectrum.clone() 126 | } 127 | 128 | #[getter] 129 | pub fn chimera(&self) -> Option { 130 | self.chimera.clone() 131 | } 132 | 133 | #[getter] 134 | pub fn chimera_purity(&self) -> Option { 135 | self.chimera_purity.clone() 136 | } 137 | 138 | #[getter] 139 | pub fn intensities(&self) -> Vec> { 140 | 
self.intensities.clone() 141 | } 142 | } 143 | 144 | #[pyclass] 145 | #[derive(Clone)] 146 | pub struct PyTmtQuant { 147 | pub inner: TmtQuant, 148 | } 149 | 150 | #[pymethods] 151 | impl PyTmtQuant { 152 | #[new] 153 | pub fn new( 154 | spec_id: String, 155 | file_id: usize, 156 | ion_injection_time: f32, 157 | peaks: Vec 158 | ) -> Self { 159 | PyTmtQuant { 160 | inner: TmtQuant { 161 | spec_id, 162 | file_id, 163 | ion_injection_time, 164 | peaks, 165 | }, 166 | } 167 | } 168 | 169 | #[getter] 170 | pub fn spec_id(&self) -> String { 171 | self.inner.spec_id.clone() 172 | } 173 | 174 | #[getter] 175 | pub fn file_id(&self) -> usize { 176 | self.inner.file_id 177 | } 178 | 179 | #[getter] 180 | pub fn ion_injection_time(&self) -> f32 { 181 | self.inner.ion_injection_time 182 | } 183 | 184 | #[getter] 185 | pub fn peaks(&self) -> Vec { 186 | self.inner.peaks.clone() 187 | } 188 | } 189 | 190 | 191 | #[pymodule] 192 | pub fn py_tmt(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 193 | m.add_class::()?; 194 | m.add_class::()?; 195 | m.add_class::()?; 196 | m.add_class::()?; 197 | Ok(()) 198 | } 199 | -------------------------------------------------------------------------------- /sagepy-connector/src/py_unimod.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use pyo3::prelude::*; 3 | use unimod::unimod::{unimod_modifications_mass_numerical, unimod_modifications_mass, quantized_mass_to_unimod, quanzie_mass, title_to_unimod_id, modification_atomic_composition}; 4 | 5 | #[pyfunction] 6 | fn unimod_modification_to_mass_numerical() -> HashMap { 7 | unimod_modifications_mass_numerical() 8 | } 9 | 10 | #[pyfunction] 11 | fn unimod_modification_to_mass() -> HashMap<&'static str, f64> { 12 | unimod_modifications_mass() 13 | } 14 | 15 | #[pyfunction] 16 | fn quantized_mass_to_unimod_candidates() -> HashMap> { 17 | quantized_mass_to_unimod() 18 | } 19 | 20 | #[pyfunction] 21 | fn 
quanzied_mass(mass: f32) -> i32 { 22 | quanzie_mass(mass) 23 | } 24 | 25 | #[pyfunction] 26 | fn title_to_unimod_ids() -> HashMap<&'static str, &'static str> { 27 | title_to_unimod_id() 28 | } 29 | 30 | #[pyfunction] 31 | fn modification_atomic_compositions() -> HashMap> { 32 | modification_atomic_composition() 33 | } 34 | 35 | #[pymodule] 36 | pub fn py_unimod(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 37 | m.add_function(wrap_pyfunction!(unimod_modification_to_mass_numerical, m)?)?; 38 | m.add_function(wrap_pyfunction!(unimod_modification_to_mass, m)?)?; 39 | m.add_function(wrap_pyfunction!(quantized_mass_to_unimod_candidates, m)?)?; 40 | m.add_function(wrap_pyfunction!(quanzied_mass, m)?)?; 41 | m.add_function(wrap_pyfunction!(title_to_unimod_ids, m)?)?; 42 | m.add_function(wrap_pyfunction!(modification_atomic_compositions, m)?)?; 43 | Ok(()) 44 | } -------------------------------------------------------------------------------- /sagepy-connector/src/py_utility.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use std::collections::{BTreeMap, HashMap, HashSet}; 3 | use qfdrust::psm::{compress_psms, decompress_psms, Psm}; 4 | use rayon::prelude::*; 5 | use rayon::ThreadPoolBuilder; 6 | use sage_core::ion_series::Kind; 7 | use sage_core::scoring::Fragments; 8 | use crate::py_scoring::{PyFragments, PyPsm}; 9 | use crate::utilities::sage_sequence_to_unimod_sequence; 10 | 11 | /// Converts a cosine similarity to an angle similarity. 12 | /// The angle similarity is calculated as 1 - angle / pi. 13 | /// 14 | /// # Arguments 15 | /// 16 | /// * `cosim` - A f32 representing the cosine similarity. 17 | /// 18 | /// # Returns 19 | /// 20 | /// * A f32 representing the angle similarity. 
21 | /// 22 | #[pyfunction] 23 | pub fn cosim_to_spectral_angle(cosim: f32) -> f32 { 24 | let angle = (1.0 - cosim).acos(); 25 | 1.0 - angle / std::f32::consts::PI 26 | } 27 | 28 | /// Reshape the flat prosit array into a 3D array of shape (29, 2, 3) 29 | /// 30 | /// # Arguments 31 | /// 32 | /// * `flat_array` - a vector of f64 representing the flat prosit array 33 | /// 34 | /// # Returns 35 | /// 36 | /// * `Vec>>` - a 3D array of shape (29, 2, 3) 37 | /// 38 | pub fn reshape_prosit_array(flat_array: Vec) -> Vec>> { 39 | let mut array_return: Vec>> = vec![vec![vec![0.0; 3]; 2]; 29]; 40 | let mut ptr = 0; 41 | 42 | for c in 0..3 { 43 | for row in 0..29 { 44 | // Fill in the Y ion values 45 | array_return[row][0][c] = flat_array[ptr]; 46 | ptr += 1; 47 | } 48 | for row in 0..29 { 49 | // Fill in the B ion values 50 | array_return[row][1][c] = flat_array[ptr]; 51 | ptr += 1; 52 | } 53 | } 54 | 55 | array_return 56 | } 57 | 58 | #[pyfunction] 59 | pub fn flat_prosit_array_to_fragments_map(flat_intensities: Vec) -> BTreeMap<(u32, i32, i32), f32> { 60 | // Reshape the flat prosit array into a 3D array of shape (29, 2, 3) 61 | let reshaped_intensities = reshape_prosit_array(flat_intensities); 62 | 63 | // create hashmap of (kind, charge, ordinal) -> intensity 64 | let mut fragments: BTreeMap<(u32, i32, i32), f32> = BTreeMap::new(); 65 | for z in 1..=3 { 66 | let intensity_b: Vec = reshaped_intensities[..].iter().map(|x| x[1][z as usize - 1]).collect(); 67 | for i in 1..=29 { 68 | let intensity = intensity_b[i as usize - 1]; 69 | if intensity >= 0.0 { 70 | fragments.insert((0, z, i), intensity); 71 | } 72 | } 73 | 74 | let intensity_y: Vec = reshaped_intensities[..].iter().map(|x| x[0][z as usize - 1]).collect(); 75 | for i in 1..=29 { 76 | let intensity = intensity_y[i as usize - 1]; 77 | if intensity >= 0.0 { 78 | fragments.insert((1, z, i), intensity); 79 | } 80 | } 81 | } 82 | fragments 83 | } 84 | 85 | #[pyfunction] 86 | pub fn 
py_fragments_to_fragments_map(fragments: &PyFragments, normalize: bool) -> BTreeMap<(u32, i32, i32), f32> { 87 | let mut fragments_map: BTreeMap<(u32, i32, i32), f32> = BTreeMap::new(); 88 | 89 | let max_intensity = fragments.inner.intensities.iter().cloned().fold(f32::NEG_INFINITY, f32::max); 90 | 91 | for i in 0..fragments.inner.mz_calculated.len() { 92 | let kind = match fragments.inner.kinds[i] { 93 | Kind::B => 0, 94 | Kind::Y => 1, 95 | _ => panic!("Invalid ion kind"), 96 | }; 97 | 98 | let intensity = if normalize { 99 | fragments.inner.intensities[i] / max_intensity 100 | } else { 101 | fragments.inner.intensities[i] 102 | }; 103 | 104 | fragments_map.insert((kind, 105 | fragments.inner.charges[i], 106 | fragments.inner.fragment_ordinals[i]), intensity); 107 | } 108 | fragments_map 109 | } 110 | 111 | pub fn _map_to_py_fragments(fragments: &HashMap<(u32, i32, i32), f32>, 112 | mz_calculated: Vec, mz_experimental: Vec) -> PyFragments { 113 | 114 | let mut kinds: Vec = Vec::new(); 115 | let mut ordinals: Vec = Vec::new(); 116 | let mut charges: Vec = Vec::new(); 117 | let mut intensities: Vec = Vec::new(); 118 | 119 | for (kind, ordinal, charge) in fragments.keys() { 120 | let intensity = fragments.get(&(*kind, *charge, *ordinal)).unwrap(); 121 | let kind = match kind { 122 | 0 => Kind::B, 123 | 1 => Kind::Y, 124 | _ => panic!("Invalid ion kind"), 125 | }; 126 | kinds.push(kind); 127 | ordinals.push(*ordinal); 128 | charges.push(*charge); 129 | intensities.push(*intensity); 130 | } 131 | 132 | let fragments = Fragments { 133 | mz_calculated, 134 | mz_experimental, 135 | kinds, 136 | fragment_ordinals: ordinals, 137 | charges, 138 | intensities, 139 | }; 140 | 141 | PyFragments { 142 | inner: fragments, 143 | } 144 | } 145 | 146 | #[pyfunction] 147 | pub fn psms_to_json(psms: Vec, num_threads: usize) -> Vec { 148 | let thread_pool = ThreadPoolBuilder::new().num_threads(num_threads).build().unwrap(); 149 | 150 | thread_pool.install(|| { 151 | 
psms.par_iter().map(|psm| { 152 | serde_json::to_string(&psm.inner).unwrap() 153 | }).collect() 154 | }) 155 | } 156 | 157 | #[pyfunction] 158 | pub fn psms_to_json_bin(psms: Vec) -> Vec { 159 | let inner_psms = psms.iter().map(|psm| psm.inner.clone()).collect::>(); 160 | bincode::serialize(&inner_psms).unwrap() 161 | } 162 | 163 | #[pyfunction] 164 | pub fn json_bin_to_psms(json_bin: Vec) -> Vec { 165 | let inner_psms: Vec = bincode::deserialize(&json_bin).unwrap(); 166 | inner_psms.iter().map(|psm| PyPsm { 167 | inner: psm.clone(), 168 | }).collect() 169 | } 170 | 171 | #[pyfunction] 172 | pub fn sage_sequence_to_unimod(sequence: String, modifications: Vec, expected_modifications: HashSet) -> String { 173 | sage_sequence_to_unimod_sequence(sequence, &modifications, &expected_modifications) 174 | } 175 | 176 | #[pyfunction] 177 | pub fn psms_to_feature_matrix(psms: Vec, num_threads: usize) -> Vec> { 178 | let thread_pool = ThreadPoolBuilder::new().num_threads(num_threads).build().unwrap(); 179 | 180 | thread_pool.install(|| { 181 | psms.par_iter().map(|psm| { 182 | psm.inner.get_feature_vector() 183 | } 184 | ).collect() 185 | }) 186 | } 187 | 188 | #[pyfunction] 189 | pub fn get_psm_sequences_par(psms: Vec, num_threads: usize) -> Vec { 190 | let thread_pool = ThreadPoolBuilder::new().num_threads(num_threads).build().unwrap(); 191 | 192 | thread_pool.install(|| { 193 | psms.par_iter().map(|psm| { 194 | psm.inner.sequence.clone().unwrap().sequence 195 | }).collect() 196 | }) 197 | } 198 | 199 | #[pyfunction] 200 | pub fn get_psm_peptide_idx_par(psms: Vec, num_threads: usize) -> Vec { 201 | let thread_pool = ThreadPoolBuilder::new().num_threads(num_threads).build().unwrap(); 202 | 203 | thread_pool.install(|| { 204 | psms.par_iter().map(|psm| { 205 | psm.inner.sage_feature.peptide_idx.0.clone() 206 | }).collect() 207 | }) 208 | } 209 | 210 | #[pyfunction] 211 | pub fn get_psm_sequences_modified_par(psms: Vec, num_threads: usize) -> Vec { 212 | let thread_pool = 
ThreadPoolBuilder::new().num_threads(num_threads).build().unwrap(); 213 | 214 | thread_pool.install(|| { 215 | psms.par_iter().map(|psm| { 216 | psm.inner.sequence_modified.clone().unwrap().sequence 217 | }).collect() 218 | }) 219 | } 220 | 221 | #[pyfunction] 222 | pub fn get_psm_sequences_decoy_par(psms: Vec, num_threads: usize) -> Vec { 223 | let thread_pool = ThreadPoolBuilder::new().num_threads(num_threads).build().unwrap(); 224 | 225 | thread_pool.install(|| { 226 | psms.par_iter().map(|psm| { 227 | psm.inner.sequence_decoy.clone().unwrap().sequence 228 | }).collect() 229 | }) 230 | } 231 | 232 | #[pyfunction] 233 | pub fn get_psm_sequences_decoy_modified_par(psms: Vec, num_threads: usize) -> Vec { 234 | let thread_pool = ThreadPoolBuilder::new().num_threads(num_threads).build().unwrap(); 235 | 236 | thread_pool.install(|| { 237 | psms.par_iter().map(|psm| { 238 | 239 | let sequence = match &psm.inner.sequence_decoy_modified { 240 | Some(seq) => seq.sequence.clone(), 241 | None => "".to_string(), 242 | }; 243 | 244 | sequence 245 | 246 | }).collect() 247 | }) 248 | } 249 | 250 | #[pyfunction] 251 | pub fn get_psm_spec_idx_par(psms: Vec, num_threads: usize) -> Vec { 252 | let thread_pool = ThreadPoolBuilder::new().num_threads(num_threads).build().unwrap(); 253 | 254 | thread_pool.install(|| { 255 | psms.par_iter().map(|psm| { 256 | psm.inner.spec_idx.clone() 257 | }).collect() 258 | }) 259 | } 260 | 261 | #[pyfunction] 262 | pub fn get_psm_proteins_par(psms: Vec, num_threads: usize) -> Vec> { 263 | let thread_pool = ThreadPoolBuilder::new().num_threads(num_threads).build().unwrap(); 264 | 265 | thread_pool.install(|| { 266 | psms.par_iter().map(|psm| { 267 | psm.inner.proteins.clone() 268 | }).collect() 269 | }) 270 | } 271 | 272 | #[pyfunction] 273 | pub fn py_compress_psms(psms: Vec) -> Vec { 274 | let inner_psms = psms.iter().map(|psm| psm.inner.clone()).collect::>(); 275 | compress_psms(&inner_psms).unwrap() 276 | } 277 | 278 | #[pyfunction] 279 | pub fn 
py_decompress_psms(psms_bin: Vec) -> Vec { 280 | let inner_psms: Vec = decompress_psms(&psms_bin.as_slice()).unwrap(); 281 | inner_psms.iter().map(|psm| PyPsm { 282 | inner: psm.clone(), 283 | }).collect() 284 | } 285 | 286 | #[pymodule] 287 | pub fn py_utility(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 288 | m.add_function(wrap_pyfunction!(flat_prosit_array_to_fragments_map, m)?)?; 289 | m.add_function(wrap_pyfunction!(py_fragments_to_fragments_map, m)?)?; 290 | m.add_function(wrap_pyfunction!(psms_to_json, m)?)?; 291 | m.add_function(wrap_pyfunction!(psms_to_json_bin, m)?)?; 292 | m.add_function(wrap_pyfunction!(json_bin_to_psms, m)?)?; 293 | m.add_function(wrap_pyfunction!(cosim_to_spectral_angle, m)?)?; 294 | m.add_function(wrap_pyfunction!(sage_sequence_to_unimod, m)?)?; 295 | m.add_function(wrap_pyfunction!(psms_to_feature_matrix, m)?)?; 296 | m.add_function(wrap_pyfunction!(get_psm_sequences_par, m)?)?; 297 | m.add_function(wrap_pyfunction!(get_psm_sequences_modified_par, m)?)?; 298 | m.add_function(wrap_pyfunction!(get_psm_sequences_decoy_par, m)?)?; 299 | m.add_function(wrap_pyfunction!(get_psm_sequences_decoy_modified_par, m)?)?; 300 | m.add_function(wrap_pyfunction!(get_psm_spec_idx_par, m)?)?; 301 | m.add_function(wrap_pyfunction!(get_psm_peptide_idx_par, m)?)?; 302 | m.add_function(wrap_pyfunction!(get_psm_proteins_par, m)?)?; 303 | m.add_function(wrap_pyfunction!(py_compress_psms, m)?)?; 304 | m.add_function(wrap_pyfunction!(py_decompress_psms, m)?)?; 305 | Ok(()) 306 | } -------------------------------------------------------------------------------- /sagepy-connector/src/utilities.rs: -------------------------------------------------------------------------------- 1 | use unimod::unimod::{quanzie_mass, quantized_mass_to_unimod}; 2 | use std::collections::HashSet; 3 | 4 | /// Convert a Sage sequence and modifications to a Unimod sequence 5 | /// 6 | /// # Arguments 7 | /// 8 | /// * `sequence` - A string representing the amino acid 
sequence 9 | /// * `modifications` - A vector of floats representing the modifications 10 | /// 11 | /// # Returns 12 | /// 13 | /// * `String` - A string representing the Unimod sequence 14 | /// 15 | pub fn sage_sequence_to_unimod_sequence(sequence: String, modifications: &Vec, expected_modifications: &HashSet) -> String { 16 | 17 | assert_eq!(sequence.len(), modifications.len(), "Sequence and modifications must be the same length"); 18 | 19 | // go over each char and check if modification is present (not 0.0) and possibly convert to unimod 20 | let mut unimod_sequence = String::new(); 21 | let unimod_modifications_qunatized = quantized_mass_to_unimod(); 22 | let empty_vec = Vec::new(); 23 | 24 | for (idx, aa) in sequence.chars().enumerate() { 25 | 26 | // add amino acid to the unimod sequence 27 | unimod_sequence.push(aa); 28 | 29 | // check if the modification is nonzero, need to translate to unimod 30 | if modifications[idx] != 0.0 { 31 | 32 | // quantize the mass from nonzero modification 33 | let quantized_mass = quanzie_mass(modifications[idx]); 34 | 35 | // find the candidate modifications for the quantized mass 36 | let modifications = unimod_modifications_qunatized.get(&quantized_mass).unwrap_or(&empty_vec); 37 | 38 | let mut found = false; 39 | 40 | // check if the expected modification is in the candidate modifications 41 | for modification in modifications { 42 | if expected_modifications.contains(&modification.to_string()) { 43 | unimod_sequence.push_str(modification); 44 | found = true; 45 | } 46 | } 47 | 48 | // if the expected modification is not found, add a placeholder 49 | if !found { 50 | unimod_sequence.push_str("[UNIMOD:?]"); 51 | } 52 | } 53 | } 54 | unimod_sequence 55 | } -------------------------------------------------------------------------------- /sagepy/README.md: -------------------------------------------------------------------------------- 1 | # SAGEpy 2 | A python interface to the core [SAGE](https://github.com/lazear/sage) 
search engine for mass spectrometry proteomics 3 | 4 |

5 | logo 6 |

7 | 8 | ## Installation 9 | `sagepy` is now available via pip: 10 | ``` 11 | pip install sagepy 12 | ``` 13 | 14 | 15 | ### Build from source 16 | 17 | 1. Clone our fork of the SAGE repository: 18 | ``` 19 | git clone git@github.com:theGreatHerrLebert/sage.git 20 | ``` 21 | 22 | 2. Install the sage-core bindings using maturin, optionally in a virtual environment: 23 | ``` 24 | cd sage/crates/sagepy-connector 25 | 26 | # Install maturin 27 | pip install maturin 28 | 29 | # Build and install the bindings 30 | maturin build --release 31 | 32 | # Install the bindings 33 | pip install target/wheels/sagepy_connector-0.1.0-cp38-cp38-manylinux2014_x86_64.whl [--force-reinstall] 34 | ``` 35 | This will provide you with a python exposed version of the core SAGE library. 36 | 37 | 3. Install the sagepy python package with poetry: 38 | ``` 39 | git clone git@github.com:theGreatHerrLebert/sagepy.git 40 | 41 | cd sagepy 42 | 43 | # Install poetry 44 | pip install poetry 45 | 46 | # Install sagepy 47 | poetry install 48 | ``` 49 | 50 | ## Usage 51 | `sagepy` is a python interface to the core SAGE search engine. It exposes 52 | the core functionality of SAGE in a pythonic way, allowing you to use it for a direct integration 53 | into your python-based proteomics workflow. So far, it mainly mirrors structs that are available 54 | in the core SAGE library. 
55 | 56 | ### Example generation of a sage database 57 | ```python 58 | import numpy as np 59 | from sagepy.core import EnzymeBuilder, SageSearchConfiguration 60 | 61 | # configure a trypsin-like digestor of fasta files 62 | enzyme_builder = EnzymeBuilder( 63 | missed_cleavages=2, 64 | min_len=5, 65 | max_len=50, 66 | cleave_at='KR', 67 | restrict='P', 68 | c_terminal=True, 69 | ) 70 | 71 | # UPDATE: Modification handling is simplified, using canonical UNIMOD notation 72 | static_mods = {"C": "[UNIMOD:4]"} # static cysteine modification 73 | variable_mods = {"M": ["[UNIMOD:35]"]} 74 | 75 | with open('path/to/reference.fasta', 'r') as infile: 76 | fasta = infile.read() 77 | 78 | # set-up a config for a sage-database 79 | sage_config = SageSearchConfiguration( 80 | fasta=fasta, 81 | static_mods=static_mods, 82 | variable_mods=variable_mods, 83 | enzyme_builder=enzyme_builder, 84 | generate_decoys=True, 85 | bucket_size=int(np.power(2, 14)) 86 | ) 87 | 88 | # generate the database for searching against 89 | indexed_db = sage_config.generate_indexed_database() 90 | ``` 91 | 92 | ### Generate a query 93 | ```python 94 | import numpy as np 95 | from sagepy.core import Precursor, RawSpectrum, ProcessedSpectrum, SpectrumProcessor, Tolerance, Scorer, Representation 96 | 97 | ### Example search of a sage database 98 | precursor = Precursor( 99 | charge=2, 100 | mz=506.77, 101 | ) 102 | 103 | intensity = np.array([ 202., 170., 205., 152., 1069., 595., 198., 805., 187., 104 | 194., 197., 169., 196., 209., 638., 372., 235., 399., 105 | 194., 185., 181., 170., 407., 150., 157., 175., 273., 106 | 1135., 881., 337., 311., 243., 310., 153., 162., 210., 107 | 277., 206., 189., 259., 658., 383., 166., 169., 219., 108 | 186., 221., 193., 367., 283., 237., 157., 372., 1276., 109 | 1618., 1102., 404., 232., 456., 765., 507., 223., 258., 110 | 402., 187., 158., 153., 304., 218., 223., 156., 1605., 111 | 1165., 1062., 434., 208., 155., 197., 221., 697., 397., 112 | 180., 195., 512., 252., 
367., 305., 335., 175., 174., 113 | 296., 212.], dtype=np.float32) 114 | 115 | mz = np.array([272.16873692, 356.16844797, 406.71079396, 406.71396814, 116 | 406.71714233, 406.72031653, 407.21246768, 407.21564382, 117 | 407.21881996, 407.22199612, 407.7144506 , 407.71762869, 118 | 488.27537883, 488.28581266, 499.29228981, 499.29580676, 119 | 499.29932372, 499.30284069, 506.75478369, 507.26157767, 120 | 541.26272227, 553.29188809, 577.30432041, 577.30810217, 121 | 595.32672633, 597.2907525 , 603.27568881, 614.32036769, 122 | 614.32426881, 614.32816995, 615.3272682 , 615.33117252, 123 | 616.33108578, 617.33572156, 636.30924838, 637.30619081, 124 | 637.31016425, 665.36284673, 666.36197292, 674.35335834, 125 | 674.35744565, 674.36153297, 675.35511968, 675.36330039, 126 | 679.3531909 , 680.35044702, 680.35455247, 687.36822726, 127 | 687.37648041, 688.37547678, 697.3616813 , 700.3617026 , 128 | 715.36157366, 715.36578342, 715.36999319, 715.37420297, 129 | 715.37841277, 715.38262258, 716.36384605, 716.37227148, 130 | 716.38069696, 717.37103577, 725.35228543, 749.39291293, 131 | 749.39722166, 750.38424802, 786.44692356, 786.45575152, 132 | 787.4492132 , 787.45804678, 795.39284711, 812.41777208, 133 | 812.42225834, 812.42674462, 812.4312309 , 812.44020351, 134 | 813.40504794, 813.41851494, 813.42300396, 813.427493 , 135 | 813.43198205, 813.44544927, 814.43784098, 828.42202737, 136 | 828.4265576 , 851.43464868, 899.45327427, 899.46271517, 137 | 912.45278821, 913.44673363, 915.45053417, 915.46482091], dtype=np.float32) 138 | 139 | raw_spectrum = RawSpectrum( 140 | file_id=1, 141 | spec_id='DEMO-SPEC', 142 | total_ion_current=12667.0, 143 | precursors=[precursor], 144 | mz=mz, 145 | intensity=intensity 146 | ) 147 | 148 | spec_processor = SpectrumProcessor(take_top_n=75) 149 | query = spec_processor.process(raw_spectrum) 150 | ``` 151 | 152 | ### Search a database 153 | ```python 154 | from sagepy.core import Scorer 155 | 156 | # UPDATE: pass modifications to the scorer, 
necessary for PTM handling 157 | scorer = Scorer(report_psms=2, min_matched_peaks=5, variable_mods=variable_mods, static_mods=static_mods) 158 | results = scorer.score(db=indexed_db, spectrum=query) 159 | ``` 160 | 161 | potential output: 162 | ``` 163 | [Feature(idx: PeptideIx(1009105), peptide_len: 9, spec_id: DEMO-SPEC, file_id: 1, rank: 1, label: 1, exp. mass: 1011.5254516601562, cal. mass: 1011.5347900390625, charge: 2, retention time: 0.0, aligned rt: 0.0, predicted rt: 0.0, delta rt model: 0.9990000128746033, delta mass: 2989.41943359375, isotope error: 3.010050058364868, average ppm: 5.889466285705566, hyperscore: 15.020833459653923, delta_next: 0.0, delta_best: 0.0, matched peaks: 5, longest b: 0,longest y: 4, longest y pct: 0.4444444477558136, missed cleavages: 0, matched intensity pct: 14.81151294708252, scored candidates: 9340, poisson: -2.177020383746938, discriminant score: 0.0, posterior error: 1.0, spectrum q: 1.0, peptide q: 1.0, protein q: 1.0, ms2 intensity: 4652.0, ms1 intensity: 0.0), Feature(idx: PeptideIx(1009105), peptide_len: 9, spec_id: DEMO-SPEC, file_id: 1, rank: 2, label: 1, exp. mass: 1011.5254516601562, cal. 
import os
from typing import List, Union, Dict, Any
import pandas as pd

from imspy.timstof.dda import TimsDatasetDDA

from imspy.timstof.dbsearch.utility import (
    get_ms1_ims_spectrum,
    sanitize_mz,
    sanitize_charge,
    get_searchable_spec
)

from sagepy.core import (
    SpectrumProcessor,
    Precursor,
    Tolerance
)

def process_timstof_datasets(
        dataset_dirs: Union[str, List[str]],
        use_bruker_sdk: bool = False,
        max_peaks: int = 10_000,
        num_threads: int = 16,
        ms1_take_top_n: int = 10_000,
        ms1_deisotope: bool = True,
        fragment_take_top_n: int = 150
) -> Dict[str, Dict[str, Any]]:
    """
    Process one or more Bruker TIMS .d folders to extract summarized fragment DataFrames
    and MS1 spectra suitable for downstream search.

    Parameters
    ----------
    dataset_dirs : str or list of str
        A path or list of paths to the .d folders to process.
    use_bruker_sdk : bool, default False
        Whether to use the Bruker SDK when reading data.
    max_peaks : int, default 10000
        Maximum number of peaks to collect per precursor frame.
    num_threads : int, default 16
        Number of threads for parallel data extraction.
    ms1_take_top_n : int, default 10000
        Number of top peaks to keep when processing MS1 spectra.
    ms1_deisotope : bool, default True
        Whether to deisotope MS1 spectra.
        NOTE(review): this parameter is currently not used anywhere in the
        function body — confirm whether it should be forwarded to
        get_ms1_ims_spectrum or the SpectrumProcessor.
    fragment_take_top_n : int, default 150
        Number of top peaks to keep when processing fragment spectra.

    Returns
    -------
    results : dict
        A mapping from each dataset directory to a dict with keys:
        'fragments' : pd.DataFrame of summarized fragment ions,
        'ms1_spectra' : List of processed MS1 spectra objects.
    """
    # accept a single directory as a convenience; normalize to a list
    if isinstance(dataset_dirs, str):
        dataset_dirs = [dataset_dirs]

    results: Dict[str, Dict[str, Any]] = {}
    # file_id (the enumeration index) is baked into every spectrum so spectra
    # from different datasets stay distinguishable downstream
    for file_id, dataset_dir in enumerate(dataset_dirs):
        ds_name = os.path.basename(dataset_dir.rstrip(os.sep))
        handle = TimsDatasetDDA(dataset_dir, use_bruker_sdk=use_bruker_sdk)

        # Extract precursor frames and process MS1 spectra
        precursor_frames = handle.get_precursor_frames(
            max_peaks=max_peaks,
            num_threads=num_threads
        )
        ms1_processor = SpectrumProcessor(
            take_top_n=ms1_take_top_n,
        )
        # retention_time is divided by 60 — presumably seconds to minutes;
        # TODO(review): confirm the unit expected by get_ms1_ims_spectrum
        ms1_spectra = [
            get_ms1_ims_spectrum(
                raw_spectrum=spec,
                spec_id=f"{spec.frame_id}-{ds_name}",
                time=spec.retention_time / 60,
                spec_processor=ms1_processor,
                file_id=file_id
            ) for spec in precursor_frames
        ]

        # free the raw frames as early as possible; they can be large
        del precursor_frames

        # Extract and summarize PASEF fragments: collapse the per-scan rows of
        # each precursor into one row, concatenating raw data ('sum') and
        # taking the first value of every scalar column
        fragments = handle.get_pasef_fragments(num_threads=num_threads)
        fragments = fragments.groupby('precursor_id').agg({
            'frame_id': 'first',
            'time': 'first',
            'precursor_id': 'first',
            'raw_data': 'sum',
            'scan_begin': 'first',
            'scan_end': 'first',
            'isolation_mz': 'first',
            'isolation_width': 'first',
            'collision_energy': 'first',
            'largest_peak_mz': 'first',
            'average_mz': 'first',
            'monoisotopic_mz': 'first',
            'charge': 'first',
            'average_scan': 'first',
            'intensity': 'first',
            'parent_id': 'first',
        })

        # Compute marginal ion mobility and build spec IDs
        fragments['mobility'] = fragments['raw_data'].apply(
            lambda rd: rd.get_inverse_mobility_along_scan_marginal()
        )
        fragments['spec_id'] = fragments.apply(
            lambda r: f"{r.frame_id}-{r.precursor_id}-{ds_name}",
            axis=1
        )

        # Build Precursor objects (monoisotopic m/z falls back to the largest
        # peak via sanitize_mz; charge is sanitized against missing values)
        fragments['sage_precursor'] = fragments.apply(
            lambda r: Precursor(
                mz=sanitize_mz(r['monoisotopic_mz'], r['largest_peak_mz']),
                intensity=r['intensity'],
                charge=sanitize_charge(r['charge']),
                isolation_window=Tolerance(da=(-3, 3)),
                collision_energy=r['collision_energy'],
                inverse_ion_mobility=r['mobility'],
                spectrum_ref=r['spec_id']
            ),
            axis=1
        )

        # Process fragment spectra for searching
        fragments['processed_spec'] = fragments.apply(
            lambda r: get_searchable_spec(
                precursor=r['sage_precursor'],
                raw_fragment_data=r['raw_data'],
                spec_processor=SpectrumProcessor(take_top_n=fragment_take_top_n),
                spec_id=r['spec_id'],
                time=r['time'],
                file_id=file_id
            ),
            axis=1
        )

        results[dataset_dir] = {
            'fragments': fragments,
            'ms1_spectra': ms1_spectra
        }

    return results
SageSearchConfiguration\n", 12 | "\n", 13 | "# configure a trypsin-like digestor of fasta files\n", 14 | "enzyme_builder = EnzymeBuilder(\n", 15 | " missed_cleavages=2, \n", 16 | " min_len=5, \n", 17 | " max_len=50, \n", 18 | " cleave_at='KR', \n", 19 | " restrict='P', \n", 20 | " c_terminal=True,\n", 21 | ")\n", 22 | "\n", 23 | "# UPDATE: Modification handling is simplified, using canonical UNIMOD notation\n", 24 | "static_mods = {\"C\": \"[UNIMOD:4]\"} # static cysteine modification\n", 25 | "variable_mods = {\"M\": [\"[UNIMOD:35]\"]}\n", 26 | "\n", 27 | "with open('/media/hd02/data/fasta/hela/plain/uniprotkb_proteome_UP000005640_AND_revi_2024_05_21.fasta', 'r') as infile:\n", 28 | " fasta = infile.read()\n", 29 | "\n", 30 | "# set-up a config for a sage-database\n", 31 | "sage_config = SageSearchConfiguration(\n", 32 | " fasta=fasta,\n", 33 | " static_mods=static_mods,\n", 34 | " variable_mods=variable_mods,\n", 35 | " enzyme_builder=enzyme_builder,\n", 36 | " generate_decoys=True,\n", 37 | " bucket_size=int(np.power(2, 14))\n", 38 | ")\n", 39 | "\n", 40 | "# generate the database for searching against\n", 41 | "indexed_db = sage_config.generate_indexed_database()" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "id": "7003b8b7-1342-44be-ab0a-4a5e9131388f", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "import numpy as np\n", 52 | "from sagepy.core import Precursor, RawSpectrum, ProcessedSpectrum, SpectrumProcessor, Tolerance, Scorer, Representation\n", 53 | "\n", 54 | "### Example search of a sage database\n", 55 | "precursor = Precursor(\n", 56 | " charge=2,\n", 57 | " mz=506.77,\n", 58 | ")\n", 59 | "\n", 60 | "intensity = np.array([ 202., 170., 205., 152., 1069., 595., 198., 805., 187.,\n", 61 | " 194., 197., 169., 196., 209., 638., 372., 235., 399.,\n", 62 | " 194., 185., 181., 170., 407., 150., 157., 175., 273.,\n", 63 | " 1135., 881., 337., 311., 243., 310., 153., 162., 210.,\n", 64 | " 277., 206., 189., 
259., 658., 383., 166., 169., 219.,\n", 65 | " 186., 221., 193., 367., 283., 237., 157., 372., 1276.,\n", 66 | " 1618., 1102., 404., 232., 456., 765., 507., 223., 258.,\n", 67 | " 402., 187., 158., 153., 304., 218., 223., 156., 1605.,\n", 68 | " 1165., 1062., 434., 208., 155., 197., 221., 697., 397.,\n", 69 | " 180., 195., 512., 252., 367., 305., 335., 175., 174.,\n", 70 | " 296., 212.], dtype=np.float32)\n", 71 | "\n", 72 | "mz = np.array([272.16873692, 356.16844797, 406.71079396, 406.71396814,\n", 73 | " 406.71714233, 406.72031653, 407.21246768, 407.21564382,\n", 74 | " 407.21881996, 407.22199612, 407.7144506 , 407.71762869,\n", 75 | " 488.27537883, 488.28581266, 499.29228981, 499.29580676,\n", 76 | " 499.29932372, 499.30284069, 506.75478369, 507.26157767,\n", 77 | " 541.26272227, 553.29188809, 577.30432041, 577.30810217,\n", 78 | " 595.32672633, 597.2907525 , 603.27568881, 614.32036769,\n", 79 | " 614.32426881, 614.32816995, 615.3272682 , 615.33117252,\n", 80 | " 616.33108578, 617.33572156, 636.30924838, 637.30619081,\n", 81 | " 637.31016425, 665.36284673, 666.36197292, 674.35335834,\n", 82 | " 674.35744565, 674.36153297, 675.35511968, 675.36330039,\n", 83 | " 679.3531909 , 680.35044702, 680.35455247, 687.36822726,\n", 84 | " 687.37648041, 688.37547678, 697.3616813 , 700.3617026 ,\n", 85 | " 715.36157366, 715.36578342, 715.36999319, 715.37420297,\n", 86 | " 715.37841277, 715.38262258, 716.36384605, 716.37227148,\n", 87 | " 716.38069696, 717.37103577, 725.35228543, 749.39291293,\n", 88 | " 749.39722166, 750.38424802, 786.44692356, 786.45575152,\n", 89 | " 787.4492132 , 787.45804678, 795.39284711, 812.41777208,\n", 90 | " 812.42225834, 812.42674462, 812.4312309 , 812.44020351,\n", 91 | " 813.40504794, 813.41851494, 813.42300396, 813.427493 ,\n", 92 | " 813.43198205, 813.44544927, 814.43784098, 828.42202737,\n", 93 | " 828.4265576 , 851.43464868, 899.45327427, 899.46271517,\n", 94 | " 912.45278821, 913.44673363, 915.45053417, 915.46482091], dtype=np.float32)\n", 95 
| "\n", 96 | "raw_spectrum = RawSpectrum(\n", 97 | " file_id=1,\n", 98 | " spec_id='DEMO-SPEC',\n", 99 | " total_ion_current=12667.0,\n", 100 | " precursors=[precursor],\n", 101 | " mz=mz,\n", 102 | " intensity=intensity,\n", 103 | ")\n", 104 | "\n", 105 | "spec_processor = SpectrumProcessor(take_top_n=75)\n", 106 | "query = spec_processor.process(raw_spectrum)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 3, 112 | "id": "f340808b-6126-4f91-a39a-654b63acfb21", 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "from sagepy.core import Scorer\n", 117 | "\n", 118 | "scorer = Scorer(report_psms=2, min_matched_peaks=5, variable_mods=variable_mods, static_mods=static_mods)\n", 119 | "results = scorer.score(db=indexed_db, spectrum=query)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 5, 125 | "id": "8b468c19-c1e7-43a1-9797-a336646d3acd", 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "[Feature(idx: PeptideIx(1014528), psm_id: 3, peptide_len: 9, spec_id: DEMO-SPEC, file_id: 1, rank: 1, label: 1, exp. mass: 1011.5254516601562, cal. 
mass: 1011.5347900390625, charge: 2, retention time: 0.0, aligned rt: 0.0, predicted rt: 0.0, delta rt model: 0.9990000128746033, delta mass: -1995.037109375, isotope error: 2.006700038909912, average ppm: 5.889466285705566, hyperscore: 15.020833685116404, delta_next: 0.0, delta_best: 0.0, matched peaks: 5, longest b: 0,longest y: 4, longest y pct: 0.4444444477558136, missed cleavages: 0, matched intensity pct: 14.81151294708252, scored candidates: 9359, poisson: -2.1735888459925277, discriminant score: 0.0, posterior error: 1.0, spectrum q: 1.0, peptide q: 1.0, protein q: 1.0, ms2 intensity: 4652.0), fragments: Fragments(charges: [1, 1, 1, 1, 1], ion_types: [IonType(Y), IonType(Y), IonType(Y), IonType(Y), IonType(Y)], fragment_ordinals: [8, 7, 6, 5, 4], intensities: [335.0, 1165.0, 1618.0, 1135.0, 399.0], mz_calculated: [899.4580078125, 812.4259643554688, 715.3732299804688, 614.3255615234375, 499.2986145019531], mz_experimental: [899.4627075195312, 812.4222412109375, 715.3699951171875, 614.3203735351562, 499.3028259277344])),\n", 132 | " Feature(idx: PeptideIx(1014528), psm_id: 4, peptide_len: 9, spec_id: DEMO-SPEC, file_id: 1, rank: 2, label: 1, exp. mass: 1011.5254516601562, cal. 
mass: 1011.5347900390625, charge: 2, retention time: 0.0, aligned rt: 0.0, predicted rt: 0.0, delta rt model: 0.9990000128746033, delta mass: -1001.641845703125, isotope error: 1.003350019454956, average ppm: 5.889466285705566, hyperscore: 15.020833685116404, delta_next: 0.0, delta_best: 0.0, matched peaks: 5, longest b: 0,longest y: 4, longest y pct: 0.4444444477558136, missed cleavages: 0, matched intensity pct: 14.81151294708252, scored candidates: 9359, poisson: -2.1735888459925277, discriminant score: 0.0, posterior error: 1.0, spectrum q: 1.0, peptide q: 1.0, protein q: 1.0, ms2 intensity: 4652.0), fragments: Fragments(charges: [1, 1, 1, 1, 1], ion_types: [IonType(Y), IonType(Y), IonType(Y), IonType(Y), IonType(Y)], fragment_ordinals: [8, 7, 6, 5, 4], intensities: [335.0, 1165.0, 1618.0, 1135.0, 399.0], mz_calculated: [899.4580078125, 812.4259643554688, 715.3732299804688, 614.3255615234375, 499.2986145019531], mz_experimental: [899.4627075195312, 812.4222412109375, 715.3699951171875, 614.3203735351562, 499.3028259277344]))]" 133 | ] 134 | }, 135 | "execution_count": 5, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "result_psm = scorer.score(db=indexed_db,spectrum=query)\n", 142 | "result_psm" 143 | ] 144 | } 145 | ], 146 | "metadata": { 147 | "kernelspec": { 148 | "display_name": "Python 3 (ipykernel)", 149 | "language": "python", 150 | "name": "python3" 151 | }, 152 | "language_info": { 153 | "codemirror_mode": { 154 | "name": "ipython", 155 | "version": 3 156 | }, 157 | "file_extension": ".py", 158 | "mimetype": "text/x-python", 159 | "name": "python", 160 | "nbconvert_exporter": "python", 161 | "pygments_lexer": "ipython3", 162 | "version": "3.11.11" 163 | } 164 | }, 165 | "nbformat": 4, 166 | "nbformat_minor": 5 167 | } 168 | -------------------------------------------------------------------------------- /sagepy/examples/rescoring/data/psm_data.csv: 
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1f9d469ae7fa55a195fde5ca5dbb5c270f6b1f2d58841c0d88778ea8e7b128d1 3 | size 98739151 4 | -------------------------------------------------------------------------------- /sagepy/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sagepy" 3 | version = "0.3.12" 4 | description = "" 5 | authors = ["theGreatHerrLebert "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = ">=3.10" 10 | pyteomics = ">=4.7.3" 11 | mokapot = ">=0.10.0" 12 | sagepy-connector = ">=0.3.12" 13 | numpy = "==1.26.4" 14 | pandas = ">=1.3.3" 15 | numba = ">=0.59.1" 16 | scikit-learn = ">=1.6.1" 17 | tqdm = ">=4.62.3" 18 | lxml = ">=5.3.0" 19 | 20 | [build-system] 21 | requires = ["poetry-core"] 22 | build-backend = "poetry.core.masonry.api" 23 | -------------------------------------------------------------------------------- /sagepy/sagepy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theGreatHerrLebert/sagepy/4382f7b24dd4e35bfa256e8c8a32ead610782e99/sagepy/sagepy/__init__.py -------------------------------------------------------------------------------- /sagepy/sagepy/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .scoring import Scorer, Fragments, IonType, Psm, Feature 2 | from .database import IndexedDatabase, EnzymeBuilder, SageSearchConfiguration 3 | from .spectrum import RawSpectrum, ProcessedSpectrum, ProcessedIMSpectrum, Precursor, SpectrumProcessor, Representation 4 | from .mass import Tolerance 5 | from .ion_series import IonSeries 6 | from .peptide import Peptide 7 | from .modification import SAGE_KNOWN_MODS, validate_mods, validate_var_mods 8 | 
-------------------------------------------------------------------------------- /sagepy/sagepy/core/enzyme.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import numpy as np 4 | import sagepy_connector 5 | psc = sagepy_connector.py_enzyme 6 | 7 | 8 | class Position: 9 | def __init__(self, position: Union[str, None] = None): 10 | """Position class representing location of peptide in protein 11 | 12 | Args: 13 | position (Union[str, None], optional): The position of the peptide in the protein. Defaults to None. 14 | """ 15 | if position is not None: 16 | try: 17 | self.__position_ptr = psc.PyPosition.from_string(position) 18 | except ValueError: 19 | raise ValueError("Invalid position string, allowed values are: c_term, n_term, full, internal") 20 | else: 21 | self.__position_ptr = psc.PyPosition.from_string('internal') 22 | 23 | @classmethod 24 | def from_py_position(cls, position: psc.PyPosition): 25 | instance = cls.__new__(cls) 26 | instance.__position_ptr = position 27 | return instance 28 | 29 | @property 30 | def position(self): 31 | return self.__position_ptr.to_string 32 | 33 | def __repr__(self): 34 | return f"Position({self.position})" 35 | 36 | def get_py_ptr(self): 37 | return self.__position_ptr 38 | 39 | 40 | class Enzyme: 41 | def __init__(self, cleave_pattern: str = 'KR', c_terminal: bool = True, skip_suffix: str = 'P', semi_enzymatic: bool = False): 42 | """Enzyme class, default enzyme is Trypsin 43 | 44 | Args: 45 | cleave_pattern (str, optional): The cleavage pattern of the enzyme. Defaults to 'KR'. 46 | c_terminal (bool, optional): Cleave from the C-terminal. Defaults to True. 47 | skip_suffix (str, optional): The skip suffix of the enzyme. Defaults to 'P'. 48 | semi_enzymatic (bool, optional): Is the enzyme semi enzymatic. Defaults to False. 
class Digest:
    """A single peptide produced by an in-silico enzymatic digest."""

    def __init__(self, decoy: bool, sequence: str, protein: str, missed_cleavages: int, position: Position):
        """Create a Digest.

        Args:
            decoy (bool): Is the digest peptide a decoy
            sequence (str): The sequence of the digest peptide
            protein (str): The protein that the digest peptide is found in
            missed_cleavages (int): The number of missed cleavages
            position (Position): The position of the digest peptide in the protein
        """
        self.__digest_ptr = psc.PyDigest(decoy, sequence, protein, missed_cleavages, position.get_py_ptr())

    @classmethod
    def from_py_digest(cls, digest: psc.PyDigest):
        """Wrap an existing PyDigest without copying."""
        instance = cls.__new__(cls)
        instance.__digest_ptr = digest
        return instance

    @property
    def decoy(self):
        return self.__digest_ptr.decoy

    @property
    def sequence(self):
        return self.__digest_ptr.sequence

    @property
    def protein(self):
        return self.__digest_ptr.protein

    @property
    def missed_cleavages(self):
        return self.__digest_ptr.missed_cleavages

    @property
    def position(self):
        # returns the raw Rust-side position object (not wrapped in Position)
        return self.__digest_ptr.position

    def reverse(self):
        # delegates to PyDigest.reverse(); the result is returned unwrapped
        return self.__digest_ptr.reverse()

    def __eq__(self, other):
        # NOTE(review): assumes `other` is also a Digest; any other type
        # raises AttributeError (original behavior, kept as-is)
        return self.__digest_ptr == other.__digest_ptr

    def __hash__(self):
        return hash(self.__digest_ptr)

    def __repr__(self):
        return f"Digest(decoy: {self.decoy}, protein: {self.protein}, " \
               f"missed_cleavages: {self.missed_cleavages}, position: {self.position}, sequence: {self.sequence})"

    def get_py_ptr(self):
        """Return the underlying PyDigest pointer object."""
        return self.__digest_ptr
class Fasta:
    """Wrapper around a parsed FASTA protein database."""

    def __init__(self, fasta_str: str, decoy_tag: str = 'decoy_', generate_decoys: bool = False):
        """Parse a FASTA string.

        Args:
            fasta_str (str): the raw FASTA file content
            decoy_tag (str, optional): prefix used to mark decoy proteins. Defaults to 'decoy_'.
            generate_decoys (bool, optional): whether decoys should be generated. Defaults to False.
        """
        self.__fasta_ptr = psc.PyFasta.parse(fasta_str, decoy_tag, generate_decoys)

    @classmethod
    def from_py_fasta(cls, fasta: psc.PyFasta):
        """Wrap an existing PyFasta without re-parsing."""
        instance = cls.__new__(cls)
        instance.__fasta_ptr = fasta
        return instance

    def _digest(self, enzyme_parameters: EnzymeParameters):
        """Digest every protein with the given enzyme parameters, wrapping each result."""
        digests = self.__fasta_ptr.digest(enzyme_parameters.get_py_ptr())
        return [Digest.from_py_digest(d) for d in digests]
def sage_fdr(feature_collection: List[Feature], indexed_db: IndexedDatabase, use_hyper_score: bool = True):
    """ Perform SAGE FDR on all levels (spectrum, peptide, protein), calculates q-values and PEPs for a given feature collection.
    Args:
        feature_collection: a list of features
        indexed_db: an indexed database
        use_hyper_score: whether to use hyper score or discriminant score for q-value calculation
    """
    psc.py_sage_fdr(
        [feature.get_py_ptr() for feature in feature_collection],
        indexed_db.get_py_ptr(),
        use_hyper_score
    )

def sage_fdr_psm(psm_collection: Union[List[Psm], Dict[str, List[Psm]]], indexed_db: IndexedDatabase, use_hyper_score: bool = True):
    """ Perform SAGE FDR on all levels (spectrum, peptide, protein), calculates q-values and PEPs for a given PSM collection.
    Args:
        psm_collection: a list of PSMs, or a dict mapping keys (e.g. file names) to lists of PSMs
        indexed_db: an indexed database
        use_hyper_score: whether to use hyper score or discriminant score for q-value calculation
    """
    # flatten a dict of PSM lists into a single list
    if isinstance(psm_collection, dict):
        psms = [psm for values in psm_collection.values() for psm in values]
    else:
        psms = psm_collection

    psc.py_sage_fdr_psm(
        [psm.get_py_ptr() for psm in psms],
        indexed_db.get_py_ptr(),
        use_hyper_score
    )
class IonSeries:
    """Series of fragment ions (e.g. the b- or y-series) for one peptide.

    Args:
        peptide (Peptide): The peptide
        ion_type (IonType): The ion type, e.g. b, y
    """
    def __init__(self, peptide: Peptide, ion_type: IonType):
        self.__ion_series_ptr = psc.PyIonSeries(peptide.get_py_ptr(), ion_type.get_py_ptr())

    @classmethod
    def from_py_ion_series(cls, ion_series: psc.PyIonSeries):
        """Wrap an existing PyIonSeries without rebuilding it."""
        instance = cls.__new__(cls)
        instance.__ion_series_ptr = ion_series
        return instance

    @property
    def ion_type(self):
        return IonType.from_py_kind(self.__ion_series_ptr.kind)

    @property
    def cumulative_mass(self):
        return self.__ion_series_ptr.cumulative_mass

    @property
    def peptide(self):
        return Peptide.from_py_peptide(self.__ion_series_ptr.peptide)

    def __repr__(self):
        return f"IonSeries({self.ion_type}, {self.cumulative_mass}, {self.peptide})"

    def get_py_ptr(self):
        """Return the underlying PyIonSeries pointer object."""
        return self.__ion_series_ptr

    def get_ion_series(self):
        """Return the series as a list of wrapped Ion objects."""
        series = self.__ion_series_ptr.get_ion_series()
        return [Ion.from_py_ion(ion) for ion in series]
class Tolerance:
    """Mass tolerance window, specified either in Dalton or in ppm (exactly one)."""

    def __init__(self, da: (float, float) = None, ppm: (float, float) = None):
        """Create a tolerance window.

        Args:
            da ((float, float), optional): tolerance bounds in Da. Defaults to None.
            ppm ((float, float), optional): tolerance bounds in ppm. Defaults to None.

        Raises:
            ValueError: if both or neither of da and ppm are given.
        """
        if da is not None and ppm is not None:
            raise ValueError("Only one of da or ppm can be set")
        if da is None and ppm is None:
            raise ValueError("One of da or ppm must be set")
        self.__tolerance_ptr = psc.PyTolerance(da, ppm)

    def get_py_ptr(self):
        """Return the underlying PyTolerance pointer object."""
        return self.__tolerance_ptr

    @classmethod
    def from_py_tolerance(cls, tolerance: psc.PyTolerance) -> 'Tolerance':
        """Wrap an existing PyTolerance without re-validating."""
        instance = cls.__new__(cls)
        instance.__tolerance_ptr = tolerance
        return instance

    @property
    def da(self) -> (float, float):
        return self.__tolerance_ptr.da

    @property
    def ppm(self) -> (float, float):
        return self.__tolerance_ptr.ppm

    def bounds(self, center: float) -> (float, float):
        """Absolute bounds of the window around `center`."""
        return self.__tolerance_ptr.bounds(center)

    def contains(self, center: float, target: float) -> bool:
        """Whether `target` falls inside the window centered at `center`."""
        return self.__tolerance_ptr.contains(center, target)

    def __repr__(self) -> str:
        return f"Tolerance(da={self.da})" if self.da is not None else f"Tolerance(ppm={self.ppm})"

    def __mul__(self, other) -> 'Tolerance':
        # scaling is delegated to the Rust-side PyTolerance.__mul__
        if not isinstance(other, (float, int)):
            raise ValueError("Tolerance can only be multiplied by a float or an int")
        return Tolerance.from_py_tolerance(self.__tolerance_ptr * float(other))
def predict_sage_im(
        psm_collection: Union[List[Psm], Dict[str, List[Psm]]],
        indexed_db: IndexedDatabase) -> None:
    """ Predict ion mobility using SAGE IM model.

    Returns None; the Rust side receives the PSM pointers, so predictions are
    presumably written back onto the PSMs in place — confirm against py_predict_im.

    Args:
        psm_collection: a list of PSMs, or a dict mapping keys (e.g. file names)
            to lists of PSMs
        indexed_db: an indexed database
    """
    # flatten a dict of PSM lists into a single list
    if isinstance(psm_collection, dict):
        psms = [psm for values in psm_collection.values() for psm in values]
    else:
        psms = psm_collection

    psc.py_predict_im(
        [psm.get_py_ptr() for psm in psms],
        indexed_db.get_py_ptr(),
    )
@njit
def calculate_pep_single(
    scores: NDArray,
    decoys: NDArray,
    bins: int = 1000,
    bw_adjust: float = 1.0,
    monotonic: bool = True
) -> Tuple[NDArray, float, float]:
    """Calculate the PEP using KDE and binning with linear interpolation.

    Args:
        scores: numpy array of scores
        decoys: numpy array of boolean values indicating decoys
        bins: number of bins for the PEP calculation
        bw_adjust: bandwidth adjustment factor
        monotonic: whether to enforce monotonicity

    Returns:
        Tuple[NDArray, float, float]: PEP values, minimum score, score step
    """
    # split scores into decoy and target populations
    d = scores[decoys]
    t = scores[~decoys]

    # pi: fraction of decoys among all scores (decoy prior)
    pi = len(d) / len(scores)
    sigma_d = std(d)
    sigma_t = std(t)

    # Silverman's rule-of-thumb bandwidth, sigma * (4 / (3 n))^(1/5), per population
    bandwidth_d = bw_adjust * sigma_d * (4. / 3. / len(d)) ** 0.2
    bandwidth_t = bw_adjust * sigma_t * (4. / 3. / len(t)) ** 0.2

    # bins - 1 steps span [min_score, max_score] so the last bin sits exactly at max_score
    min_score = np.min(scores)
    max_score = np.max(scores)
    score_step = (max_score - min_score) / (bins - 1)

    pep_bins = np.zeros(bins)

    # PEP(s) = pi * f_decoy(s) / (pi * f_decoy(s) + (1 - pi) * f_target(s))
    for bin_idx in range(bins):
        score = min_score + bin_idx * score_step
        decoy_pdf = kde_pdf(d, bandwidth_d, score) * pi
        target_pdf = kde_pdf(t, bandwidth_t, score) * (1.0 - pi)
        pep_bins[bin_idx] = decoy_pdf / (decoy_pdf + target_pdf)

    # enforce that PEP never increases with score by sweeping from the top bin down
    if monotonic:
        for i in range(len(pep_bins) - 2, -1, -1):
            pep_bins[i] = max(pep_bins[i], pep_bins[i + 1])

    return pep_bins, min_score, score_step
@njit
def posterior_error(pep_bins: NDArray,
                    min_score: float,
                    score_step: float,
                    score: float) -> float:
    """Interpolate the PEP for a single score from precomputed bins.

    Args:
        pep_bins: numpy array of binned PEP values
        min_score: score corresponding to the first bin
        score_step: score width of one bin
        score: score for which to calculate the PEP

    Returns:
        float: linearly interpolated PEP value
    """
    bin_lo = int((score - min_score) / score_step)
    # Clamp the index: a score below min_score previously produced a negative
    # index (silent wrap-around to the top of the array), and a score above
    # max_score could index past the end.
    if bin_lo < 0:
        bin_lo = 0
    elif bin_lo > len(pep_bins) - 1:
        bin_lo = len(pep_bins) - 1
    bin_hi = min(bin_lo + 1, len(pep_bins) - 1)

    lower = pep_bins[bin_lo]
    upper = pep_bins[bin_hi]

    # linear interpolation between the two neighboring bins
    bin_lo_score = bin_lo * score_step + min_score
    linear = (score - bin_lo_score) / score_step

    delta = upper - lower
    return lower + (delta * linear)
if __name__ == "__main__":
    # smoke test / demo: 50000 random scores between 0 and 50
    scores = np.random.uniform(0, 50, 50000)
    decoys = np.random.choice([True, False], 50000)

    # sort scores ascending
    scores = np.sort(scores)

    # sort decoys so that True (decoy) entries come first
    decoys = np.sort(decoys)[::-1]

    pep_bins, min_score, score_step = calculate_pep_single(scores, decoys)
    pep = posterior_error(pep_bins, min_score, score_step, scores[0])

    peps = calculate_pep(scores, decoys)

    from matplotlib import pyplot as plt
    plt.plot(scores, peps)
    plt.show()
def global_alignment_psm(psms: Union[Dict[str, List[Psm]], List[Psm]]) -> List[Alignment]:
    """ Perform global alignment on PSMs.
    Args:
        psms: a list of PSMs, or a dict mapping keys (e.g. file names) to lists of PSMs
    Returns:
        List[Alignment]: List of Alignment objects
    """
    # flatten a dict of PSM lists into a single list
    if isinstance(psms, dict):
        flat = []
        for psm_list in psms.values():
            flat.extend(psm_list)
        psms = flat

    # one alignment per distinct file id
    n_files = len({p.sage_feature.file_id for p in psms})

    py_alignments = psc.py_global_alignment_psm([p.get_py_ptr() for p in psms], n_files)
    return [Alignment.from_py_ptr(a) for a in py_alignments]
def process_variable_start_end_mods(variable_modifications):
    """Helper function to process variable modifications for start and end of peptides/proteins
    For some reason, the variable modification wildcards are not processed correctly when passed to SAGE
    This function expands each terminal wildcard into explicit per-amino-acid entries
    Args:
        variable_modifications: The variable modifications

    Returns:
        Dict: The processed variable modifications
    """
    # peptide N-term (^), peptide C-term ($), protein N-term ([), protein C-term (])
    terminal_wildcards = ["^", "$", "[", "]"]
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"

    # expand e.g. {"^": mods} into {"^A": mods, "^C": mods, ...} for every amino acid
    expanded = {
        wildcard + aa: mods
        for wildcard, mods in variable_modifications.items()
        if wildcard in terminal_wildcards
        for aa in amino_acids
    }

    # keep the original entries and add the expanded ones
    return {**variable_modifications, **expanded}
phospho_serine_static(): 50 | return "S", 79.9663 51 | 52 | @staticmethod 53 | def phospho_threonine_static(): 54 | return "T", 79.9663 55 | 56 | @staticmethod 57 | def phospho_tyrosine_static(): 58 | return "Y", 79.9663 59 | 60 | @staticmethod 61 | def phospho_serine_variable(): 62 | return "S", [79.9663] 63 | 64 | @staticmethod 65 | def phospho_threonine_variable(): 66 | return "T", [79.9663] 67 | 68 | @staticmethod 69 | def phospho_tyrosine_variable(): 70 | return "Y", [79.9663] 71 | 72 | @staticmethod 73 | def methionine_variable(): 74 | return "M", [15.9949] 75 | 76 | @staticmethod 77 | def q_variable(): 78 | return "^Q", [-17.026549] 79 | 80 | @staticmethod 81 | def glutamic_acid_n_terminal_variable(): 82 | return "^E", [-18.010565] 83 | 84 | @staticmethod 85 | def peptide_c_terminal_variable(): 86 | return "$", [49.2, 22.9] 87 | 88 | @staticmethod 89 | def protein_n_terminus_variable(): 90 | return "[", [42.0] 91 | 92 | @staticmethod 93 | def protein_c_terminal_variable(): 94 | return "]", [111.0] 95 | 96 | def __repr__(self): 97 | return (f"SAGE_KNOWN_MODS({self.n_terminal_static()}, " 98 | f"{self.lysine_static()}, " 99 | f"{self.cysteine_static()}, " 100 | f"{self.methionine_variable()}, " 101 | f"{self.q_variable()}, " 102 | f"{self.glutamic_acid_n_terminal_variable()}, " 103 | f"{self.peptide_c_terminal_variable()}, " 104 | f"{self.protein_n_terminus_variable()}, " 105 | f"{self.protein_c_terminal_variable()})") 106 | 107 | 108 | # TODO: need to re-implement based on constant modification list 109 | class ModificationSpecificity: 110 | def __init__(self, s: str): 111 | self.__modification_specificity_ptr = psc.PyModificationSpecificity(s) 112 | 113 | @classmethod 114 | def from_py_modification_specificity(cls, specificity: psc.PyModificationSpecificity): 115 | instance = cls.__new__(cls) 116 | instance.__modification_specificity_ptr = specificity 117 | return instance 118 | 119 | def __repr__(self): 120 | return 
def mass_to_mod(mass: float) -> str:
    """ Map a modification mass in Da to its UNIMOD annotation string.

    Args:
        mass: a mass in Da

    Returns:
        a UNIMOD modification annotation, e.g. '[UNIMOD:4]'

    Raises:
        KeyError: if the rounded mass has no known UNIMOD mapping
    """
    rounded_mass = int(np.round(mass))
    # TODO: find a better way to do the map-back
    known_mods = {
        42: '[UNIMOD:1]',
        57: '[UNIMOD:4]',
        80: '[UNIMOD:21]',
        16: '[UNIMOD:35]',
        119: '[UNIMOD:312]',
    }
    if rounded_mass not in known_mods:
        raise KeyError(f"Rounded mass not in dict: {rounded_mass}")
    return known_mods[rounded_mass]
60 | """ 61 | self.__peptide_ptr = psc.PyPeptide(decoy, sequence, modifications, 62 | mono_isotopic, missed_cleavages, position.get_py_ptr(), 63 | proteins, semi_enzymatic, n_term, c_term) 64 | 65 | @classmethod 66 | def from_digest(cls, digest: Digest) -> 'Peptide': 67 | instance = cls.__new__(cls) 68 | instance.__peptide_ptr = psc.PyPeptide.try_new_from_digest(digest.get_py_ptr()) 69 | return instance 70 | 71 | @classmethod 72 | def from_py_peptide(cls, peptide: psc.PyPeptide): 73 | instance = cls.__new__(cls) 74 | instance.__peptide_ptr = peptide 75 | return instance 76 | 77 | @property 78 | def decoy(self): 79 | return self.__peptide_ptr.decoy 80 | 81 | @property 82 | def sequence(self): 83 | return self.__peptide_ptr.sequence 84 | 85 | @property 86 | def modifications(self): 87 | return self.__peptide_ptr.modifications 88 | 89 | @property 90 | def mono_isotopic(self): 91 | return self.__peptide_ptr.monoisotopic 92 | 93 | @property 94 | def missed_cleavages(self): 95 | return self.__peptide_ptr.missed_cleavages 96 | 97 | @property 98 | def position(self): 99 | return Position.from_py_position(self.__peptide_ptr.position) 100 | 101 | @property 102 | def proteins(self): 103 | return self.__peptide_ptr.proteins 104 | 105 | @property 106 | def n_term(self): 107 | return self.__peptide_ptr.n_term 108 | 109 | @property 110 | def c_term(self): 111 | return self.__peptide_ptr.c_term 112 | 113 | @property 114 | def semi_enzymatic(self): 115 | return self.__peptide_ptr.semi_enzymatic 116 | 117 | def reverse(self, keep_ends: Union[bool, None]) -> 'Peptide': 118 | """Reverse the peptide sequence. 119 | 120 | Args: 121 | keep_ends (Union[bool, None]): Whether to keep the N- and C-terminal amino acids in place. 122 | 123 | Returns: 124 | Peptide: The reversed peptide. 125 | """ 126 | return Peptide.from_py_peptide(self.__peptide_ptr.reverse(keep_ends)) 127 | 128 | def shuffle(self, keep_ends: Union[bool, None]) -> 'Peptide': 129 | """Shuffle the peptide sequence. 
130 | 131 | Args: 132 | keep_ends (Union[bool, None]): Whether to keep the N- and C-terminal amino acids in place. 133 | 134 | Returns: 135 | Peptide: The shuffled peptide. 136 | """ 137 | return Peptide.from_py_peptide(self.__peptide_ptr.shuffle(keep_ends)) 138 | 139 | def get_py_ptr(self): 140 | return self.__peptide_ptr 141 | 142 | def __repr__(self): 143 | return f"Peptide(decoy: {self.decoy}, sequence: {self.sequence}, " \ 144 | f"modifications: {self.modifications}, mono_isotopic: {self.mono_isotopic}, " \ 145 | f"missed_cleavages: {self.missed_cleavages}, position: {self.position}, " \ 146 | f"proteins: {self.proteins}, semi_enzymatic: {self.semi_enzymatic}, n_term: {self.n_term}, " \ 147 | f"c_term: {self.c_term})" 148 | 149 | def to_unimod_sequence(self) -> str: 150 | """ Get Peptide sequence with UNIMOD modification annotations. 151 | 152 | Returns: 153 | str: Peptide sequence with UNIMOD modification annotations. 154 | """ 155 | 156 | mods = self.modifications 157 | sequence = self.sequence 158 | 159 | seq = '' 160 | 161 | for i, (s, m) in enumerate(zip(sequence, mods)): 162 | if m != 0: 163 | # TODO: check if this is the correct way to handle N- and C-terminal mods 164 | if i == 0: 165 | if mass_to_mod(m) == '[UNIMOD:1]': 166 | seq += f'{mass_to_mod(m)}{s}' 167 | else: 168 | seq += f'{s}{mass_to_mod(m)}' 169 | else: 170 | seq += f'{s}{mass_to_mod(m)}' 171 | else: 172 | seq += s 173 | 174 | return seq 175 | -------------------------------------------------------------------------------- /sagepy/sagepy/core/tmt.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List 2 | 3 | import sagepy_connector 4 | 5 | from sagepy.core import ProcessedSpectrum 6 | from sagepy.core.scoring import Feature 7 | from sagepy.core.spectrum import Peak 8 | 9 | psc = sagepy_connector.py_tmt 10 | 11 | 12 | class Isobaric: 13 | def __init__(self, type_name: str): 14 | types = ["tmt6", "tmt10", "tmt11", "tmt16", 
class Purity:
    """Wrapper around a sage ``PyPurity`` precursor-purity result."""

    def __init__(self, ratio: float, correct_precursors: int, incorrect_precursors: int):
        self.__purity_ptr = psc.PyPurity(ratio, correct_precursors, incorrect_precursors)

    @classmethod
    def from_py_purity(cls, purity: psc.PyPurity):
        """Wrap an existing PyPurity pointer without copying."""
        obj = cls.__new__(cls)
        obj.__purity_ptr = purity
        return obj

    @property
    def ratio(self):
        # fraction of precursor intensity attributed to the identified peptide
        return self.__purity_ptr.ratio

    @property
    def correct_precursors(self):
        return self.__purity_ptr.correct_precursors

    @property
    def incorrect_precursors(self):
        return self.__purity_ptr.incorrect_precursors

    def __repr__(self):
        return (f"Purity(ratio={self.ratio}, correct_precursors={self.correct_precursors}, "
                f"incorrect_precursors={self.incorrect_precursors})")

    def get_py_ptr(self):
        """Return the underlying PyPurity pointer."""
        return self.__purity_ptr


class Quant:
    """Wrapper around a sage ``PyQuant`` isobaric quantification result."""

    def __init__(self, hit: Feature, hit_purity: Purity, spectrum: ProcessedSpectrum,
                 chimera: Optional[Feature] = None, chimera_purity: Optional[Purity] = None,
                 intensities: Optional[List[Peak]] = None):
        # unwrap optional wrapper objects down to their raw pointers
        chimera = None if chimera is None else chimera.get_py_ptr()
        chimera_purity = None if chimera_purity is None else chimera_purity.get_py_ptr()
        intensities = None if intensities is None else [peak.get_py_ptr() for peak in intensities]

        self.__quant_ptr = psc.PyQuant(hit.get_py_ptr(), hit_purity.get_py_ptr(),
                                       spectrum.get_py_ptr(), chimera, chimera_purity, intensities)

    @classmethod
    def from_py_quant(cls, quant: psc.PyQuant):
        """Wrap an existing PyQuant pointer without copying."""
        obj = cls.__new__(cls)
        obj.__quant_ptr = quant
        return obj

    @property
    def hit(self):
        return Feature.from_py_feature(self.__quant_ptr.hit())

    @property
    def hit_purity(self):
        return Purity.from_py_purity(self.__quant_ptr.hit_purity())

    @property
    def spectrum(self):
        return ProcessedSpectrum.from_py_processed_spectrum(self.__quant_ptr.spectrum())

    @property
    def chimera(self):
        # None when no chimeric match was recorded
        ptr = self.__quant_ptr.chimera()
        return None if ptr is None else Feature.from_py_feature(ptr)

    @property
    def chimera_purity(self):
        ptr = self.__quant_ptr.chimera_purity()
        return None if ptr is None else Purity.from_py_purity(ptr)

    @property
    def intensities(self):
        raw = self.__quant_ptr.intensities()
        return None if raw is None else [Peak.from_py_peak(peak) for peak in raw]

    def __repr__(self):
        return (f"Quant(hit={self.hit}, hit_purity={self.hit_purity}, spectrum={self.spectrum}, "
                f"chimera={self.chimera}, chimera_purity={self.chimera_purity}, intensities={self.intensities})")

    def get_py_ptr(self):
        """Return the underlying PyQuant pointer."""
        return self.__quant_ptr
def modification_title_to_unimod_id() -> Dict[str, str]:
    """ Get a dict that maps modification names to Unimod IDs.

    Returns:
        A dict that maps modification names to Unimod IDs.
    """
    return unimod.title_to_unimod_ids()


def modification_atomic_composition() -> Dict[str, Dict[str, int]]:
    """ Get a dict that maps modification names to atomic compositions.

    Returns:
        A dict that maps modification names to atomic compositions.
    """
    return unimod.modification_atomic_compositions()


def unimod_to_mass() -> Dict[str, float]:
    """ Get a dict that maps Unimod IDs (string annotations) to mass values.

    Returns:
        A dict that maps Unimod IDs to mass values.
    """
    return unimod.unimod_modification_to_mass()


def unimod_to_mass_numerical() -> Dict[int, float]:
    """ Get a dict that maps Unimod IDs given as integer to mass values.

    Returns:
        A dict that maps Unimod IDs to mass values.
    """
    return unimod.unimod_modification_to_mass_numerical()


def unimod_static_mods_to_sage_static_mods(
        unimod_static_mods: Union[Dict[str, str], Dict[str, int]]
) -> Dict["ModificationSpecificity", float]:
    """ Translate a dict that maps modification specificities to Unimod IDs
    to a dict that maps ModificationSpecificity objects to mass values.

    Args:
        unimod_static_mods: A dict that maps modification specificities to Unimod IDs,
            given either as string annotations (e.g. '[UNIMOD:4]') or numeric IDs (e.g. 4).

    Returns:
        A dict that maps ModificationSpecificity objects to mass values.
    """

    if len(unimod_static_mods) == 0:
        return {}

    # values are either numeric Unimod IDs (int) or string annotations
    mods_numeric = isinstance(next(iter(unimod_static_mods.values())), int)
    if mods_numeric:
        mod_to_mass = unimod.unimod_modification_to_mass_numerical()
    else:
        mod_to_mass = unimod.unimod_modification_to_mass()

    sage_raw_dict = {key: mod_to_mass[value] for key, value in unimod_static_mods.items()}

    return validate_mods(sage_raw_dict)


def unimod_variable_mods_to_sage_variable_mods(
        unimod_variable_mods: Union[Dict[str, List[str]], Dict[str, List[int]]]
) -> Dict["ModificationSpecificity", List[float]]:
    """ Translate a dict that maps modification specificities to lists of Unimod IDs
    to a dict that maps ModificationSpecificity objects to lists of mass values.

    Args:
        unimod_variable_mods: A dict that maps modification specificities to lists of
            Unimod IDs, given either as string annotations or numeric IDs.

    Returns:
        A dict that maps ModificationSpecificity objects to lists of mass values.
    """

    if len(unimod_variable_mods) == 0:
        return {}

    # BUG FIX: the numeric check must inspect the *elements* of the value lists,
    # not the lists themselves — a list is never an int, so the numeric mass
    # table was previously unreachable for Dict[str, List[int]] inputs.
    first_values = next(iter(unimod_variable_mods.values()))
    mods_numeric = len(first_values) > 0 and isinstance(first_values[0], int)

    if mods_numeric:
        mod_to_mass = unimod.unimod_modification_to_mass_numerical()
    else:
        mod_to_mass = unimod.unimod_modification_to_mass()

    sage_raw_dict: Dict[str, List[float]] = {}

    for key, values in unimod_variable_mods.items():
        for value in values:
            sage_raw_dict.setdefault(key, []).append(mod_to_mass[value])

    return validate_var_mods(sage_raw_dict)


def static_unimod_mods_to_set(
        unimod_mods: Union[Dict[str, str], Dict[str, int]]
) -> set:
    """ Translate a dict that maps modification specificities to Unimod IDs to a set of modification annotations.

    Args:
        unimod_mods: A dict that maps modification specificities to Unimod IDs.

    Returns:
        A set of UNIMOD annotation strings.
    """

    if len(unimod_mods) == 0:
        return set()

    if isinstance(next(iter(unimod_mods.values())), int):
        # numeric IDs are rendered as UNIMOD annotation strings
        return {f"[UNIMOD:{value}]" for value in unimod_mods.values()}
    else:
        return set(unimod_mods.values())


def variable_unimod_mods_to_set(
        unimod_mods: Union[Dict[str, List[str]], Dict[str, List[int]]]
) -> set:
    """ Translate a dict that maps modification specificities to lists of Unimod IDs
    to a set of modification annotations.

    Args:
        unimod_mods: A dict that maps modification specificities to lists of Unimod IDs.

    Returns:
        A set of UNIMOD annotation strings.
    """

    # guard against empty input (previously raised StopIteration)
    if len(unimod_mods) == 0:
        return set()

    # BUG FIX: inspect the elements of the value lists, not the lists themselves;
    # the numeric branch was previously unreachable so integer IDs leaked
    # through as raw ints instead of UNIMOD annotation strings.
    first_values = next(iter(unimod_mods.values()))
    if len(first_values) > 0 and isinstance(first_values[0], int):
        return {f"[UNIMOD:{value}]" for values in unimod_mods.values() for value in values}
    else:
        return {value for values in unimod_mods.values() for value in values}
def target_decoy_competition_pandas(
        df: pd.DataFrame,
        method: str = "peptide_psm_peptide",
        score: Optional[str] = None,
) -> pd.DataFrame:
    """ Perform target-decoy competition on a pandas DataFrame.

    Args:
        df: a pandas DataFrame holding the PSMs
        method: the method to use, allowed values are: psm, peptide_psm_only, peptide_peptide_only, peptide_psm_peptide
        score: name of the score column to compete on (defaults to 'hyperscore')

    Returns:
        a pandas DataFrame with q-values, sorted by q-value ascending
    """

    # all mandatory columns must be present
    for column in ('spec_idx', 'match_idx', 'match_identity_candidates', 'decoy'):
        assert column in df.columns, f"{column} column not found"

    # fall back to hyperscore when no score column was given
    score_col = score if score else 'hyperscore'
    assert score_col in df.columns, f"{score_col} column not found"

    # run the competition on plain python lists
    spec_idx, match_idx, match_identity_candidates, decoy_flags, scores, q_values = target_decoy_competition(
        df['spec_idx'].tolist(),
        df['match_idx'].tolist(),
        df['match_identity_candidates'].tolist(),
        df['decoy'].tolist(),
        df[score_col].tolist(),
        method,
    )

    # assemble the TDC result table, best (lowest) q-values first
    result = pd.DataFrame({
        'spec_idx': spec_idx,
        'match_idx': match_idx,
        'match_identity_candidates': match_identity_candidates,
        'decoy': decoy_flags,
        f'{score_col}': scores,
        'q_value': q_values
    })

    return result.sort_values(by=['q_value'], ascending=True)
def assign_sage_peptide_q(psm_list: List[Psm], use_hyper_score: bool = True):
    """ Assign SAGE peptide q-values to PSMs.
    Args:
        psm_list: a list of PeptideSpectrumMatch objects
        use_hyper_score: whether to use hyper score or discriminant score for q-value calculation
    """
    # collect the raw pointers and run SAGE peptide-level FDR in place
    py_ptrs = [match.get_py_ptr() for match in psm_list]
    psc.assign_peptide_q(py_ptrs, use_hyper_score)

def assign_sage_protein_q(psm_list: List[Psm], use_hyper_score: bool = True):
    """ Assign SAGE protein q-values to PSMs.
    Args:
        psm_list: a list of PeptideSpectrumMatch objects
        use_hyper_score: whether to use hyper score or discriminant score for q-value calculation
    """
    # collect the raw pointers and run SAGE protein-level FDR in place
    py_ptrs = [match.get_py_ptr() for match in psm_list]
    psc.assign_protein_q(py_ptrs, use_hyper_score)
def rescore_lda(
        psm_collection: Union[List[Psm], Dict[str, List[Psm]]],
        num_splits: int = 5,
        verbose: bool = True,
        balance: bool = True,
        replace_nan: bool = True,
        score: str = "hyperscore",
        num_threads: int = 16,
) -> List[Psm]:
    """ Re-score PSMs using Linear Discriminant Analysis (LDA).
    Args:
        psm_collection: A collection of PSMs
        num_splits: Number of splits (folds) to use for cross-validation
        verbose: Whether to print progress
        balance: Whether to balance the dataset (equal number of target and decoy examples)
        replace_nan: Whether to replace NaN values with 0
        score: Score to use for rescoring
        num_threads: Number of threads to use for feature extraction

    Returns:
        List[Psm]: the re-scored PSMs (in cross-validation split order)
    """

    # flatten a dict of spectrum-id -> candidates into a single list
    psm_list = []
    if isinstance(psm_collection, dict):
        for spec_id, psm_candidates in psm_collection.items():
            psm_list.extend(psm_candidates)
    else:
        psm_list = psm_collection

    # fit the scaler on the feature matrix of ALL PSMs
    X_all, _ = get_features(psm_collection_to_pandas(psm_list, num_threads=num_threads), score=score, replace_nan=replace_nan)
    scaler = StandardScaler()
    scaler.fit(X_all)

    splits = split_psm_list(psm_list=psm_list, num_splits=num_splits)

    predictions = []
    # BUG FIX: predictions are produced fold by fold in split order, so the
    # PSMs must be collected in that same order. Zipping against the original
    # psm_list assigned re-scores to the wrong PSMs (splits shuffle the order);
    # this mirrors the final_psms pattern used in rescore_psms.
    rescored_psms = []

    for i in tqdm(range(num_splits), disable=not verbose, desc='Re-scoring PSMs', ncols=100):

        target = splits[i]
        rescored_psms.extend(target)
        features = []

        for j in range(num_splits):
            if j != i:
                features.extend(splits[j])

        # generate training data
        X_train, Y_train = generate_training_data(features, balance=balance, replace_nan=replace_nan, num_threads=num_threads)

        # get features for the fold that we want to re-score; use the same
        # score column as the training features (previously silently fell
        # back to the default hyperscore column)
        X, _ = get_features(psm_collection_to_pandas(target, num_threads=num_threads), score=score, replace_nan=replace_nan)

        # experimenting with different settings for LDA showed that shrinkage should be used, which tries to
        # keep model weights small and helps to prevent overfitting
        lda = LinearDiscriminantAnalysis(solver="eigen", shrinkage="auto")
        lda.fit(scaler.transform(X_train), Y_train)

        try:
            # check for flip sign of LDA classification return to be compatible with good score ascending
            score_flip = 1.0 if Y_train[np.argmax(np.squeeze(lda.transform(scaler.transform(X_train))))] == 1.0 else -1.0
        except Exception:
            # fall back to no flip when the transform/argmax fails (e.g. empty fold)
            score_flip = 1.0

        Y_pred = np.squeeze(lda.transform(scaler.transform(X))) * score_flip
        predictions.extend(Y_pred)

    # assign the re-scored values to the PSMs in matching (split) order;
    # avoid shadowing the `score` parameter with the loop variable
    for re_score_value, match in zip(predictions, rescored_psms):
        match.re_score = re_score_value

    return rescored_psms
24 | Args: 25 | psm_collection: A collection of PSMs 26 | model: A model to use for re-scoring, needs to comply to the sklearn API 27 | use_min_max_scaler: Whether to use MinMaxScaler instead of StandardScaler 28 | num_splits: Number of splits (folds) to use for cross-validation 29 | verbose: Whether to print progress 30 | balance: Whether to balance the dataset (equal number of target and decoy examples) 31 | replace_nan: Whether to replace NaN values with 0 32 | score: Score to use for re-scoring 33 | num_threads: Number of threads to use for feature extraction 34 | 35 | Returns: 36 | List[PeptideSpectrumMatch]: List of PeptideSpectrumMatch objects 37 | """ 38 | 39 | psm_list = [] 40 | 41 | if isinstance(psm_collection, dict): 42 | for spec_id, psm_candidates in psm_collection.items(): 43 | psm_list.extend(psm_candidates) 44 | else: 45 | psm_list = psm_collection 46 | 47 | # get features for all PSMs, which will be a matrix of shape (n_samples, n_features) 48 | X_all, _ = get_features(psm_collection_to_pandas(psm_list, num_threads=num_threads), score=score, replace_nan=replace_nan) 49 | 50 | # use a scaler to scale the features 51 | if use_min_max_scaler: 52 | scaler = MinMaxScaler() 53 | else: 54 | scaler = StandardScaler() 55 | scaler.fit(X_all) 56 | 57 | # split the PSMs into num_splits folds to perform cross-validation 58 | splits = split_psm_list(psm_list=psm_list, num_splits=num_splits, **kwargs) 59 | 60 | predictions = [] 61 | final_psms = [] 62 | for i in tqdm(range(num_splits), disable=not verbose, desc='Re-scoring PSMs', ncols=100): 63 | 64 | target = splits[i] 65 | final_psms.extend(target) 66 | features = [] 67 | 68 | for j in range(num_splits): 69 | if j != i: 70 | features.extend(splits[j]) 71 | 72 | # generate training data 73 | X_train, Y_train = generate_training_data(features, balance=balance, replace_nan=replace_nan, num_threads=num_threads, **kwargs) 74 | 75 | # get features for target that we want to re-score 76 | X, _ = 
def tokenize_peptide(sequence: str) -> List[str]:
    """
    Tokenize a peptide sequence into amino acid tokens.
    Args:
        sequence: A peptide sequence string

    Returns:
        A list of amino acid tokens, including modifications
    """
    # one capital letter, optionally followed by a UNIMOD annotation
    tokens = re.findall(r'[A-Z](?:\[UNIMOD:\d+\])?', sequence)
    return tokens


def sequence_to_vector(sequence: str, token_alphabet: List[str]) -> np.ndarray:
    """
    Convert a peptide sequence into a vector representation based on a token alphabet
    Args:
        sequence: A peptide sequence string
        token_alphabet: A list of amino acid tokens

    Returns:
        A count-vector representation of the peptide sequence
        (tokens not in the alphabet are ignored)
    """
    # build a token -> index lookup once instead of a linear list.index scan
    # per token; keep the FIRST occurrence to match list.index semantics
    token_index: dict = {}
    for idx, token in enumerate(token_alphabet):
        token_index.setdefault(token, idx)

    vector = np.zeros(len(token_alphabet))
    for token in tokenize_peptide(sequence):
        idx = token_index.get(token)
        if idx is not None:
            vector[idx] += 1
    return vector


def create_token_alphabet(sequences):
    """
    Create a token alphabet from a set of peptide sequences.
    Args:
        sequences: A list of peptide sequences

    Returns:
        A sorted list of unique amino acid tokens
    """
    unique_tokens = set()
    for seq in sequences:
        unique_tokens.update(tokenize_peptide(seq))
    token_alphabet = sorted(unique_tokens)  # Sort to maintain consistent ordering
    return token_alphabet


def prepare_data(sequences, retention_times, token_alphabet):
    """
    Prepare the dataset for training a retention time predictor
    Args:
        sequences: A list of peptide sequences
        retention_times: A list of retention times
        token_alphabet: A list of amino acid tokens

    Returns:
        X: A matrix of input features (token counts per sequence)
        y: A vector of target values (retention times)
    """
    # Convert sequences to vectors based on the provided token_alphabet
    X = np.array([sequence_to_vector(seq, token_alphabet) for seq in sequences])
    y = np.array(retention_times)

    return X, y
def train_lasso_regression_model(X, y, alpha=1.0, verbose=False):
    """
    Train a Lasso regression model with L1 regularization.
    Args:
        X: Feature matrix (token counts per peptide sequence)
        y: Retention times
        alpha: Regularization strength
        verbose: Whether to print the training MSE

    Returns:
        The fitted Lasso model
    """
    model = Lasso(alpha=alpha)
    model.fit(X, y)

    if verbose:
        # report the in-sample mean squared error
        mse = mean_squared_error(y, model.predict(X))
        print(f"Test MSE (Lasso, alpha={alpha}): {mse}")

    return model


def transform_sequences(sequences, token_alphabet):
    """
    Transform a list of peptide sequences into a matrix of token counts based on a token alphabet.
    Args:
        sequences: A list of peptide sequences
        token_alphabet: A list of amino acid tokens

    Returns:
        A matrix of token counts
    """
    vectors = [sequence_to_vector(seq, token_alphabet) for seq in sequences]
    return np.array(vectors)
# NOTE(dump): the fragment "psm.retention_time_predicted = rt_pred" that ended the
# previous chunk belongs to the tail of rescore/rt_predictor.py in this repository dump.
# ----------------------------------------------------------------------------
# sagepy/sagepy/rescore/utility.py
# ----------------------------------------------------------------------------
import random
from collections import defaultdict
from typing import Callable, List, Optional, Tuple

import numpy as np
import pandas as pd
from numpy.typing import NDArray

from sagepy.core.scoring import Psm
from sagepy.utility import psm_collection_to_pandas

# Key makers group PSMs before splitting so that all PSMs sharing a key land in
# the same split (prevents leakage of a peptide/ion across train/test splits).
peptide_key_maker = lambda psm: (psm.sequence,)
ion_key_maker = lambda psm: (psm.sequence, psm.charge)


def assign_random_groups(group_count, number_of_assignments, seed=None):
    """Draw a random group index for each assignment.

    Args:
        group_count: int, number of groups to assign to
        number_of_assignments: int, number of assignments to make
        seed: None | int, random seed for reproducibility (re-seeds the global
            `random` module RNG as a side effect)

    Returns:
        list: one group index in the range [0, group_count - 1] per assignment
    """
    if seed is not None:
        random.seed(seed)
    return [random.randint(0, group_count - 1) for _ in range(number_of_assignments)]


def split_into_chunks(
    psms: list,
    splits_count: int = 3,
    key_maker: Callable = peptide_key_maker,
    seed: None | int = None,
) -> list[list]:
    """
    Split a list of PSMs into multiple chunks based on a key maker function.
    Args:
        psms: list of Psm objects
        splits_count: int, number of splits to create
        key_maker: Callable, a function that takes a Psm object and returns a key for grouping
        seed: None | int, random seed for reproducibility

    Returns:
        list[list]: a list of lists; PSMs that share the same key are never
        separated into different chunks
    """
    grouped = defaultdict(list)
    for psm in psms:
        grouped[key_maker(psm)].append(psm)

    split_assignments = assign_random_groups(splits_count, len(grouped), seed=seed)

    splits = [[] for _ in range(splits_count)]
    # Fixed: the original shadowed the dict `grouped_psms` with its own loop
    # variable, which only worked because the items view was created before the
    # rebinding; use distinct names to avoid the trap.
    for split_assignment, members in zip(split_assignments, grouped.values()):
        splits[split_assignment].extend(members)

    return splits


def dict_to_dense_array(peak_dict, array_length=174, block_size=29):
    """
    Convert a dictionary of peaks to a fixed-length array.

    Layout: the first half of the array holds b ions (ion_type 0), the second
    half y ions (ion_type 1); within each half, each run of `block_size` slots
    holds the ordinals of one charge state (1, 2, ...).

    Args:
        peak_dict: a dictionary of peaks (ion_type, charge, ordinal) -> intensity
        array_length: the length of the fixed-length array
            (default 174 = 2 ion types * 3 charge states * 29 ordinals)
        block_size: number of ordinals per charge state (default 29);
            generalized from the former hard-coded constant

    Returns:
        A fixed-length array of intensities.
    """
    intensities = np.zeros(array_length)
    half_length = array_length // 2  # first half for b ions, second half for y ions

    for (ion_type, charge, ordinal), intensity in peak_dict.items():
        # map (b=0, y=1) ions to the correct index
        index = ion_type * half_length + (charge - 1) * block_size + (ordinal - 1)
        intensities[index] = intensity

    return intensities


def get_features(
    ds: pd.DataFrame,
    score: Optional[str] = None,
    replace_nan: bool = True,
) -> Tuple[NDArray, NDArray]:
    """
    Get features and labels from a dataset.
    Args:
        ds: a pandas DataFrame containing the dataset.
        score: the name of the target score column (defaults to "hyperscore").
        replace_nan: if True, replace NaN values with 0.

    Returns:
        A tuple containing the features (float32 matrix) and labels
        (1.0 = target, 0.0 = decoy).
    """
    score = score if score is not None else "hyperscore"

    # The currently used features for the model fit.
    # Fixed: "cosine_similarity" was listed twice, silently double-weighting it.
    # TODO: extend this list with additional features
    features = [
        f"{score}",
        "delta_rt",
        "delta_ims",
        "cosine_similarity",
        "delta_mass",
        "rank",
        "isotope_error",
        "average_ppm",
        "delta_next",
        "delta_best",
        "matched_peaks",
        "longest_b",
        "longest_y",
        "longest_y_pct",
        "missed_cleavages",
        "matched_intensity_pct",
        "poisson",
        "charge",
        "intensity_ms1",
        "intensity_ms2",
        "collision_energy",
        "spectral_angle_similarity",
        "pearson_correlation",
        "spearman_correlation",
        "spectral_entropy_similarity",
    ]
    ds = ds.copy()

    # Log-transform the heavy-tailed intensity columns
    ds["intensity_ms1"] = ds["intensity_ms1"].apply(np.log1p)
    ds["intensity_ms2"] = ds["intensity_ms2"].apply(np.log1p)

    # avoid None values for cosine similarity
    ds["cosine_similarity"] = ds["cosine_similarity"].apply(lambda x: 0.0 if x is None else x)

    X = ds[features].to_numpy().astype(np.float32)

    if replace_nan:
        X = np.nan_to_num(X)

    # decoy == True -> label 0, target -> label 1
    Y = ds["decoy"].to_numpy()
    Y = np.array([0 if x else 1 for x in Y]).astype(np.float32)

    return X, Y


def generate_training_data(
    psm_list: List[Psm],
    method: str = "peptide_q",
    q_max: float = 0.01,
    balance: bool = True,
    replace_nan: bool = True,
    num_threads: int = 16,
    **kwargs
) -> Tuple[NDArray, NDArray]:
    """ Generate training data.
    Args:
        psm_list: List of PeptideSpectrumMatch objects
        method: Method to use for training data generation; one of
            'spectrum_q', 'peptide_q', 'decoy_quantile'
        q_max: Maximum q-value (or decoy-score quantile) allowed for positive examples
        balance: Whether to balance the dataset
        replace_nan: Whether to replace NaN values with 0
        num_threads: Number of threads to use for feature extraction

    Returns:
        Tuple[NDArray, NDArray]: X_train and Y_train

    Raises:
        ValueError: if `method` is not one of the supported strategies.
    """
    # create pandas table from psms
    PSM_pandas = psm_collection_to_pandas(psm_list, num_threads=num_threads)

    is_target = PSM_pandas.decoy == False  # noqa: E712 — pandas elementwise comparison
    first_rank = PSM_pandas["rank"] == 1

    if method == "spectrum_q":
        TARGET = PSM_pandas[is_target & (PSM_pandas.spectrum_q <= q_max) & first_rank]
    elif method == "peptide_q":
        TARGET = PSM_pandas[is_target & (PSM_pandas.peptide_q <= q_max) & first_rank]
    elif method == "decoy_quantile":
        # positives are targets scoring above the (1 - q_max) quantile of decoy scores
        cutoff = PSM_pandas[PSM_pandas.decoy].hyperscore.quantile(1 - q_max)
        TARGET = PSM_pandas[is_target & first_rank & (PSM_pandas.hyperscore >= cutoff)]
    else:
        # Fixed: the previous message omitted the supported 'decoy_quantile' option.
        raise ValueError(
            f"Unknown method: {method}. Use 'spectrum_q', 'peptide_q' or 'decoy_quantile'."
        )

    X_target, Y_target = get_features(TARGET, replace_nan=replace_nan)

    # select all decoys
    DECOY = PSM_pandas[PSM_pandas.decoy & first_rank]
    X_decoy, Y_decoy = get_features(DECOY, replace_nan=replace_nan)

    # balance the dataset such that the number of target and decoy examples are equal
    if balance:
        # Fixed: sample WITHOUT replacement — the previous default (replace=True)
        # duplicated some targets and silently dropped others; size <= len(X_target)
        # always holds here, so replace=False is valid.
        # NOTE(review): when decoys outnumber targets the decoy side is left
        # untouched (original behavior), so the result is not fully balanced then.
        num_target = min(len(DECOY), len(TARGET))
        target_indices = np.random.choice(np.arange(len(X_target)), size=num_target, replace=False)
        X_target = X_target[target_indices, :]
        Y_target = Y_target[target_indices]

    # combine target and decoy examples
    X_train = np.vstack((X_target, X_decoy))
    Y_train = np.hstack((Y_target, Y_decoy))

    return X_train, Y_train


def get_list_index_by_sequence(psms, num_splits: int = 5, seed: int = 35):
    """Map each unique PSM sequence to a split index in [0, num_splits - 1].

    Args:
        psms: iterable of Psm objects
        num_splits: number of splits
        seed: seed for numpy's global RNG

    Returns:
        dict: sequence -> split index
    """
    # Seed numpy's global RNG for reproducibility.
    # NOTE(review): iteration order of the set below depends on Python's string
    # hash randomization, so runs are only reproducible within one interpreter
    # session (or with PYTHONHASHSEED fixed) — confirm whether that matters here.
    np.random.seed(seed)

    # Extract unique sequences
    unique_sequences = list({psm.sequence for psm in psms})

    # Shuffle and split into near-equal groups
    shuffled = np.random.permutation(unique_sequences)
    split = np.array_split(shuffled, num_splits)

    # Create mapping from sequence -> split index
    index_dict = {seq: i for i, group in enumerate(split) for seq in group}
    return index_dict


def split_psm_list_broken(psm_list: List[Psm], num_splits: int = 5) -> List[List]:
    """Deprecated sequence-based splitter (kept for reference; see split_psm_list)."""
    # Get sequence-to-split mapping
    seq_to_split = get_list_index_by_sequence(psm_list, num_splits)

    # Preallocate split containers
    splits = [[] for _ in range(num_splits)]

    # Assign PSMs to their respective splits
    for psm in psm_list:
        split_idx = seq_to_split[psm.sequence]
        splits[split_idx].append(psm)

    return splits


def split_psm_list(
    psm_list: List[Psm],
    num_splits: int = 5,
    seed: None | int = None,
    key_maker: Callable = peptide_key_maker,
    **kwargs
) -> List[List[Psm]]:
    """
    Split PSMs into multiple splits.

    Args:
        psm_list: List of PeptideSpectrumMatch objects
        num_splits: Number of splits
        seed: Optional seed for reproducibility
        key_maker: Callable function to create keys for grouping PSMs

    Returns:
        List[List[PeptideSpectrumMatch]]: List of splits
    """
    # Delegate to the group-aware splitter so PSMs sharing a key stay together.
    return split_into_chunks(psm_list, num_splits, seed=seed, key_maker=key_maker)


def transform_psm_to_mokapot_pin(psm_df, seq_modified: bool = False):
    """ Transform a PSM DataFrame to a mokapot PIN DataFrame.
    Args:
        psm_df: a DataFrame containing PSMs
        seq_modified: whether the sequences are modified

    Returns:
        A DataFrame containing the PSMs in mokapot PIN format.
    """
    columns_map = {
        # target columns mapping for mokapot
        'spec_idx': 'SpecId',
        'decoy': 'Label',
        'charge': 'Charge',
        'sequence_modified': 'Peptide',
        'proteins': 'Proteins',

        # feature mapping for re-scoring
        'hyperscore': 'Feature1',
        'isotope_error': 'Feature2',
        'delta_mass': 'Feature3',
        'delta_rt': 'Feature4',
        'delta_ims': 'Feature5',
        'matched_peaks': 'Feature6',
        'matched_intensity_pct': 'Feature7',
        'intensity_ms1': 'Feature8',
        'intensity_ms2': 'Feature9',
        'average_ppm': 'Feature10',
        'poisson': 'Feature11',
        'spectral_entropy_similarity': 'Feature12',
        'pearson_correlation': 'Feature13',
        'spearman_correlation': 'Feature14',
        'spectral_angle_similarity': 'Feature15',
        'collision_energy': 'Feature16',
        'delta_next': 'Feature17',
        'delta_best': 'Feature18',
        'longest_b': 'Feature19',
        'longest_y': 'Feature20',
        'longest_y_pct': 'Feature21',
        'cosine_similarity': 'Feature22',
        'rank': 'Feature23',
        'missed_cleavages': 'Feature24',
    }

    # Fall back to the plain sequence column when modifications are absent.
    if not seq_modified:
        columns_map['sequence'] = 'Peptide'
        columns_map.pop('sequence_modified')

    psm_df = psm_df[list(columns_map.keys())]
    df_pin = psm_df.rename(columns=columns_map)

    # Drop all-NaN feature columns first, then any row with a remaining NaN.
    df_pin_clean = df_pin.dropna(axis=1, how='all')
    # .copy() avoids pandas SettingWithCopy warnings on the assignments below.
    df_pin_clean = df_pin_clean.dropna().copy()

    # mokapot/percolator convention: -1 = decoy, 1 = target
    df_pin_clean['Label'] = df_pin_clean['Label'].apply(lambda x: -1 if x else 1)
    df_pin_clean['ScanNr'] = range(1, len(df_pin_clean) + 1)

    return df_pin_clean

# ----------------------------------------------------------------------------
# (dump continues: sagepy/sagepy_logo.png ->
#  https://raw.githubusercontent.com/theGreatHerrLebert/sagepy/4382f7b24dd4e35bfa256e8c8a32ead610782e99/sagepy/sagepy_logo.png)
# ----------------------------------------------------------------------------
-------------------------------------------------------------------------------- /unimod/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "unimod" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [dependencies] 7 | -------------------------------------------------------------------------------- /unimod/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod unimod { 2 | pub mod modification_atomic_composition; 3 | pub mod title_to_unimod_id; 4 | pub mod unimod_quantized; 5 | pub mod unimod_to_mass; 6 | 7 | // Re-exporting functions to the parent module for easier access. NOTE(review): `quanzie_mass` looks like a typo for `quantize_mass`, but it must match the item name defined in unimod_quantized.rs (not visible here) — confirm and rename both sides together, not just this re-export. 8 | pub use modification_atomic_composition::modification_atomic_composition; 9 | pub use title_to_unimod_id::title_to_unimod_id; 10 | pub use unimod_quantized::{quanzie_mass, quantized_mass_to_unimod}; 11 | pub use unimod_to_mass::{unimod_modifications_mass, unimod_modifications_mass_numerical}; 12 | } --------------------------------------------------------------------------------